From 50ac6607845755e594c8a39b9c6a00d1c9b48ea4 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <akarwar@marvell.com>
Date: Tue, 25 Jun 2013 19:03:56 -0700
Subject: cfg80211/nl80211: rename packet pattern related structures and enums

Currently packet patterns and it's enum/structures are used only
for WoWLAN feature. As we intend to reuse them for new feature
packet coalesce, they are renamed in this patch.

Older names are kept for backward compatibility purpose.

Signed-off-by: Amitkumar Karwar <akarwar@marvell.com>
Signed-off-by: Bing Zhao <bzhao@marvell.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/uapi/linux/nl80211.h | 45 ++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 861e5eba395..de0ce809068 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3060,11 +3060,11 @@ enum nl80211_tx_power_setting {
 };
 
 /**
- * enum nl80211_wowlan_packet_pattern_attr - WoWLAN packet pattern attribute
- * @__NL80211_WOWLAN_PKTPAT_INVALID: invalid number for nested attribute
- * @NL80211_WOWLAN_PKTPAT_PATTERN: the pattern, values where the mask has
+ * enum nl80211_packet_pattern_attr - packet pattern attribute
+ * @__NL80211_PKTPAT_INVALID: invalid number for nested attribute
+ * @NL80211_PKTPAT_PATTERN: the pattern, values where the mask has
  *	a zero bit are ignored
- * @NL80211_WOWLAN_PKTPAT_MASK: pattern mask, must be long enough to have
+ * @NL80211_PKTPAT_MASK: pattern mask, must be long enough to have
  *	a bit for each byte in the pattern. The lowest-order bit corresponds
  *	to the first byte of the pattern, but the bytes of the pattern are
  *	in a little-endian-like format, i.e. the 9th byte of the pattern
@@ -3075,23 +3075,23 @@ enum nl80211_tx_power_setting {
  *	Note that the pattern matching is done as though frames were not
  *	802.11 frames but 802.3 frames, i.e. the frame is fully unpacked
  *	first (including SNAP header unpacking) and then matched.
- * @NL80211_WOWLAN_PKTPAT_OFFSET: packet offset, pattern is matched after
+ * @NL80211_PKTPAT_OFFSET: packet offset, pattern is matched after
  *	these fixed number of bytes of received packet
- * @NUM_NL80211_WOWLAN_PKTPAT: number of attributes
- * @MAX_NL80211_WOWLAN_PKTPAT: max attribute number
+ * @NUM_NL80211_PKTPAT: number of attributes
+ * @MAX_NL80211_PKTPAT: max attribute number
  */
-enum nl80211_wowlan_packet_pattern_attr {
-	__NL80211_WOWLAN_PKTPAT_INVALID,
-	NL80211_WOWLAN_PKTPAT_MASK,
-	NL80211_WOWLAN_PKTPAT_PATTERN,
-	NL80211_WOWLAN_PKTPAT_OFFSET,
+enum nl80211_packet_pattern_attr {
+	__NL80211_PKTPAT_INVALID,
+	NL80211_PKTPAT_MASK,
+	NL80211_PKTPAT_PATTERN,
+	NL80211_PKTPAT_OFFSET,
 
-	NUM_NL80211_WOWLAN_PKTPAT,
-	MAX_NL80211_WOWLAN_PKTPAT = NUM_NL80211_WOWLAN_PKTPAT - 1,
+	NUM_NL80211_PKTPAT,
+	MAX_NL80211_PKTPAT = NUM_NL80211_PKTPAT - 1,
 };
 
 /**
- * struct nl80211_wowlan_pattern_support - pattern support information
+ * struct nl80211_pattern_support - packet pattern support information
  * @max_patterns: maximum number of patterns supported
  * @min_pattern_len: minimum length of each pattern
  * @max_pattern_len: maximum length of each pattern
@@ -3101,13 +3101,22 @@ enum nl80211_wowlan_packet_pattern_attr {
  * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the
  * capability information given by the kernel to userspace.
  */
-struct nl80211_wowlan_pattern_support {
+struct nl80211_pattern_support {
 	__u32 max_patterns;
 	__u32 min_pattern_len;
 	__u32 max_pattern_len;
 	__u32 max_pkt_offset;
 } __attribute__((packed));
 
+/* only for backward compatibility */
+#define __NL80211_WOWLAN_PKTPAT_INVALID __NL80211_PKTPAT_INVALID
+#define NL80211_WOWLAN_PKTPAT_MASK NL80211_PKTPAT_MASK
+#define NL80211_WOWLAN_PKTPAT_PATTERN NL80211_PKTPAT_PATTERN
+#define NL80211_WOWLAN_PKTPAT_OFFSET NL80211_PKTPAT_OFFSET
+#define NUM_NL80211_WOWLAN_PKTPAT NUM_NL80211_PKTPAT
+#define MAX_NL80211_WOWLAN_PKTPAT MAX_NL80211_PKTPAT
+#define nl80211_wowlan_pattern_support nl80211_pattern_support
+
 /**
  * enum nl80211_wowlan_triggers - WoWLAN trigger definitions
  * @__NL80211_WOWLAN_TRIG_INVALID: invalid number for nested attributes
@@ -3127,7 +3136,7 @@ struct nl80211_wowlan_pattern_support {
  *	pattern matching is done after the packet is converted to the MSDU.
  *
  *	In %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED, it is a binary attribute
- *	carrying a &struct nl80211_wowlan_pattern_support.
+ *	carrying a &struct nl80211_pattern_support.
  *
  *	When reporting wakeup. it is a u32 attribute containing the 0-based
  *	index of the pattern that caused the wakeup, in the patterns passed
@@ -3284,7 +3293,7 @@ struct nl80211_wowlan_tcp_data_token_feature {
  * @NL80211_WOWLAN_TCP_WAKE_PAYLOAD: wake packet payload, for advertising a
  *	u32 attribute holding the maximum length
  * @NL80211_WOWLAN_TCP_WAKE_MASK: Wake packet payload mask, not used for
- *	feature advertising. The mask works like @NL80211_WOWLAN_PKTPAT_MASK
+ *	feature advertising. The mask works like @NL80211_PKTPAT_MASK
  *	but on the TCP payload only.
  * @NUM_NL80211_WOWLAN_TCP: number of TCP attributes
  * @MAX_NL80211_WOWLAN_TCP: highest attribute number
-- 
cgit v1.2.3-70-g09d2


From be29b99a9b51b0338eea3c66a58de53bbd01de24 Mon Sep 17 00:00:00 2001
From: Amitkumar Karwar <akarwar@marvell.com>
Date: Fri, 28 Jun 2013 11:51:26 -0700
Subject: cfg80211/nl80211: Add packet coalesce support

In most cases, host that receives IPv4 and IPv6 multicast/broadcast
packets does not do anything with these packets. Therefore the
reception of these unwanted packets causes unnecessary processing
and power consumption.

Packet coalesce feature helps to reduce number of received
interrupts to host by buffering these packets in firmware/hardware
for some predefined time. Received interrupt will be generated when
one of the following events occur.
a) Expiration of hardware timer whose expiration time is set to
maximum coalescing delay of matching coalesce rule.
b) Coalescing buffer in hardware reaches it's limit.
c) Packet doesn't match any of the configured coalesce rules.

This patch adds set/get configuration support for packet coalesce.
User needs to configure following parameters for creating a coalesce
rule.
a) Maximum coalescing delay
b) List of packet patterns which needs to be matched
c) Condition for coalescence. pattern 'match' or 'no match'
Multiple such rules can be created.

This feature needs to be advertised during driver initialization.
Drivers are supposed to do required firmware/hardware settings based
on user configuration.

Signed-off-by: Amitkumar Karwar <akarwar@marvell.com>
Signed-off-by: Bing Zhao <bzhao@marvell.com>
[fix kernel-doc, change free function, fix copy/paste error]
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h       |  54 ++++++++
 include/uapi/linux/nl80211.h |  90 ++++++++++++-
 net/wireless/core.c          |   9 ++
 net/wireless/core.h          |   2 +
 net/wireless/nl80211.c       | 308 +++++++++++++++++++++++++++++++++++++++++++
 net/wireless/nl80211.h       |   2 +
 6 files changed, 463 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 49409602fe3..071ed2395c9 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1780,6 +1780,35 @@ struct cfg80211_wowlan {
 	int n_patterns;
 };
 
+/**
+ * struct cfg80211_coalesce_rules - Coalesce rule parameters
+ *
+ * This structure defines coalesce rule for the device.
+ * @delay: maximum coalescing delay in msecs.
+ * @condition: condition for packet coalescence.
+ *	see &enum nl80211_coalesce_condition.
+ * @patterns: array of packet patterns
+ * @n_patterns: number of patterns
+ */
+struct cfg80211_coalesce_rules {
+	int delay;
+	enum nl80211_coalesce_condition condition;
+	struct cfg80211_pkt_pattern *patterns;
+	int n_patterns;
+};
+
+/**
+ * struct cfg80211_coalesce - Packet coalescing settings
+ *
+ * This structure defines coalescing settings.
+ * @rules: array of coalesce rules
+ * @n_rules: number of rules
+ */
+struct cfg80211_coalesce {
+	struct cfg80211_coalesce_rules *rules;
+	int n_rules;
+};
+
 /**
  * struct cfg80211_wowlan_wakeup - wakeup report
  * @disconnect: woke up by getting disconnected
@@ -2076,6 +2105,7 @@ struct cfg80211_update_ft_ies_params {
  *	driver can take the most appropriate actions.
  * @crit_proto_stop: Indicates critical protocol no longer needs increased link
  *	reliability. This operation can not fail.
+ * @set_coalesce: Set coalesce parameters.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2311,6 +2341,8 @@ struct cfg80211_ops {
 				    u16 duration);
 	void	(*crit_proto_stop)(struct wiphy *wiphy,
 				   struct wireless_dev *wdev);
+	int	(*set_coalesce)(struct wiphy *wiphy,
+				struct cfg80211_coalesce *coalesce);
 };
 
 /*
@@ -2536,6 +2568,25 @@ struct wiphy_wowlan_support {
 	const struct wiphy_wowlan_tcp_support *tcp;
 };
 
+/**
+ * struct wiphy_coalesce_support - coalesce support data
+ * @n_rules: maximum number of coalesce rules
+ * @max_delay: maximum supported coalescing delay in msecs
+ * @n_patterns: number of supported patterns in a rule
+ *	(see nl80211.h for the pattern definition)
+ * @pattern_max_len: maximum length of each pattern
+ * @pattern_min_len: minimum length of each pattern
+ * @max_pkt_offset: maximum Rx packet offset
+ */
+struct wiphy_coalesce_support {
+	int n_rules;
+	int max_delay;
+	int n_patterns;
+	int pattern_max_len;
+	int pattern_min_len;
+	int max_pkt_offset;
+};
+
 /**
  * struct wiphy - wireless hardware description
  * @reg_notifier: the driver's regulatory notification callback,
@@ -2646,6 +2697,7 @@ struct wiphy_wowlan_support {
  *	802.11-2012 8.4.2.29 for the defined fields.
  * @extended_capabilities_mask: mask of the valid values
  * @extended_capabilities_len: length of the extended capabilities
+ * @coalesce: packet coalescing support information
  */
 struct wiphy {
 	/* assign these fields before you register the wiphy */
@@ -2755,6 +2807,8 @@ struct wiphy {
 	const struct iw_handler_def *wext;
 #endif
 
+	const struct wiphy_coalesce_support *coalesce;
+
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index de0ce809068..5abc54d14d4 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -125,6 +125,31 @@
  * interfaces that a given device supports.
  */
 
+/**
+ * DOC: packet coalesce support
+ *
+ * In most cases, host that receives IPv4 and IPv6 multicast/broadcast
+ * packets does not do anything with these packets. Therefore the
+ * reception of these unwanted packets causes unnecessary processing
+ * and power consumption.
+ *
+ * Packet coalesce feature helps to reduce number of received interrupts
+ * to host by buffering these packets in firmware/hardware for some
+ * predefined time. Received interrupt will be generated when one of the
+ * following events occur.
+ * a) Expiration of hardware timer whose expiration time is set to maximum
+ * coalescing delay of matching coalesce rule.
+ * b) Coalescing buffer in hardware reaches it's limit.
+ * c) Packet doesn't match any of the configured coalesce rules.
+ *
+ * User needs to configure following parameters for creating a coalesce
+ * rule.
+ * a) Maximum coalescing delay
+ * b) List of packet patterns which needs to be matched
+ * c) Condition for coalescence. pattern 'match' or 'no match'
+ * Multiple such rules can be created.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -648,6 +673,9 @@
  * @NL80211_CMD_CRIT_PROTOCOL_STOP: Indicates the connection reliability can
  *	return back to normal.
  *
+ * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules.
+ * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -810,6 +838,9 @@ enum nl80211_commands {
 	NL80211_CMD_CRIT_PROTOCOL_START,
 	NL80211_CMD_CRIT_PROTOCOL_STOP,
 
+	NL80211_CMD_GET_COALESCE,
+	NL80211_CMD_SET_COALESCE,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1436,6 +1467,8 @@ enum nl80211_commands {
  *	allowed to be used with the first @NL80211_CMD_SET_STATION command to
  *	update a TDLS peer STA entry.
  *
+ * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1736,6 +1769,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PEER_AID,
 
+	NL80211_ATTR_COALESCE_RULE,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3098,8 +3133,10 @@ enum nl80211_packet_pattern_attr {
  * @max_pkt_offset: maximum Rx packet offset
  *
  * This struct is carried in %NL80211_WOWLAN_TRIG_PKT_PATTERN when
- * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED in the
- * capability information given by the kernel to userspace.
+ * that is part of %NL80211_ATTR_WOWLAN_TRIGGERS_SUPPORTED or in
+ * %NL80211_ATTR_COALESCE_RULE_PKT_PATTERN when that is part of
+ * %NL80211_ATTR_COALESCE_RULE in the capability information given
+ * by the kernel to userspace.
  */
 struct nl80211_pattern_support {
 	__u32 max_patterns;
@@ -3317,6 +3354,55 @@ enum nl80211_wowlan_tcp_attrs {
 	MAX_NL80211_WOWLAN_TCP = NUM_NL80211_WOWLAN_TCP - 1
 };
 
+/**
+ * struct nl80211_coalesce_rule_support - coalesce rule support information
+ * @max_rules: maximum number of rules supported
+ * @pat: packet pattern support information
+ * @max_delay: maximum supported coalescing delay in msecs
+ *
+ * This struct is carried in %NL80211_ATTR_COALESCE_RULE in the
+ * capability information given by the kernel to userspace.
+ */
+struct nl80211_coalesce_rule_support {
+	__u32 max_rules;
+	struct nl80211_pattern_support pat;
+	__u32 max_delay;
+} __attribute__((packed));
+
+/**
+ * enum nl80211_attr_coalesce_rule - coalesce rule attribute
+ * @__NL80211_COALESCE_RULE_INVALID: invalid number for nested attribute
+ * @NL80211_ATTR_COALESCE_RULE_DELAY: delay in msecs used for packet coalescing
+ * @NL80211_ATTR_COALESCE_RULE_CONDITION: condition for packet coalescence,
+ *	see &enum nl80211_coalesce_condition.
+ * @NL80211_ATTR_COALESCE_RULE_PKT_PATTERN: packet offset, pattern is matched
+ *	after these fixed number of bytes of received packet
+ * @NUM_NL80211_ATTR_COALESCE_RULE: number of attributes
+ * @NL80211_ATTR_COALESCE_RULE_MAX: max attribute number
+ */
+enum nl80211_attr_coalesce_rule {
+	__NL80211_COALESCE_RULE_INVALID,
+	NL80211_ATTR_COALESCE_RULE_DELAY,
+	NL80211_ATTR_COALESCE_RULE_CONDITION,
+	NL80211_ATTR_COALESCE_RULE_PKT_PATTERN,
+
+	/* keep last */
+	NUM_NL80211_ATTR_COALESCE_RULE,
+	NL80211_ATTR_COALESCE_RULE_MAX = NUM_NL80211_ATTR_COALESCE_RULE - 1
+};
+
+/**
+ * enum nl80211_coalesce_condition - coalesce rule conditions
+ * @NL80211_COALESCE_CONDITION_MATCH: coalaesce Rx packets when patterns
+ *	in a rule are matched.
+ * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns
+ *	in a rule are not matched.
+ */
+enum nl80211_coalesce_condition {
+	NL80211_COALESCE_CONDITION_MATCH,
+	NL80211_COALESCE_CONDITION_NO_MATCH
+};
+
 /**
  * enum nl80211_iface_limit_attrs - limit attributes
  * @NL80211_IFACE_LIMIT_UNSPEC: (reserved)
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 4f9f216665e..389a3f2ee46 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -462,6 +462,14 @@ int wiphy_register(struct wiphy *wiphy)
 		return -EINVAL;
 #endif
 
+	if (WARN_ON(wiphy->coalesce &&
+		    (!wiphy->coalesce->n_rules ||
+		     !wiphy->coalesce->n_patterns) &&
+		    (!wiphy->coalesce->pattern_min_len ||
+		     wiphy->coalesce->pattern_min_len >
+			wiphy->coalesce->pattern_max_len)))
+		return -EINVAL;
+
 	if (WARN_ON(wiphy->ap_sme_capa &&
 		    !(wiphy->flags & WIPHY_FLAG_HAVE_AP_SME)))
 		return -EINVAL;
@@ -668,6 +676,7 @@ void wiphy_unregister(struct wiphy *wiphy)
 		rdev_set_wakeup(rdev, false);
 #endif
 	cfg80211_rdev_free_wowlan(rdev);
+	cfg80211_rdev_free_coalesce(rdev);
 }
 EXPORT_SYMBOL(wiphy_unregister);
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index a6b45bf00f3..9ad43c619c5 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -79,6 +79,8 @@ struct cfg80211_registered_device {
 	/* netlink port which started critical protocol (0 means not started) */
 	u32 crit_proto_nlportid;
 
+	struct cfg80211_coalesce *coalesce;
+
 	/* must be last because of the way we do wiphy_priv(),
 	 * and it should at least be aligned to NETDEV_ALIGN */
 	struct wiphy wiphy __aligned(NETDEV_ALIGN);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0492478ab74..6dca5a70017 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -403,6 +403,14 @@ nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = {
 	[NL80211_WOWLAN_TCP_WAKE_MASK] = { .len = 1 },
 };
 
+/* policy for coalesce rule attributes */
+static const struct nla_policy
+nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = {
+	[NL80211_ATTR_COALESCE_RULE_DELAY] = { .type = NLA_U32 },
+	[NL80211_ATTR_COALESCE_RULE_CONDITION] = { .type = NLA_U32 },
+	[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN] = { .type = NLA_NESTED },
+};
+
 /* policy for GTK rekey offload attributes */
 static const struct nla_policy
 nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = {
@@ -995,6 +1003,27 @@ static int nl80211_send_wowlan(struct sk_buff *msg,
 }
 #endif
 
+static int nl80211_send_coalesce(struct sk_buff *msg,
+				 struct cfg80211_registered_device *dev)
+{
+	struct nl80211_coalesce_rule_support rule;
+
+	if (!dev->wiphy.coalesce)
+		return 0;
+
+	rule.max_rules = dev->wiphy.coalesce->n_rules;
+	rule.max_delay = dev->wiphy.coalesce->max_delay;
+	rule.pat.max_patterns = dev->wiphy.coalesce->n_patterns;
+	rule.pat.min_pattern_len = dev->wiphy.coalesce->pattern_min_len;
+	rule.pat.max_pattern_len = dev->wiphy.coalesce->pattern_max_len;
+	rule.pat.max_pkt_offset = dev->wiphy.coalesce->max_pkt_offset;
+
+	if (nla_put(msg, NL80211_ATTR_COALESCE_RULE, sizeof(rule), &rule))
+		return -ENOBUFS;
+
+	return 0;
+}
+
 static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 				      struct ieee80211_supported_band *sband)
 {
@@ -1513,6 +1542,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev,
 			    dev->wiphy.vht_capa_mod_mask))
 			goto nla_put_failure;
 
+		state->split_start++;
+		break;
+	case 10:
+		if (nl80211_send_coalesce(msg, dev))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -8043,6 +8078,264 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
 }
 #endif
 
+static int nl80211_send_coalesce_rules(struct sk_buff *msg,
+				       struct cfg80211_registered_device *rdev)
+{
+	struct nlattr *nl_pats, *nl_pat, *nl_rule, *nl_rules;
+	int i, j, pat_len;
+	struct cfg80211_coalesce_rules *rule;
+
+	if (!rdev->coalesce->n_rules)
+		return 0;
+
+	nl_rules = nla_nest_start(msg, NL80211_ATTR_COALESCE_RULE);
+	if (!nl_rules)
+		return -ENOBUFS;
+
+	for (i = 0; i < rdev->coalesce->n_rules; i++) {
+		nl_rule = nla_nest_start(msg, i + 1);
+		if (!nl_rule)
+			return -ENOBUFS;
+
+		rule = &rdev->coalesce->rules[i];
+		if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_DELAY,
+				rule->delay))
+			return -ENOBUFS;
+
+		if (nla_put_u32(msg, NL80211_ATTR_COALESCE_RULE_CONDITION,
+				rule->condition))
+			return -ENOBUFS;
+
+		nl_pats = nla_nest_start(msg,
+				NL80211_ATTR_COALESCE_RULE_PKT_PATTERN);
+		if (!nl_pats)
+			return -ENOBUFS;
+
+		for (j = 0; j < rule->n_patterns; j++) {
+			nl_pat = nla_nest_start(msg, j + 1);
+			if (!nl_pat)
+				return -ENOBUFS;
+			pat_len = rule->patterns[j].pattern_len;
+			if (nla_put(msg, NL80211_PKTPAT_MASK,
+				    DIV_ROUND_UP(pat_len, 8),
+				    rule->patterns[j].mask) ||
+			    nla_put(msg, NL80211_PKTPAT_PATTERN, pat_len,
+				    rule->patterns[j].pattern) ||
+			    nla_put_u32(msg, NL80211_PKTPAT_OFFSET,
+					rule->patterns[j].pkt_offset))
+				return -ENOBUFS;
+			nla_nest_end(msg, nl_pat);
+		}
+		nla_nest_end(msg, nl_pats);
+		nla_nest_end(msg, nl_rule);
+	}
+	nla_nest_end(msg, nl_rules);
+
+	return 0;
+}
+
+static int nl80211_get_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!rdev->wiphy.coalesce)
+		return -EOPNOTSUPP;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0,
+			     NL80211_CMD_GET_COALESCE);
+	if (!hdr)
+		goto nla_put_failure;
+
+	if (rdev->coalesce && nl80211_send_coalesce_rules(msg, rdev))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev)
+{
+	struct cfg80211_coalesce *coalesce = rdev->coalesce;
+	int i, j;
+	struct cfg80211_coalesce_rules *rule;
+
+	if (!coalesce)
+		return;
+
+	for (i = 0; i < coalesce->n_rules; i++) {
+		rule = &coalesce->rules[i];
+		for (j = 0; j < rule->n_patterns; j++)
+			kfree(rule->patterns[j].mask);
+		kfree(rule->patterns);
+	}
+	kfree(coalesce->rules);
+	kfree(coalesce);
+	rdev->coalesce = NULL;
+}
+
+static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
+				       struct nlattr *rule,
+				       struct cfg80211_coalesce_rules *new_rule)
+{
+	int err, i;
+	const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+	struct nlattr *tb[NUM_NL80211_ATTR_COALESCE_RULE], *pat;
+	int rem, pat_len, mask_len, pkt_offset, n_patterns = 0;
+	struct nlattr *pat_tb[NUM_NL80211_PKTPAT];
+
+	err = nla_parse(tb, NL80211_ATTR_COALESCE_RULE_MAX, nla_data(rule),
+			nla_len(rule), nl80211_coalesce_policy);
+	if (err)
+		return err;
+
+	if (tb[NL80211_ATTR_COALESCE_RULE_DELAY])
+		new_rule->delay =
+			nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_DELAY]);
+	if (new_rule->delay > coalesce->max_delay)
+		return -EINVAL;
+
+	if (tb[NL80211_ATTR_COALESCE_RULE_CONDITION])
+		new_rule->condition =
+			nla_get_u32(tb[NL80211_ATTR_COALESCE_RULE_CONDITION]);
+	if (new_rule->condition != NL80211_COALESCE_CONDITION_MATCH &&
+	    new_rule->condition != NL80211_COALESCE_CONDITION_NO_MATCH)
+		return -EINVAL;
+
+	if (!tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN])
+		return -EINVAL;
+
+	nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+			    rem)
+		n_patterns++;
+	if (n_patterns > coalesce->n_patterns)
+		return -EINVAL;
+
+	new_rule->patterns = kcalloc(n_patterns, sizeof(new_rule->patterns[0]),
+				     GFP_KERNEL);
+	if (!new_rule->patterns)
+		return -ENOMEM;
+
+	new_rule->n_patterns = n_patterns;
+	i = 0;
+
+	nla_for_each_nested(pat, tb[NL80211_ATTR_COALESCE_RULE_PKT_PATTERN],
+			    rem) {
+		nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat),
+			  nla_len(pat), NULL);
+		if (!pat_tb[NL80211_PKTPAT_MASK] ||
+		    !pat_tb[NL80211_PKTPAT_PATTERN])
+			return -EINVAL;
+		pat_len = nla_len(pat_tb[NL80211_PKTPAT_PATTERN]);
+		mask_len = DIV_ROUND_UP(pat_len, 8);
+		if (nla_len(pat_tb[NL80211_PKTPAT_MASK]) != mask_len)
+			return -EINVAL;
+		if (pat_len > coalesce->pattern_max_len ||
+		    pat_len < coalesce->pattern_min_len)
+			return -EINVAL;
+
+		if (!pat_tb[NL80211_PKTPAT_OFFSET])
+			pkt_offset = 0;
+		else
+			pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]);
+		if (pkt_offset > coalesce->max_pkt_offset)
+			return -EINVAL;
+		new_rule->patterns[i].pkt_offset = pkt_offset;
+
+		new_rule->patterns[i].mask =
+			kmalloc(mask_len + pat_len, GFP_KERNEL);
+		if (!new_rule->patterns[i].mask)
+			return -ENOMEM;
+		new_rule->patterns[i].pattern =
+			new_rule->patterns[i].mask + mask_len;
+		memcpy(new_rule->patterns[i].mask,
+		       nla_data(pat_tb[NL80211_PKTPAT_MASK]), mask_len);
+		new_rule->patterns[i].pattern_len = pat_len;
+		memcpy(new_rule->patterns[i].pattern,
+		       nla_data(pat_tb[NL80211_PKTPAT_PATTERN]), pat_len);
+		i++;
+	}
+
+	return 0;
+}
+
+static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	const struct wiphy_coalesce_support *coalesce = rdev->wiphy.coalesce;
+	struct cfg80211_coalesce new_coalesce = {};
+	struct cfg80211_coalesce *n_coalesce;
+	int err, rem_rule, n_rules = 0, i, j;
+	struct nlattr *rule;
+	struct cfg80211_coalesce_rules *tmp_rule;
+
+	if (!rdev->wiphy.coalesce || !rdev->ops->set_coalesce)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) {
+		cfg80211_rdev_free_coalesce(rdev);
+		rdev->ops->set_coalesce(&rdev->wiphy, NULL);
+		return 0;
+	}
+
+	nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+			    rem_rule)
+		n_rules++;
+	if (n_rules > coalesce->n_rules)
+		return -EINVAL;
+
+	new_coalesce.rules = kcalloc(n_rules, sizeof(new_coalesce.rules[0]),
+				     GFP_KERNEL);
+	if (!new_coalesce.rules)
+		return -ENOMEM;
+
+	new_coalesce.n_rules = n_rules;
+	i = 0;
+
+	nla_for_each_nested(rule, info->attrs[NL80211_ATTR_COALESCE_RULE],
+			    rem_rule) {
+		err = nl80211_parse_coalesce_rule(rdev, rule,
+						  &new_coalesce.rules[i]);
+		if (err)
+			goto error;
+
+		i++;
+	}
+
+	err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce);
+	if (err)
+		goto error;
+
+	n_coalesce = kmemdup(&new_coalesce, sizeof(new_coalesce), GFP_KERNEL);
+	if (!n_coalesce) {
+		err = -ENOMEM;
+		goto error;
+	}
+	cfg80211_rdev_free_coalesce(rdev);
+	rdev->coalesce = n_coalesce;
+
+	return 0;
+error:
+	for (i = 0; i < new_coalesce.n_rules; i++) {
+		tmp_rule = &new_coalesce.rules[i];
+		for (j = 0; j < tmp_rule->n_patterns; j++)
+			kfree(tmp_rule->patterns[j].mask);
+		kfree(tmp_rule->patterns);
+	}
+	kfree(new_coalesce.rules);
+
+	return err;
+}
+
 static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -9050,6 +9343,21 @@ static struct genl_ops nl80211_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = NL80211_FLAG_NEED_WDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_GET_COALESCE,
+		.doit = nl80211_get_coalesce,
+		.policy = nl80211_policy,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
+	},
+	{
+		.cmd = NL80211_CMD_SET_COALESCE,
+		.doit = nl80211_set_coalesce,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_WIPHY |
+				  NL80211_FLAG_NEED_RTNL,
 	}
 };
 
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index a4073e808c1..44341bf53cf 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -74,4 +74,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 		     enum nl80211_radar_event event,
 		     struct net_device *netdev, gfp_t gfp);
 
+void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev);
+
 #endif /* __NET_WIRELESS_NL80211_H */
-- 
cgit v1.2.3-70-g09d2


From dcd6eac1f3b5fa1df11dfa99da0cf75b76cfef97 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Mon, 8 Jul 2013 16:55:49 +0200
Subject: nl80211: add scan width to bss and scan request structs

To allow scanning and working with 5 MHz and 10 MHz BSS, extend the
inform bss commands and add wrappers to take 5 and 10 MHz bss into
account.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
---
 include/net/cfg80211.h       | 54 +++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/nl80211.h | 18 +++++++++++++++
 net/wireless/nl80211.c       |  1 +
 net/wireless/scan.c          | 31 +++++++++++++++----------
 net/wireless/trace.h         | 12 ++++++----
 5 files changed, 97 insertions(+), 19 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 071ed2395c9..bae8614a450 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1285,6 +1285,7 @@ struct cfg80211_ssid {
  * @n_ssids: number of SSIDs
  * @channels: channels to scan on.
  * @n_channels: total number of channels to scan
+ * @scan_width: channel width for scanning
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
  * @flags: bit field of flags controlling operation
@@ -1300,6 +1301,7 @@ struct cfg80211_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
+	enum nl80211_bss_scan_width scan_width;
 	const u8 *ie;
 	size_t ie_len;
 	u32 flags;
@@ -1333,6 +1335,7 @@ struct cfg80211_match_set {
  * @ssids: SSIDs to scan for (passed in the probe_reqs in active scans)
  * @n_ssids: number of SSIDs
  * @n_channels: total number of channels to scan
+ * @scan_width: channel width for scanning
  * @interval: interval between each scheduled scan cycle
  * @ie: optional information element(s) to add into Probe Request or %NULL
  * @ie_len: length of ie in octets
@@ -1352,6 +1355,7 @@ struct cfg80211_sched_scan_request {
 	struct cfg80211_ssid *ssids;
 	int n_ssids;
 	u32 n_channels;
+	enum nl80211_bss_scan_width scan_width;
 	u32 interval;
 	const u8 *ie;
 	size_t ie_len;
@@ -1403,6 +1407,7 @@ struct cfg80211_bss_ies {
  * for use in scan results and similar.
  *
  * @channel: channel this BSS is on
+ * @scan_width: width of the control channel
  * @bssid: BSSID of the BSS
  * @beacon_interval: the beacon interval as from the frame
  * @capability: the capability field in host byte order
@@ -1424,6 +1429,7 @@ struct cfg80211_bss_ies {
  */
 struct cfg80211_bss {
 	struct ieee80211_channel *channel;
+	enum nl80211_bss_scan_width scan_width;
 
 	const struct cfg80211_bss_ies __rcu *ies;
 	const struct cfg80211_bss_ies __rcu *beacon_ies;
@@ -3438,10 +3444,11 @@ void cfg80211_sched_scan_results(struct wiphy *wiphy);
 void cfg80211_sched_scan_stopped(struct wiphy *wiphy);
 
 /**
- * cfg80211_inform_bss_frame - inform cfg80211 of a received BSS frame
+ * cfg80211_inform_bss_width_frame - inform cfg80211 of a received BSS frame
  *
  * @wiphy: the wiphy reporting the BSS
  * @channel: The channel the frame was received on
+ * @scan_width: width of the control channel
  * @mgmt: the management frame (probe response or beacon)
  * @len: length of the management frame
  * @signal: the signal strength, type depends on the wiphy's signal_type
@@ -3454,16 +3461,29 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy);
  * Or %NULL on error.
  */
 struct cfg80211_bss * __must_check
+cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
+				struct ieee80211_channel *channel,
+				enum nl80211_bss_scan_width scan_width,
+				struct ieee80211_mgmt *mgmt, size_t len,
+				s32 signal, gfp_t gfp);
+
+static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss_frame(struct wiphy *wiphy,
 			  struct ieee80211_channel *channel,
 			  struct ieee80211_mgmt *mgmt, size_t len,
-			  s32 signal, gfp_t gfp);
+			  s32 signal, gfp_t gfp)
+{
+	return cfg80211_inform_bss_width_frame(wiphy, channel,
+					       NL80211_BSS_CHAN_WIDTH_20,
+					       mgmt, len, signal, gfp);
+}
 
 /**
  * cfg80211_inform_bss - inform cfg80211 of a new BSS
  *
  * @wiphy: the wiphy reporting the BSS
  * @channel: The channel the frame was received on
+ * @scan_width: width of the control channel
  * @bssid: the BSSID of the BSS
  * @tsf: the TSF sent by the peer in the beacon/probe response (or 0)
  * @capability: the capability field sent by the peer
@@ -3480,11 +3500,26 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
  * Or %NULL on error.
  */
 struct cfg80211_bss * __must_check
+cfg80211_inform_bss_width(struct wiphy *wiphy,
+			  struct ieee80211_channel *channel,
+			  enum nl80211_bss_scan_width scan_width,
+			  const u8 *bssid, u64 tsf, u16 capability,
+			  u16 beacon_interval, const u8 *ie, size_t ielen,
+			  s32 signal, gfp_t gfp);
+
+static inline struct cfg80211_bss * __must_check
 cfg80211_inform_bss(struct wiphy *wiphy,
 		    struct ieee80211_channel *channel,
 		    const u8 *bssid, u64 tsf, u16 capability,
 		    u16 beacon_interval, const u8 *ie, size_t ielen,
-		    s32 signal, gfp_t gfp);
+		    s32 signal, gfp_t gfp)
+{
+	return cfg80211_inform_bss_width(wiphy, channel,
+					 NL80211_BSS_CHAN_WIDTH_20,
+					 bssid, tsf, capability,
+					 beacon_interval, ie, ielen, signal,
+					 gfp);
+}
 
 struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy,
 				      struct ieee80211_channel *channel,
@@ -3530,6 +3565,19 @@ void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
  */
 void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *bss);
 
+static inline enum nl80211_bss_scan_width
+cfg80211_chandef_to_scan_width(const struct cfg80211_chan_def *chandef)
+{
+	switch (chandef->width) {
+	case NL80211_CHAN_WIDTH_5:
+		return NL80211_BSS_CHAN_WIDTH_5;
+	case NL80211_CHAN_WIDTH_10:
+		return NL80211_BSS_CHAN_WIDTH_10;
+	default:
+		return NL80211_BSS_CHAN_WIDTH_20;
+	}
+}
+
 /**
  * cfg80211_rx_mlme_mgmt - notification of processed MLME management frame
  * @dev: network device
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5abc54d14d4..eb68735b331 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2807,6 +2807,21 @@ enum nl80211_chan_width {
 	NL80211_CHAN_WIDTH_10,
 };
 
+/**
+ * enum nl80211_bss_scan_width - control channel width for a BSS
+ *
+ * These values are used with the %NL80211_BSS_CHAN_WIDTH attribute.
+ *
+ * @NL80211_BSS_CHAN_WIDTH_20: control channel is 20 MHz wide or compatible
+ * @NL80211_BSS_CHAN_WIDTH_10: control channel is 10 MHz wide
+ * @NL80211_BSS_CHAN_WIDTH_5: control channel is 5 MHz wide
+ */
+enum nl80211_bss_scan_width {
+	NL80211_BSS_CHAN_WIDTH_20,
+	NL80211_BSS_CHAN_WIDTH_10,
+	NL80211_BSS_CHAN_WIDTH_5,
+};
+
 /**
  * enum nl80211_bss - netlink attributes for a BSS
  *
@@ -2831,6 +2846,8 @@ enum nl80211_chan_width {
  * @NL80211_BSS_BEACON_IES: binary attribute containing the raw information
  *	elements from a Beacon frame (bin); not present if no Beacon frame has
  *	yet been received
+ * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel
+ *	(u32, enum nl80211_bss_scan_width)
  * @__NL80211_BSS_AFTER_LAST: internal
  * @NL80211_BSS_MAX: highest BSS attribute
  */
@@ -2847,6 +2864,7 @@ enum nl80211_bss {
 	NL80211_BSS_STATUS,
 	NL80211_BSS_SEEN_MS_AGO,
 	NL80211_BSS_BEACON_IES,
+	NL80211_BSS_CHAN_WIDTH,
 
 	/* keep last */
 	__NL80211_BSS_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6dca5a70017..ef4c312cc92 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5674,6 +5674,7 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 		goto nla_put_failure;
 	if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) ||
 	    nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) ||
+	    nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) ||
 	    nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO,
 			jiffies_to_msecs(jiffies - intbss->ts)))
 		goto nla_put_failure;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index ae8c186b50d..ad1e4068ce0 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -651,6 +651,8 @@ static bool cfg80211_combine_bsses(struct cfg80211_registered_device *dev,
 			continue;
 		if (bss->pub.channel != new->pub.channel)
 			continue;
+		if (bss->pub.scan_width != new->pub.scan_width)
+			continue;
 		if (rcu_access_pointer(bss->pub.beacon_ies))
 			continue;
 		ies = rcu_access_pointer(bss->pub.ies);
@@ -870,11 +872,12 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
 
 /* Returned bss is reference counted and must be cleaned up appropriately. */
 struct cfg80211_bss*
-cfg80211_inform_bss(struct wiphy *wiphy,
-		    struct ieee80211_channel *channel,
-		    const u8 *bssid, u64 tsf, u16 capability,
-		    u16 beacon_interval, const u8 *ie, size_t ielen,
-		    s32 signal, gfp_t gfp)
+cfg80211_inform_bss_width(struct wiphy *wiphy,
+			  struct ieee80211_channel *channel,
+			  enum nl80211_bss_scan_width scan_width,
+			  const u8 *bssid, u64 tsf, u16 capability,
+			  u16 beacon_interval, const u8 *ie, size_t ielen,
+			  s32 signal, gfp_t gfp)
 {
 	struct cfg80211_bss_ies *ies;
 	struct cfg80211_internal_bss tmp = {}, *res;
@@ -892,6 +895,7 @@ cfg80211_inform_bss(struct wiphy *wiphy,
 
 	memcpy(tmp.pub.bssid, bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
+	tmp.pub.scan_width = scan_width;
 	tmp.pub.signal = signal;
 	tmp.pub.beacon_interval = beacon_interval;
 	tmp.pub.capability = capability;
@@ -924,14 +928,15 @@ cfg80211_inform_bss(struct wiphy *wiphy,
 	/* cfg80211_bss_update gives us a referenced result */
 	return &res->pub;
 }
-EXPORT_SYMBOL(cfg80211_inform_bss);
+EXPORT_SYMBOL(cfg80211_inform_bss_width);
 
 /* Returned bss is reference counted and must be cleaned up appropriately. */
 struct cfg80211_bss *
-cfg80211_inform_bss_frame(struct wiphy *wiphy,
-			  struct ieee80211_channel *channel,
-			  struct ieee80211_mgmt *mgmt, size_t len,
-			  s32 signal, gfp_t gfp)
+cfg80211_inform_bss_width_frame(struct wiphy *wiphy,
+				struct ieee80211_channel *channel,
+				enum nl80211_bss_scan_width scan_width,
+				struct ieee80211_mgmt *mgmt, size_t len,
+				s32 signal, gfp_t gfp)
 {
 	struct cfg80211_internal_bss tmp = {}, *res;
 	struct cfg80211_bss_ies *ies;
@@ -941,7 +946,8 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
 			offsetof(struct ieee80211_mgmt, u.beacon.variable));
 
-	trace_cfg80211_inform_bss_frame(wiphy, channel, mgmt, len, signal);
+	trace_cfg80211_inform_bss_width_frame(wiphy, channel, scan_width, mgmt,
+					      len, signal);
 
 	if (WARN_ON(!mgmt))
 		return NULL;
@@ -976,6 +982,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	
 	memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN);
 	tmp.pub.channel = channel;
+	tmp.pub.scan_width = scan_width;
 	tmp.pub.signal = signal;
 	tmp.pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
 	tmp.pub.capability = le16_to_cpu(mgmt->u.probe_resp.capab_info);
@@ -991,7 +998,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
 	/* cfg80211_bss_update gives us a referenced result */
 	return &res->pub;
 }
-EXPORT_SYMBOL(cfg80211_inform_bss_frame);
+EXPORT_SYMBOL(cfg80211_inform_bss_width_frame);
 
 void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub)
 {
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index e1534baf2eb..09af6eb426a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2391,26 +2391,30 @@ TRACE_EVENT(cfg80211_get_bss,
 		  __entry->capa_mask, __entry->capa_val)
 );
 
-TRACE_EVENT(cfg80211_inform_bss_frame,
+TRACE_EVENT(cfg80211_inform_bss_width_frame,
 	TP_PROTO(struct wiphy *wiphy, struct ieee80211_channel *channel,
+		 enum nl80211_bss_scan_width scan_width,
 		 struct ieee80211_mgmt *mgmt, size_t len,
 		 s32 signal),
-	TP_ARGS(wiphy, channel, mgmt, len, signal),
+	TP_ARGS(wiphy, channel, scan_width, mgmt, len, signal),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		CHAN_ENTRY
+		__field(enum nl80211_bss_scan_width, scan_width)
 		__dynamic_array(u8, mgmt, len)
 		__field(s32, signal)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		CHAN_ASSIGN(channel);
+		__entry->scan_width = scan_width;
 		if (mgmt)
 			memcpy(__get_dynamic_array(mgmt), mgmt, len);
 		__entry->signal = signal;
 	),
-	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "signal: %d",
-		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->signal)
+	TP_printk(WIPHY_PR_FMT ", " CHAN_PR_FMT "(scan_width: %d) signal: %d",
+		  WIPHY_PR_ARG, CHAN_PR_ARG, __entry->scan_width,
+		  __entry->signal)
 );
 
 DECLARE_EVENT_CLASS(cfg80211_bss_evt,
-- 
cgit v1.2.3-70-g09d2


From eda297729171fe16bf34fe5b0419dfb69060f623 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Fri, 19 Jul 2013 19:40:10 +0200
Subject: tun: Support software transmit time stamping.

This patch adds transmit time stamping to the tun/tap driver. Similar
support already exists for UDP, can, and raw packets.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 14 ++++++++++++--
 include/uapi/linux/if_tun.h |  3 +++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db690a37226..a72d141047c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -739,6 +739,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 			  >= dev->tx_queue_len / tun->numqueues)
 		goto drop;
 
+	if (skb->sk) {
+		sock_tx_timestamp(skb->sk, &skb_shinfo(skb)->tx_flags);
+		sw_tx_timestamp(skb);
+	}
+
 	/* Orphan the skb - required as we might hang on to it
 	 * for indefinite time. */
 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
@@ -1476,7 +1481,6 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
 	return ret;
 }
 
-
 static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 		       struct msghdr *m, size_t total_len,
 		       int flags)
@@ -1488,10 +1492,15 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (!tun)
 		return -EBADFD;
 
-	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
+	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
 		ret = -EINVAL;
 		goto out;
 	}
+	if (flags & MSG_ERRQUEUE) {
+		ret = sock_recv_errqueue(sock->sk, m, total_len,
+					 SOL_PACKET, TUN_TX_TIMESTAMP);
+		goto out;
+	}
 	ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
@@ -2274,6 +2283,7 @@ static const struct ethtool_ops tun_ethtool_ops = {
 	.get_msglevel	= tun_get_msglevel,
 	.set_msglevel	= tun_set_msglevel,
 	.get_link	= ethtool_op_get_link,
+	.get_ts_info	= ethtool_op_get_ts_info,
 };
 
 
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 82334f88967..1870ee29bb3 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -71,6 +71,9 @@
 /* read-only flag */
 #define IFF_PERSIST	0x0800
 
+/* Socket options */
+#define TUN_TX_TIMESTAMP 1
+
 /* Features for GSO (TUNSETOFFLOAD). */
 #define TUN_F_CSUM	0x01	/* You can hand me unchecksummed packets. */
 #define TUN_F_TSO4	0x02	/* I can handle TSO for IPv4 packets */
-- 
cgit v1.2.3-70-g09d2


From 91705c61b52029ab5da67a15a23eef08667bf40e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 23 Jul 2013 14:51:47 +0200
Subject: net: sctp: trivial: update mailing list address

The SCTP mailing list address to send patches or questions
to is linux-sctp@vger.kernel.org and not
lksctp-developers@lists.sourceforge.net anymore. Therefore,
update all occurences.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/sctp.txt | 5 +----
 include/net/sctp/auth.h           | 2 +-
 include/net/sctp/checksum.h       | 2 +-
 include/net/sctp/constants.h      | 2 +-
 include/net/sctp/sctp.h           | 2 +-
 include/net/sctp/sm.h             | 2 +-
 include/net/sctp/structs.h        | 2 +-
 include/net/sctp/tsnmap.h         | 2 +-
 include/net/sctp/ulpevent.h       | 2 +-
 include/net/sctp/ulpqueue.h       | 2 +-
 include/uapi/linux/sctp.h         | 2 +-
 net/sctp/associola.c              | 2 +-
 net/sctp/auth.c                   | 2 +-
 net/sctp/bind_addr.c              | 2 +-
 net/sctp/chunk.c                  | 2 +-
 net/sctp/command.c                | 2 +-
 net/sctp/debug.c                  | 2 +-
 net/sctp/endpointola.c            | 2 +-
 net/sctp/input.c                  | 2 +-
 net/sctp/inqueue.c                | 2 +-
 net/sctp/ipv6.c                   | 2 +-
 net/sctp/objcnt.c                 | 2 +-
 net/sctp/output.c                 | 2 +-
 net/sctp/outqueue.c               | 2 +-
 net/sctp/primitive.c              | 2 +-
 net/sctp/proc.c                   | 2 +-
 net/sctp/protocol.c               | 4 ++--
 net/sctp/sm_make_chunk.c          | 2 +-
 net/sctp/sm_sideeffect.c          | 2 +-
 net/sctp/sm_statefuns.c           | 2 +-
 net/sctp/sm_statetable.c          | 2 +-
 net/sctp/socket.c                 | 2 +-
 net/sctp/ssnmap.c                 | 2 +-
 net/sctp/sysctl.c                 | 2 +-
 net/sctp/transport.c              | 2 +-
 net/sctp/tsnmap.c                 | 2 +-
 net/sctp/ulpevent.c               | 2 +-
 net/sctp/ulpqueue.c               | 2 +-
 38 files changed, 39 insertions(+), 42 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/sctp.txt b/Documentation/networking/sctp.txt
index 0c790a76910..97b810ca908 100644
--- a/Documentation/networking/sctp.txt
+++ b/Documentation/networking/sctp.txt
@@ -19,7 +19,6 @@ of SCTP that is RFC 2960 compliant and provides an programming interface
 referred to as the  UDP-style API of the Sockets Extensions for SCTP, as 
 proposed in IETF Internet-Drafts.    
 
-
 Caveats:  
 
 -lksctp can be built as statically or as a module.  However, be aware that 
@@ -33,6 +32,4 @@ For more information, please visit the lksctp project website:
    http://www.sf.net/projects/lksctp
 
 Or contact the lksctp developers through the mailing list:
-   <lksctp-developers@lists.sourceforge.net> 
-
-
+   <linux-sctp@vger.kernel.org>
diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index 49bc9577c61..754c511f4cf 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index 0cb08e6fb6d..78b88aa8c81 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index ca50e0751e4..a1e7aa5c8bb 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index d8e37ecea69..554cf88605f 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 2a82d138470..2aa66dda4a5 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e745c92a153..75c4c16601b 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h
index 2c5d2b4d5d1..c361d6f7e0c 100644
--- a/include/net/sctp/tsnmap.h
+++ b/include/net/sctp/tsnmap.h
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index ca4693b4e09..34f0d34d22c 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -31,7 +31,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index 00e50ba3f24..add3237a9fd 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email addresses:
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 66b466e4ca0..ca451e99b28 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index bce5b79662a..e425ba0bd79 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index ba1dfc3f8de..3aab967081b 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 64977ea0f9c..f34ce8bc139 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 5780565f5b7..b50b90c16a8 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -24,7 +24,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/command.c b/net/sctp/command.c
index c0044019db9..f4bebdadf00 100644
--- a/net/sctp/command.c
+++ b/net/sctp/command.c
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index f4998780d6d..44aa4e7543a 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 9e3d257de0e..825b7543a43 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 3fa4d858c35..7993495a4c0 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cb25f040fed..e70f60ae92e 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 09ffcc912d2..85d688f59e6 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index fe012c44f8d..aec346cbed6 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -26,7 +26,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a46d1eb4176..5a55c55d71a 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -26,7 +26,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index ef9e2bbc0f2..51313233e63 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index 794bb14decd..31fa437da11 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 62526c47705..aff0cac45b6 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -22,7 +22,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 4a17494d736..b52ec251010 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
@@ -1547,7 +1547,7 @@ module_exit(sctp_exit);
  */
 MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132");
 MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
-MODULE_AUTHOR("Linux Kernel SCTP developers <lksctp-developers@lists.sourceforge.net>");
+MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>");
 MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
 module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
 MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification");
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 362ae6e2fd9..780a2d4b98a 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -29,7 +29,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 9da68852ee9..f1f3aac3f3b 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index f6b7109195a..93271f0966c 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 84d98d8a5a7..64ea5fd87eb 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c6670d2e3f8..02457123bde 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -34,7 +34,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index da860352380..72b5939101b 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -24,7 +24,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 9a5c4c9edda..190674702b2 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -25,7 +25,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index bdbbc3fd7c1..9602c52aa61 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -30,7 +30,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c
index b46019568a8..0eff8667d4c 100644
--- a/net/sctp/tsnmap.c
+++ b/net/sctp/tsnmap.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 44a45dbee4d..c07624fca7e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -28,7 +28,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index 04e3d470f87..2cdb3010f50 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -27,7 +27,7 @@
  *
  * Please send any bug reports or fixes you make to the
  * email address(es):
- *    lksctp developers <lksctp-developers@lists.sourceforge.net>
+ *    lksctp developers <linux-sctp@vger.kernel.org>
  *
  * Or submit a bug report through the following website:
  *    http://www.sf.net/projects/lksctp
-- 
cgit v1.2.3-70-g09d2


From c9bee3b7fdecb0c1d070c7b54113b3bdfb9a3d36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Jul 2013 20:27:07 -0700
Subject: tcp: TCP_NOTSENT_LOWAT socket option

Idea of this patch is to add optional limitation of number of
unsent bytes in TCP sockets, to reduce usage of kernel memory.

TCP receiver might announce a big window, and TCP sender autotuning
might allow a large amount of bytes in write queue, but this has little
performance impact if a large part of this buffering is wasted :

Write queue needs to be large only to deal with large BDP, not
necessarily to cope with scheduling delays (incoming ACKS make room
for the application to queue more bytes)

For most workloads, using a value of 128 KB or less is OK to give
applications enough time to react to POLLOUT events in time
(or being awaken in a blocking sendmsg())

This patch adds two ways to set the limit :

1) Per socket option TCP_NOTSENT_LOWAT

2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets
not using TCP_NOTSENT_LOWAT socket option (or setting a zero value)
Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect.

This changes poll()/select()/epoll() to report POLLOUT
only if number of unsent bytes is below tp->nosent_lowat

Note this might increase number of sendmsg()/sendfile() calls
when using non blocking sockets,
and increase number of context switches for blocking sockets.

Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is
defined as :
 Specify the minimum number of bytes in the buffer until
 the socket layer will pass the data to the protocol)

Tested:

netperf sessions, and watching /proc/net/protocols "memory" column for TCP

With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory
used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458)

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   45458   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   45458   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   20567   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   20567   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y

Using 128KB has no bad effect on the throughput or cpu usage
of a single flow, although there is an increase of context switches.

A bonus is that we hold socket lock for a shorter amount
of time and should improve latencies of ACK processing.

lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1651584     6291456     16384  20.00   17447.90   10^6bits/s  3.13  S      -1.00  U      0.353   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

           412,514 context-switches

     200.034645535 seconds time elapsed

lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1593240     6291456     16384  20.00   17321.16   10^6bits/s  3.35  S      -1.00  U      0.381   -1.000  usec/KB

 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':

         2,675,818 context-switches

     200.029651391 seconds time elapsed

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 13 +++++++++++++
 include/linux/tcp.h                    |  1 +
 include/net/sock.h                     | 19 +++++++++++++------
 include/net/tcp.h                      | 14 ++++++++++++++
 include/uapi/linux/tcp.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp.c                         |  7 +++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  |  3 +++
 net/ipv6/tcp_ipv6.c                    |  1 +
 10 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 10742902146..53cea9bcb14 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max
 	this value is ignored.
 	Default: between 64K and 4MB, depending on RAM size.
 
+tcp_notsent_lowat - UNSIGNED INTEGER
+	A TCP socket can control the amount of unsent bytes in its write queue,
+	thanks to TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll()
+	reports POLLOUT events if the amount of unsent bytes is below a per
+	socket value, and if the write queue is not full. sendmsg() will
+	also not add new buffers if the limit is hit.
+
+	This global variable controls the amount of unsent data for
+	sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change
+	to the global variable has immediate effect.
+
+	Default: UINT_MAX (0xFFFFFFFF)
+
 tcp_workaround_signed_windows - BOOLEAN
 	If set, assume no receipt of a window scaling option means the
 	remote TCP is broken and treats the window as a signed quantity.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 472120b4fac..9640803a17a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_sock {
 
  	u32	rcv_wnd;	/* Current receiver window		*/
 	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
+	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
 	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
 	u32	lost_out;	/* Lost packets			*/
 	u32	sacked_out;	/* SACK'd packets			*/
diff --git a/include/net/sock.h b/include/net/sock.h
index d0b5fdee50a..b9f2b095b1a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk)
 
 extern void sk_stream_write_space(struct sock *sk);
 
-static inline bool sk_stream_memory_free(const struct sock *sk)
-{
-	return sk->sk_wmem_queued < sk->sk_sndbuf;
-}
-
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
@@ -950,6 +945,7 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif
 
+	bool			(*stream_memory_free)(const struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
 }
 #endif
 
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	return sk->sk_prot->stream_memory_free ?
+		sk->sk_prot->stream_memory_free(sk) : true;
+}
+
 static inline bool sk_stream_is_writeable(const struct sock *sk)
 {
-	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+	return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
+	       sk_stream_memory_free(sk);
 }
 
+
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
 	return sk->sk_prot->memory_pressure != NULL;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c5868471aba..18fc999dae3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern unsigned int sysctl_tcp_notsent_lowat;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb);
 extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr,
 				__be32 daddr);
 
+static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
+{
+	return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat;
+}
+
+static inline bool tcp_stream_memory_free(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 notsent_bytes = tp->write_seq - tp->snd_nxt;
+
+	return notsent_bytes < tcp_notsent_lowat(tp);
+}
+
 #ifdef CONFIG_PROC_FS
 extern int tcp4_proc_init(void);
 extern void tcp4_proc_exit(void);
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8d776ebc482..377f1e59411 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -111,6 +111,7 @@ enum {
 #define TCP_REPAIR_OPTIONS	22
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 #define TCP_TIMESTAMP		24
+#define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2c123c44d6..69ed203802d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -554,6 +554,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "tcp_notsent_lowat",
+		.data		= &sysctl_tcp_notsent_lowat,
+		.maxlen		= sizeof(sysctl_tcp_notsent_lowat),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_rmem",
 		.data		= &sysctl_tcp_rmem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5eca9060bb8..c27e8139239 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		tp->notsent_lowat = val;
+		sk->sk_write_space(sk);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
+	case TCP_NOTSENT_LOWAT:
+		val = tp->notsent_lowat;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e3f129df0e..2a5d5c469d1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2800,6 +2800,7 @@ struct proto tcp_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa8..884efff5b53 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 80fe69ef218..b792e870686 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
-- 
cgit v1.2.3-70-g09d2


From e7428e95a06fb516fac1308bd0e176e27c0b9287 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 25 Jul 2013 10:20:23 +0930
Subject: virtio-net: put virtio net header inline with data

For small packets we can simplify xmit processing
by linearizing buffers with the header:
most packets seem to have enough head room
we can use for this purpose.
Since existing hypervisors require that header
is the first s/g element, we need a feature bit
for this.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/virtio_net.c        | 42 +++++++++++++++++++++++++++++++++--------
 include/uapi/linux/virtio_net.h |  4 +++-
 2 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3d2a90a6264..f2160027758 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -106,6 +106,9 @@ struct virtnet_info {
 	/* Has control virtqueue */
 	bool has_cvq;
 
+	/* Host can handle any s/g split between our header and packet data */
+	bool any_header_sg;
+
 	/* enable config space updates */
 	bool config_enable;
 
@@ -669,12 +672,28 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 
 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 {
-	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
+	struct skb_vnet_hdr *hdr;
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 	struct virtnet_info *vi = sq->vq->vdev->priv;
 	unsigned num_sg;
+	unsigned hdr_len;
+	bool can_push;
 
 	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
+	if (vi->mergeable_rx_bufs)
+		hdr_len = sizeof hdr->mhdr;
+	else
+		hdr_len = sizeof hdr->hdr;
+
+	can_push = vi->any_header_sg &&
+		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
+		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
+	/* Even if we can, don't push here yet as this would skew
+	 * csum_start offset below. */
+	if (can_push)
+		hdr = (struct skb_vnet_hdr *)(skb->data - hdr_len);
+	else
+		hdr = skb_vnet_hdr(skb);
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
@@ -703,15 +722,18 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		hdr->hdr.gso_size = hdr->hdr.hdr_len = 0;
 	}
 
-	hdr->mhdr.num_buffers = 0;
-
-	/* Encode metadata header at front. */
 	if (vi->mergeable_rx_bufs)
-		sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
-	else
-		sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
+		hdr->mhdr.num_buffers = 0;
 
-	num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+	if (can_push) {
+		__skb_push(skb, hdr_len);
+		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
+		/* Pull header back to avoid skew in tx bytes calculations. */
+		__skb_pull(skb, hdr_len);
+	} else {
+		sg_set_buf(sq->sg, hdr, hdr_len);
+		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+	}
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
@@ -1552,6 +1574,9 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT))
+		vi->any_header_sg = true;
+
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
 		vi->has_cvq = true;
 
@@ -1727,6 +1752,7 @@ static unsigned int features[] = {
 	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
 	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
 	VIRTIO_NET_F_CTRL_MAC_ADDR,
+	VIRTIO_F_ANY_LAYOUT,
 };
 
 static struct virtio_driver virtio_net_driver = {
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index c520203fac2..227d4ce84e7 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -70,7 +70,9 @@ struct virtio_net_config {
 	__u16 max_virtqueue_pairs;
 } __attribute__((packed));
 
-/* This is the first element of the scatter-gather list.  If you don't
+/* This header comes first in the scatter-gather list.
+ * If VIRTIO_F_ANY_LAYOUT is not negotiated, it must
+ * be the first element of the scatter-gather list.  If you don't
  * specify GSO or CSUM features, you can simply ignore the header. */
 struct virtio_net_hdr {
 #define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
-- 
cgit v1.2.3-70-g09d2


From 66cae9ed6bc46b8cc57a9693f99f69926f3cc7ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Mon, 29 Jul 2013 18:16:50 +0200
Subject: rtnl: export physical port id via RT netlink

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Narendra K <narendra_k@dell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 03f6170ab33..04c0e7a5d48 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -143,6 +143,7 @@ enum {
 	IFLA_NUM_TX_QUEUES,
 	IFLA_NUM_RX_QUEUES,
 	IFLA_CARRIER,
+	IFLA_PHYS_PORT_ID,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 3de740834d1..0b2972c59b9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -767,7 +767,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
 	       + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
 	       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
-	       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
+	       + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */
+	       + nla_total_size(MAX_PHYS_PORT_ID_LEN); /* IFLA_PHYS_PORT_ID */
 }
 
 static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -846,6 +847,24 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	int err;
+	struct netdev_phys_port_id ppid;
+
+	err = dev_get_phys_port_id(dev, &ppid);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask)
@@ -913,6 +932,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
+	if (rtnl_phys_port_id_fill(skb, dev))
+		goto nla_put_failure;
+
 	attr = nla_reserve(skb, IFLA_STATS,
 			sizeof(struct rtnl_link_stats));
 	if (attr == NULL)
@@ -1113,6 +1135,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PROMISCUITY]	= { .type = NLA_U32 },
 	[IFLA_NUM_TX_QUEUES]	= { .type = NLA_U32 },
 	[IFLA_NUM_RX_QUEUES]	= { .type = NLA_U32 },
+	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_PORT_ID_LEN },
 };
 EXPORT_SYMBOL(ifla_policy);
 
-- 
cgit v1.2.3-70-g09d2


From 7764a45a8f1fe74d4f7d301eaca2e558e7e2831a Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Thu, 1 Aug 2013 02:17:15 +0200
Subject: fib_rules: add .suppress operation

This change adds a new operation to the fib_rules_ops struct; it allows the
suppression of routing decisions if certain criteria are not met by its
results.

The first implemented constraint is a minimum prefix length added to the
structures of routing rules. If a rule is added with a minimum prefix length
>0, only routes meeting this threshold will be considered. Any other (more
general) routing table entries will be ignored.

When configuring a system with multiple network uplinks and default routes, it
is often convinient to reference the main routing table multiple times - but
omitting the default route. Using this patch and a modified "ip" utility, this
can be achieved by using the following command sequence:

  $ ip route add table secuplink default via 10.42.23.1

  $ ip rule add pref 100            table main prefixlength 1
  $ ip rule add pref 150 fwmark 0xA table secuplink

With this setup, packets marked 0xA will be processed by the additional routing
table "secuplink", but only if no suitable route in the main routing table can
be found. By using a minimal prefixlength of 1, the default route (/0) of the
table "main" is hidden to packets processed by rule 100; packets traveling to
destinations with more specific routing entries are processed as usual.

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  4 ++++
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           |  8 ++++++++
 net/ipv4/fib_rules.c           | 14 ++++++++++++++
 net/ipv6/fib6_rules.c          | 13 +++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index e361f488242..2f286dce925 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -18,6 +18,7 @@ struct fib_rule {
 	u32			pref;
 	u32			flags;
 	u32			table;
+	u8			table_prefixlen_min;
 	u8			action;
 	u32			target;
 	struct fib_rule __rcu	*ctarget;
@@ -46,6 +47,8 @@ struct fib_rules_ops {
 	int			(*action)(struct fib_rule *,
 					  struct flowi *, int,
 					  struct fib_lookup_arg *);
+	bool			(*suppress)(struct fib_rule *,
+					    struct fib_lookup_arg *);
 	int			(*match)(struct fib_rule *,
 					 struct flowi *, int);
 	int			(*configure)(struct fib_rule *,
@@ -80,6 +83,7 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
+	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 51da65b68b8..59cd31b3455 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -45,7 +45,7 @@ enum {
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_UNUSED7,
-	FRA_UNUSED8,
+	FRA_TABLE_PREFIXLEN_MIN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
 	FRA_OIFNAME,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 21735440c44..2ef5040c99c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -226,6 +226,9 @@ jumped:
 		else
 			err = ops->action(rule, fl, flags, arg);
 
+		if (!err && ops->suppress && ops->suppress(rule, arg))
+			continue;
+
 		if (err != -EAGAIN) {
 			if ((arg->flags & FIB_LOOKUP_NOREF) ||
 			    likely(atomic_inc_not_zero(&rule->refcnt))) {
@@ -337,6 +340,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
+	if (tb[FRA_TABLE_PREFIXLEN_MIN])
+		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
 
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
@@ -523,6 +528,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
+			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
 
@@ -548,6 +554,8 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->table = rule->table;
 	if (nla_put_u32(skb, FRA_TABLE, rule->table))
 		goto nla_put_failure;
+	if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min))
+		goto nla_put_failure;
 	frh->res1 = 0;
 	frh->res2 = 0;
 	frh->action = rule->action;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 26aa65d1fce..9f2906679d1 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,6 +101,19 @@ errout:
 	return err;
 }
 
+static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	struct fib_result *result = (struct fib_result *) arg->result;
+	if (result->prefixlen < rule->table_prefixlen_min) {
+		if (!(arg->flags & FIB_LOOKUP_NOREF))
+			fib_info_put(result->fi);
+		return true;
+	}
+	return false;
+}
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -267,6 +280,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
 	.action		= fib4_rule_action,
+	.suppress	= fib4_rule_suppress,
 	.match		= fib4_rule_match,
 	.configure	= fib4_rule_configure,
 	.delete		= fib4_rule_delete,
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2e1a432867c..e64e6a55fc4 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -111,6 +111,18 @@ out:
 	return rt == NULL ? -EAGAIN : 0;
 }
 
+static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+{
+	struct rt6_info *rt = (struct rt6_info *) arg->result;
+	/* do not accept result if the route does
+	 * not meet the required prefix length
+	 */
+	if (rt->rt6i_dst.plen < rule->table_prefixlen_min) {
+		ip6_rt_put(rt);
+		return true;
+	}
+	return false;
+}
 
 static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
@@ -244,6 +256,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
 	.addr_size		= sizeof(struct in6_addr),
 	.action			= fib6_rule_action,
 	.match			= fib6_rule_match,
+	.suppress		= fib6_rule_suppress,
 	.configure		= fib6_rule_configure,
 	.compare		= fib6_rule_compare,
 	.fill			= fib6_rule_fill,
-- 
cgit v1.2.3-70-g09d2


From 16ef1fe272332b2f7fd99236017b891db48d9cd6 Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <simon.wunderlich@s2003.tu-chemnitz.de>
Date: Thu, 11 Jul 2013 16:09:05 +0200
Subject: nl80211/cfg80211: add channel switch command

To allow channel switch announcements within beacons, add
the channel switch command to nl80211/cfg80211. This is
implementation is intended for AP and (later) IBSS mode.

Signed-off-by: Simon Wunderlich <siwu@hrz.tu-chemnitz.de>
Signed-off-by: Mathias Kretschmer <mathias.kretschmer@fokus.fraunhofer.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  33 ++++++++++++
 include/uapi/linux/nl80211.h |  30 +++++++++++
 net/wireless/nl80211.c       | 122 ++++++++++++++++++++++++++++++++++++++++++-
 net/wireless/rdev-ops.h      |  12 +++++
 net/wireless/trace.h         |  33 ++++++++++++
 5 files changed, 229 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index aeaf6dff6e0..b7495c72061 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -665,6 +665,30 @@ struct cfg80211_ap_settings {
 	bool radar_required;
 };
 
+/**
+ * struct cfg80211_csa_settings - channel switch settings
+ *
+ * Used for channel switch
+ *
+ * @chandef: defines the channel to use after the switch
+ * @beacon_csa: beacon data while performing the switch
+ * @counter_offset_beacon: offset for the counter within the beacon (tail)
+ * @counter_offset_presp: offset for the counter within the probe response
+ * @beacon_after: beacon data to be used on the new channel
+ * @radar_required: whether radar detection is required on the new channel
+ * @block_tx: whether transmissions should be blocked while changing
+ * @count: number of beacons until switch
+ */
+struct cfg80211_csa_settings {
+	struct cfg80211_chan_def chandef;
+	struct cfg80211_beacon_data beacon_csa;
+	u16 counter_offset_beacon, counter_offset_presp;
+	struct cfg80211_beacon_data beacon_after;
+	bool radar_required;
+	bool block_tx;
+	u8 count;
+};
+
 /**
  * enum station_parameters_apply_mask - station parameter values to apply
  * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
@@ -2139,6 +2163,8 @@ struct cfg80211_update_ft_ies_params {
  * @crit_proto_stop: Indicates critical protocol no longer needs increased link
  *	reliability. This operation can not fail.
  * @set_coalesce: Set coalesce parameters.
+ *
+ * @channel_switch: initiate channel-switch procedure (with CSA)
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -2376,6 +2402,10 @@ struct cfg80211_ops {
 				   struct wireless_dev *wdev);
 	int	(*set_coalesce)(struct wiphy *wiphy,
 				struct cfg80211_coalesce *coalesce);
+
+	int	(*channel_switch)(struct wiphy *wiphy,
+				  struct net_device *dev,
+				  struct cfg80211_csa_settings *params);
 };
 
 /*
@@ -2441,6 +2471,8 @@ struct cfg80211_ops {
  * @WIPHY_FLAG_OFFCHAN_TX: Device supports direct off-channel TX.
  * @WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL: Device supports remain-on-channel call.
  * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels.
+ * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in
+ *	beaconing mode (AP, IBSS, Mesh, ...).
  */
 enum wiphy_flags {
 	WIPHY_FLAG_CUSTOM_REGULATORY		= BIT(0),
@@ -2465,6 +2497,7 @@ enum wiphy_flags {
 	WIPHY_FLAG_OFFCHAN_TX			= BIT(20),
 	WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL	= BIT(21),
 	WIPHY_FLAG_SUPPORTS_5_10_MHZ		= BIT(22),
+	WIPHY_FLAG_HAS_CHANNEL_SWITCH		= BIT(23),
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index eb68735b331..1f42bc3dcb9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -676,6 +676,16 @@
  * @NL80211_CMD_GET_COALESCE: Get currently supported coalesce rules.
  * @NL80211_CMD_SET_COALESCE: Configure coalesce rules or clear existing rules.
  *
+ * @NL80211_CMD_CHANNEL_SWITCH: Perform a channel switch by announcing the
+ *	the new channel information (Channel Switch Announcement - CSA)
+ *	in the beacon for some time (as defined in the
+ *	%NL80211_ATTR_CH_SWITCH_COUNT parameter) and then change to the
+ *	new channel. Userspace provides the new channel information (using
+ *	%NL80211_ATTR_WIPHY_FREQ and the attributes determining channel
+ *	width). %NL80211_ATTR_CH_SWITCH_BLOCK_TX may be supplied to inform
+ *	other station that transmission must be blocked until the channel
+ *	switch is complete.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -841,6 +851,8 @@ enum nl80211_commands {
 	NL80211_CMD_GET_COALESCE,
 	NL80211_CMD_SET_COALESCE,
 
+	NL80211_CMD_CHANNEL_SWITCH,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1469,6 +1481,18 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information.
  *
+ * @NL80211_ATTR_CH_SWITCH_COUNT: u32 attribute specifying the number of TBTT's
+ *	until the channel switch event.
+ * @NL80211_ATTR_CH_SWITCH_BLOCK_TX: flag attribute specifying that transmission
+ *	must be blocked on the current channel (before the channel switch
+ *	operation).
+ * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information
+ *	for the time while performing a channel switch.
+ * @NL80211_ATTR_CSA_C_OFF_BEACON: Offset of the channel switch counter
+ *	field in the beacons tail (%NL80211_ATTR_BEACON_TAIL).
+ * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter
+ *	field in the probe response (%NL80211_ATTR_PROBE_RESP).
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1771,6 +1795,12 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_COALESCE_RULE,
 
+	NL80211_ATTR_CH_SWITCH_COUNT,
+	NL80211_ATTR_CH_SWITCH_BLOCK_TX,
+	NL80211_ATTR_CSA_IES,
+	NL80211_ATTR_CSA_C_OFF_BEACON,
+	NL80211_ATTR_CSA_C_OFF_PRESP,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 03d4ef95292..f7cb12178bd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -349,6 +349,11 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = {
 	[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
 				  .len = IEEE80211_MAX_DATA_LEN },
 	[NL80211_ATTR_PEER_AID] = { .type = NLA_U16 },
+	[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
+	[NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED },
+	[NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_U16 },
+	[NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_U16 },
 };
 
 /* policy for the key attributes */
@@ -1422,6 +1427,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *dev,
 		if (state->split) {
 			CMD(crit_proto_start, CRIT_PROTOCOL_START);
 			CMD(crit_proto_stop, CRIT_PROTOCOL_STOP);
+			if (dev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH)
+				CMD(channel_switch, CHANNEL_SWITCH);
 		}
 
 #ifdef CONFIG_NL80211_TESTMODE
@@ -5613,6 +5620,111 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	return err;
 }
 
+static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_csa_settings params;
+	/* csa_attrs is defined static to avoid waste of stack size - this
+	 * function is called under RTNL lock, so this should not be a problem.
+	 */
+	static struct nlattr *csa_attrs[NL80211_ATTR_MAX+1];
+	u8 radar_detect_width = 0;
+	int err;
+
+	if (!rdev->ops->channel_switch ||
+	    !(rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH))
+		return -EOPNOTSUPP;
+
+	/* may add IBSS support later */
+	if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
+	    dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+		return -EOPNOTSUPP;
+
+	memset(&params, 0, sizeof(params));
+
+	if (!info->attrs[NL80211_ATTR_WIPHY_FREQ] ||
+	    !info->attrs[NL80211_ATTR_CH_SWITCH_COUNT])
+		return -EINVAL;
+
+	/* only important for AP, IBSS and mesh create IEs internally */
+	if (!info->attrs[NL80211_ATTR_CSA_IES])
+		return -EINVAL;
+
+	/* useless if AP is not running */
+	if (!wdev->beacon_interval)
+		return -EINVAL;
+
+	params.count = nla_get_u32(info->attrs[NL80211_ATTR_CH_SWITCH_COUNT]);
+
+	err = nl80211_parse_beacon(info->attrs, &params.beacon_after);
+	if (err)
+		return err;
+
+	err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX,
+			       info->attrs[NL80211_ATTR_CSA_IES],
+			       nl80211_policy);
+	if (err)
+		return err;
+
+	err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa);
+	if (err)
+		return err;
+
+	if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON])
+		return -EINVAL;
+
+	params.counter_offset_beacon =
+		nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]);
+	if (params.counter_offset_beacon >= params.beacon_csa.tail_len)
+		return -EINVAL;
+
+	/* sanity check - counters should be the same */
+	if (params.beacon_csa.tail[params.counter_offset_beacon] !=
+	    params.count)
+		return -EINVAL;
+
+	if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) {
+		params.counter_offset_presp =
+			nla_get_u16(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]);
+		if (params.counter_offset_presp >=
+		    params.beacon_csa.probe_resp_len)
+			return -EINVAL;
+
+		if (params.beacon_csa.probe_resp[params.counter_offset_presp] !=
+		    params.count)
+			return -EINVAL;
+	}
+
+	err = nl80211_parse_chandef(rdev, info, &params.chandef);
+	if (err)
+		return err;
+
+	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef))
+		return -EINVAL;
+
+	err = cfg80211_chandef_dfs_required(wdev->wiphy, &params.chandef);
+	if (err < 0) {
+		return err;
+	} else if (err) {
+		radar_detect_width = BIT(params.chandef.width);
+		params.radar_required = true;
+	}
+
+	err = cfg80211_can_use_iftype_chan(rdev, wdev, wdev->iftype,
+					   params.chandef.chan,
+					   CHAN_MODE_SHARED,
+					   radar_detect_width);
+	if (err)
+		return err;
+
+	if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
+		params.block_tx = true;
+
+	return rdev_channel_switch(rdev, dev, &params);
+}
+
 static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			    u32 seq, int flags,
 			    struct cfg80211_registered_device *rdev,
@@ -9361,7 +9473,15 @@ static struct genl_ops nl80211_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = NL80211_FLAG_NEED_WIPHY |
 				  NL80211_FLAG_NEED_RTNL,
-	}
+	},
+	{
+		.cmd = NL80211_CMD_CHANNEL_SWITCH,
+		.doit = nl80211_channel_switch,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_multicast_group nl80211_mlme_mcgrp = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 9f15f0ac824..de870d4d0bc 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -923,4 +923,16 @@ static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_void(&rdev->wiphy);
 }
 
+static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev,
+				      struct net_device *dev,
+				      struct cfg80211_csa_settings *params)
+{
+	int ret;
+
+	trace_rdev_channel_switch(&rdev->wiphy, dev, params);
+	ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 09af6eb426a..f0ebdcd394e 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1841,6 +1841,39 @@ TRACE_EVENT(rdev_crit_proto_stop,
 		  WIPHY_PR_ARG, WDEV_PR_ARG)
 );
 
+TRACE_EVENT(rdev_channel_switch,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_csa_settings *params),
+	TP_ARGS(wiphy, netdev, params),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		CHAN_DEF_ENTRY
+		__field(u16, counter_offset_beacon)
+		__field(u16, counter_offset_presp)
+		__field(bool, radar_required)
+		__field(bool, block_tx)
+		__field(u8, count)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		CHAN_DEF_ASSIGN(&params->chandef);
+		__entry->counter_offset_beacon = params->counter_offset_beacon;
+		__entry->counter_offset_presp = params->counter_offset_presp;
+		__entry->radar_required = params->radar_required;
+		__entry->block_tx = params->block_tx;
+		__entry->count = params->count;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
+		  ", block_tx: %d, count: %u, radar_required: %d"
+		  ", counter offsets (beacon/presp): %u/%u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+		  __entry->block_tx, __entry->count, __entry->radar_required,
+		  __entry->counter_offset_beacon,
+		  __entry->counter_offset_presp)
+);
+
 /*************************************************************
  *	     cfg80211 exported functions traces		     *
  *************************************************************/
-- 
cgit v1.2.3-70-g09d2


From e216975ad97cfcfc436789aa66d59a0e93f337f7 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 1 Aug 2013 16:17:47 -0700
Subject: uapi: Convert some uses of 6 to ETH_ALEN

Use the #define where appropriate.

Add #include <linux/if_ether.h>
where appropriate too.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/dn.h                           | 3 ++-
 include/uapi/linux/if_bridge.h                    | 3 ++-
 include/uapi/linux/netfilter_bridge/ebt_802_3.h   | 5 +++--
 include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++-
 include/uapi/linux/virtio_net.h                   | 2 +-
 include/uapi/linux/wimax/i2400m.h                 | 4 ++--
 6 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/dn.h b/include/uapi/linux/dn.h
index 9c50445462d..5fbdd3d49eb 100644
--- a/include/uapi/linux/dn.h
+++ b/include/uapi/linux/dn.h
@@ -2,6 +2,7 @@
 #define _LINUX_DN_H
 
 #include <linux/types.h>
+#include <linux/if_ether.h>
 
 /*
 
@@ -120,7 +121,7 @@ struct linkinfo_dn {
  * Ethernet address format (for DECnet)
  */
 union etheraddress {
-        __u8 dne_addr[6];             /* Full ethernet address */
+        __u8 dne_addr[ETH_ALEN];      /* Full ethernet address */
   struct {
                 __u8 dne_hiord[4];    /* DECnet HIORD prefix   */
                 __u8 dne_nodeaddr[2]; /* DECnet node address   */
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 2d70d79ce2f..39f621a9fe8 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -14,6 +14,7 @@
 #define _UAPI_LINUX_IF_BRIDGE_H
 
 #include <linux/types.h>
+#include <linux/if_ether.h>
 
 #define SYSFS_BRIDGE_ATTR	"bridge"
 #define SYSFS_BRIDGE_FDB	"brforward"
@@ -88,7 +89,7 @@ struct __port_info {
 };
 
 struct __fdb_entry {
-	__u8 mac_addr[6];
+	__u8 mac_addr[ETH_ALEN];
 	__u8 port_no;
 	__u8 is_local;
 	__u32 ageing_timer_value;
diff --git a/include/uapi/linux/netfilter_bridge/ebt_802_3.h b/include/uapi/linux/netfilter_bridge/ebt_802_3.h
index 5bf84912a08..f37522aade2 100644
--- a/include/uapi/linux/netfilter_bridge/ebt_802_3.h
+++ b/include/uapi/linux/netfilter_bridge/ebt_802_3.h
@@ -2,6 +2,7 @@
 #define _UAPI__LINUX_BRIDGE_EBT_802_3_H
 
 #include <linux/types.h>
+#include <linux/if_ether.h>
 
 #define EBT_802_3_SAP 0x01
 #define EBT_802_3_TYPE 0x02
@@ -42,8 +43,8 @@ struct hdr_ni {
 };
 
 struct ebt_802_3_hdr {
-	__u8  daddr[6];
-	__u8  saddr[6];
+	__u8  daddr[ETH_ALEN];
+	__u8  saddr[ETH_ALEN];
 	__be16 len;
 	union {
 		struct hdr_ui ui;
diff --git a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h
index c6a204c9704..eac0f6548f4 100644
--- a/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h
+++ b/include/uapi/linux/netfilter_ipv4/ipt_CLUSTERIP.h
@@ -2,6 +2,7 @@
 #define _IPT_CLUSTERIP_H_target
 
 #include <linux/types.h>
+#include <linux/if_ether.h>
 
 enum clusterip_hashmode {
     CLUSTERIP_HASHMODE_SIP = 0,
@@ -22,7 +23,7 @@ struct ipt_clusterip_tgt_info {
 	__u32 flags;
 
 	/* only relevant for new ones */
-	__u8 clustermac[6];
+	__u8 clustermac[ETH_ALEN];
 	__u16 num_total_nodes;
 	__u16 num_local_nodes;
 	__u16 local_nodes[CLUSTERIP_MAX_NODES];
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 227d4ce84e7..172a7f00780 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -60,7 +60,7 @@
 
 struct virtio_net_config {
 	/* The config defining mac address (if VIRTIO_NET_F_MAC) */
-	__u8 mac[6];
+	__u8 mac[ETH_ALEN];
 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
 	__u16 status;
 	/* Maximum number of each of transmit and receive queues;
diff --git a/include/uapi/linux/wimax/i2400m.h b/include/uapi/linux/wimax/i2400m.h
index 62d35615356..fd198bc24a3 100644
--- a/include/uapi/linux/wimax/i2400m.h
+++ b/include/uapi/linux/wimax/i2400m.h
@@ -122,7 +122,7 @@
 #define __LINUX__WIMAX__I2400M_H__
 
 #include <linux/types.h>
-
+#include <linux/if_ether.h>
 
 /*
  * Host Device Interface (HDI) common to all busses
@@ -487,7 +487,7 @@ struct i2400m_tlv_l4_message_versions {
 struct i2400m_tlv_detailed_device_info {
 	struct i2400m_tlv_hdr hdr;
 	__u8 reserved1[400];
-	__u8 mac_address[6];
+	__u8 mac_address[ETH_ALEN];
 	__u8 reserved2[2];
 } __attribute__((packed));
 
-- 
cgit v1.2.3-70-g09d2


From 6ef94cfafba159d6b1a902ccb3349ac6a34ff6ad Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Fri, 2 Aug 2013 17:19:56 +0200
Subject: fib_rules: add route suppression based on ifgroup

This change adds the ability to suppress a routing decision based upon the
interface group the selected interface belongs to. This allows it to
exclude specific devices from a routing decision.

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  2 ++
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           | 10 ++++++++++
 net/ipv4/fib_rules.c           | 23 +++++++++++++++++------
 net/ipv6/fib6_rules.c          | 16 +++++++++++++---
 5 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 2f286dce925..d13c461b4b5 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -18,6 +18,7 @@ struct fib_rule {
 	u32			pref;
 	u32			flags;
 	u32			table;
+	int			suppress_ifgroup;
 	u8			table_prefixlen_min;
 	u8			action;
 	u32			target;
@@ -84,6 +85,7 @@ struct fib_rules_ops {
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
 	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
+	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 59cd31b3455..63e31166e85 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -44,7 +44,7 @@ enum {
 	FRA_FWMARK,	/* mark */
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
-	FRA_UNUSED7,
+	FRA_SUPPRESS_IFGROUP,
 	FRA_TABLE_PREFIXLEN_MIN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 2ef5040c99c..5040a61bf28 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -343,6 +343,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (tb[FRA_TABLE_PREFIXLEN_MIN])
 		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
 
+	if (tb[FRA_SUPPRESS_IFGROUP])
+		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
 
@@ -529,6 +532,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
 			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
+			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
 
@@ -588,6 +592,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	    (rule->target &&
 	     nla_put_u32(skb, FRA_GOTO, rule->target)))
 		goto nla_put_failure;
+
+	if (rule->suppress_ifgroup != -1) {
+		if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
+			goto nla_put_failure;
+	}
+
 	if (ops->fill(rule, skb, frh) < 0)
 		goto nla_put_failure;
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 9f2906679d1..b78fd28970c 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -103,16 +103,27 @@ errout:
 
 static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
+	struct fib_result *result = (struct fib_result *) arg->result;
+	struct net_device *dev = result->fi->fib_dev;
+
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	struct fib_result *result = (struct fib_result *) arg->result;
-	if (result->prefixlen < rule->table_prefixlen_min) {
-		if (!(arg->flags & FIB_LOOKUP_NOREF))
-			fib_info_put(result->fi);
-		return true;
-	}
+	if (result->prefixlen < rule->table_prefixlen_min)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
 	return false;
+
+suppress_route:
+	if (!(arg->flags & FIB_LOOKUP_NOREF))
+		fib_info_put(result->fi);
+	return true;
 }
 
 static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 554a4fbabfb..36283267e2f 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -122,14 +122,24 @@ out:
 static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = (struct rt6_info *) arg->result;
+	struct net_device *dev = rt->rt6i_idev->dev;
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (rt->rt6i_dst.plen < rule->table_prefixlen_min) {
+	if (rt->rt6i_dst.plen < rule->table_prefixlen_min)
+		goto suppress_route;
+
+	/* do not accept result if the route uses a device
+	 * belonging to a forbidden interface group
+	 */
+	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
+		goto suppress_route;
+
+	return false;
+
+suppress_route:
 		ip6_rt_put(rt);
 		return true;
-	}
-	return false;
 }
 
 static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
-- 
cgit v1.2.3-70-g09d2


From 73f5698e77219bfc3ea1903759fe8e20ab5b285e Mon Sep 17 00:00:00 2001
From: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Date: Sat, 3 Aug 2013 14:14:43 +0200
Subject: fib_rules: fix suppressor names and default values

This change brings the suppressor attribute names into line; it also changes
the data types to provide a more consistent interface.

While -1 indicates that the suppressor is not enabled, values >= 0 for
suppress_prefixlen or suppress_ifgroup  reject routing decisions violating the
constraint.

This changes the previously presented behaviour of suppress_prefixlen, where a
prefix length _less_ than the attribute value was rejected. After this change,
a prefix length less than *or* equal to the value is considered a violation of
the rule constraint.

It also changes the default values for default and newly added rules (disabling
any suppression for those).

Signed-off-by: Stefan Tomanek <stefan.tomanek@wertarbyte.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        |  4 ++--
 include/uapi/linux/fib_rules.h |  2 +-
 net/core/fib_rules.c           | 15 +++++++++++----
 net/ipv4/fib_rules.c           |  2 +-
 net/ipv6/fib6_rules.c          |  2 +-
 5 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index d13c461b4b5..9d0fcbaa9cb 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -19,7 +19,7 @@ struct fib_rule {
 	u32			flags;
 	u32			table;
 	int			suppress_ifgroup;
-	u8			table_prefixlen_min;
+	int			suppress_prefixlen;
 	u8			action;
 	u32			target;
 	struct fib_rule __rcu	*ctarget;
@@ -84,7 +84,7 @@ struct fib_rules_ops {
 	[FRA_FWMARK]	= { .type = NLA_U32 }, \
 	[FRA_FWMASK]	= { .type = NLA_U32 }, \
 	[FRA_TABLE]     = { .type = NLA_U32 }, \
-	[FRA_TABLE_PREFIXLEN_MIN] = { .type = NLA_U8 }, \
+	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }
 
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 63e31166e85..2b82d7e3097 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -45,7 +45,7 @@ enum {
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_SUPPRESS_IFGROUP,
-	FRA_TABLE_PREFIXLEN_MIN,
+	FRA_SUPPRESS_PREFIXLEN,
 	FRA_TABLE,	/* Extended table id */
 	FRA_FWMASK,	/* mask for netfilter mark */
 	FRA_OIFNAME,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 5040a61bf28..2e654138433 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -33,6 +33,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 	r->flags = flags;
 	r->fr_net = hold_net(ops->fro_net);
 
+	r->suppress_prefixlen = -1;
+	r->suppress_ifgroup = -1;
+
 	/* The lock is not required here, the list in unreacheable
 	 * at the moment this function is called */
 	list_add_tail(&r->list, &ops->rules_list);
@@ -340,11 +343,15 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	rule->action = frh->action;
 	rule->flags = frh->flags;
 	rule->table = frh_get_table(frh, tb);
-	if (tb[FRA_TABLE_PREFIXLEN_MIN])
-		rule->table_prefixlen_min = nla_get_u8(tb[FRA_TABLE_PREFIXLEN_MIN]);
+	if (tb[FRA_SUPPRESS_PREFIXLEN])
+		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+	else
+		rule->suppress_prefixlen = -1;
 
 	if (tb[FRA_SUPPRESS_IFGROUP])
 		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+	else
+		rule->suppress_ifgroup = -1;
 
 	if (!tb[FRA_PRIORITY] && ops->default_pref)
 		rule->pref = ops->default_pref(ops);
@@ -531,7 +538,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
 			 + nla_total_size(4) /* FRA_PRIORITY */
 			 + nla_total_size(4) /* FRA_TABLE */
-			 + nla_total_size(1) /* FRA_TABLE_PREFIXLEN_MIN */
+			 + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
 			 + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4); /* FRA_FWMASK */
@@ -558,7 +565,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	frh->table = rule->table;
 	if (nla_put_u32(skb, FRA_TABLE, rule->table))
 		goto nla_put_failure;
-	if (nla_put_u8(skb, FRA_TABLE_PREFIXLEN_MIN, rule->table_prefixlen_min))
+	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
 		goto nla_put_failure;
 	frh->res1 = 0;
 	frh->res2 = 0;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b78fd28970c..523be38e37d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -109,7 +109,7 @@ static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (result->prefixlen < rule->table_prefixlen_min)
+	if (result->prefixlen <= rule->suppress_prefixlen)
 		goto suppress_route;
 
 	/* do not accept result if the route uses a device
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 36283267e2f..a6c58ce43d3 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -126,7 +126,7 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
 	/* do not accept result if the route does
 	 * not meet the required prefix length
 	 */
-	if (rt->rt6i_dst.plen < rule->table_prefixlen_min)
+	if (rt->rt6i_dst.plen <= rule->suppress_prefixlen)
 		goto suppress_route;
 
 	/* do not accept result if the route uses a device
-- 
cgit v1.2.3-70-g09d2


From 1f07d03e2069df2ecd82301936b598a3b257c6d6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Aug 2013 03:32:11 -0700
Subject: net: add SNMP counters tracking incoming ECN bits

With GRO/LRO processing, there is a problem because Ip[6]InReceives SNMP
counters do not count the number of frames, but number of aggregated
segments.

Its probably too late to change this now.

This patch adds four new counters, tracking number of frames, regardless
of LRO/GRO, and on a per ECN status basis, for IPv4 and IPv6.

Ip[6]NoECTPkts : Number of packets received with NOECT
Ip[6]ECT1Pkts  : Number of packets received with ECT(1)
Ip[6]ECT0Pkts  : Number of packets received with ECT(0)
Ip[6]CEPkts    : Number of packets received with Congestion Experienced

lph37:~# nstat | egrep "Pkts|InReceive"
IpInReceives                    1634137            0.0
Ip6InReceives                   3714107            0.0
Ip6InNoECTPkts                  19205              0.0
Ip6InECT0Pkts                   52651828           0.0
IpExtInNoECTPkts                33630              0.0
IpExtInECT0Pkts                 15581379           0.0
IpExtInCEPkts                   6                  0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/snmp.h | 4 ++++
 net/ipv4/ip_input.c       | 8 ++++++++
 net/ipv4/proc.c           | 7 ++++++-
 net/ipv6/ip6_input.c      | 6 +++++-
 net/ipv6/proc.c           | 4 ++++
 5 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index af0a674cc67..60601d28da7 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -51,6 +51,10 @@ enum
 	IPSTATS_MIB_INBCASTOCTETS,		/* InBcastOctets */
 	IPSTATS_MIB_OUTBCASTOCTETS,		/* OutBcastOctets */
 	IPSTATS_MIB_CSUMERRORS,			/* InCsumErrors */
+	IPSTATS_MIB_NOECTPKTS,			/* InNoECTPkts */
+	IPSTATS_MIB_ECT1PKTS,			/* InECT1Pkts */
+	IPSTATS_MIB_ECT0PKTS,			/* InECT0Pkts */
+	IPSTATS_MIB_CEPKTS,			/* InCEPkts */
 	__IPSTATS_MIB_MAX
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 15e3e683ade..054a3e97d82 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -141,6 +141,7 @@
 #include <net/icmp.h>
 #include <net/raw.h>
 #include <net/checksum.h>
+#include <net/inet_ecn.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/xfrm.h>
 #include <linux/mroute.h>
@@ -410,6 +411,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	if (iph->ihl < 5 || iph->version != 4)
 		goto inhdr_error;
 
+	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
+	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
+	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
+	IP_ADD_STATS_BH(dev_net(dev),
+			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+
 	if (!pskb_may_pull(skb, iph->ihl*4))
 		goto inhdr_error;
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6577a1149a4..5f5fa612647 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -111,7 +111,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
 	SNMP_MIB_SENTINEL
 };
 
-/* Following RFC4293 items are displayed in /proc/net/netstat */
+/* Following items are displayed in /proc/net/netstat */
 static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
 	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
@@ -125,7 +125,12 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
 	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
 	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	/* Non RFC4293 fields */
 	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
+	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 2bab2aa5974..302d6fb1ff2 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -44,7 +44,7 @@
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
 #include <net/xfrm.h>
-
+#include <net/inet_ecn.h>
 
 
 int ip6_rcv_finish(struct sk_buff *skb)
@@ -109,6 +109,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	if (hdr->version != 6)
 		goto err;
 
+	IP6_ADD_STATS_BH(dev_net(dev), idev,
+			 IPSTATS_MIB_NOECTPKTS +
+				(ipv6_get_dsfield(hdr) & INET_ECN_MASK),
+			 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
 	/*
 	 * RFC4291 2.5.3
 	 * A packet received on an interface with a destination address
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 51c3285b5d9..091d066a57b 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -91,6 +91,10 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
 	SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
 	SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
 	/* IPSTATS_MIB_CSUMERRORS is not relevant in IPv6 (no checksum) */
+	SNMP_MIB_ITEM("Ip6InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
+	SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
+	SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
+	SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3-70-g09d2


From bd0779370588386e4a67ba5d0b176cfded8e6a53 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 7 Aug 2013 18:13:20 +0200
Subject: netfilter: nfnetlink_queue: allow to attach expectations to
 conntracks

This patch adds the capability to attach expectations via nfnetlink_queue.
This is required by conntrack helpers that trigger expectations based on
the first packet seen like the TFTP and the DHCPv6 user-space helpers.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                      |  2 +
 include/net/netfilter/nfnetlink_queue.h        |  8 +++
 include/uapi/linux/netfilter/nfnetlink_queue.h |  1 +
 net/netfilter/nf_conntrack_netlink.c           | 95 ++++++++++++++++++++++----
 net/netfilter/nfnetlink_queue_core.c           |  9 ++-
 net/netfilter/nfnetlink_queue_ct.c             | 15 ++++
 6 files changed, 114 insertions(+), 16 deletions(-)

(limited to 'include/uapi')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 655d5d198d4..e2cf786be22 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -325,6 +325,8 @@ struct nfq_ct_hook {
 	size_t (*build_size)(const struct nf_conn *ct);
 	int (*build)(struct sk_buff *skb, struct nf_conn *ct);
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
+	int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
+			     u32 portid, u32 report);
 };
 extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 
diff --git a/include/net/netfilter/nfnetlink_queue.h b/include/net/netfilter/nfnetlink_queue.h
index 86267a52951..aff88ba9139 100644
--- a/include/net/netfilter/nfnetlink_queue.h
+++ b/include/net/netfilter/nfnetlink_queue.h
@@ -15,6 +15,8 @@ int nfqnl_ct_put(struct sk_buff *skb, struct nf_conn *ct,
 		 enum ip_conntrack_info ctinfo);
 void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo, int diff);
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			u32 portid, u32 report);
 #else
 inline struct nf_conn *
 nfqnl_ct_get(struct sk_buff *entskb, size_t *size, enum ip_conntrack_info *ctinfo)
@@ -39,5 +41,11 @@ inline void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 				enum ip_conntrack_info ctinfo, int diff)
 {
 }
+
+inline int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			       u32 portid, u32 report)
+{
+	return 0;
+}
 #endif /* NF_CONNTRACK */
 #endif
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index 3a9b9214733..0132bad79de 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -46,6 +46,7 @@ enum nfqnl_attr_type {
 	NFQA_CT_INFO,			/* enum ip_conntrack_info */
 	NFQA_CAP_LEN,			/* __u32 length of captured packet */
 	NFQA_SKB_INFO,			/* __u32 skb meta information */
+	NFQA_EXP,			/* nf_conntrack_netlink.h */
 
 	__NFQA_MAX
 };
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9aaa68bbbcd..fa61fea6323 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1987,6 +1987,27 @@ out:
 	return err == -EAGAIN ? -ENOBUFS : err;
 }
 
+static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
+	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
+	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
+	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NF_CT_HELPER_NAME_LEN - 1 },
+	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
+	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
+	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
+	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED },
+	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING },
+};
+
+static struct nf_conntrack_expect *
+ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct,
+		       struct nf_conntrack_helper *helper,
+		       struct nf_conntrack_tuple *tuple,
+		       struct nf_conntrack_tuple *mask);
+
 #ifdef CONFIG_NETFILTER_NETLINK_QUEUE_CT
 static size_t
 ctnetlink_nfqueue_build_size(const struct nf_conn *ct)
@@ -2127,10 +2148,69 @@ ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
 	return ret;
 }
 
+static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda,
+				       const struct nf_conn *ct,
+				       struct nf_conntrack_tuple *tuple,
+				       struct nf_conntrack_tuple *mask)
+{
+	int err;
+
+	err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE,
+				    nf_ct_l3num(ct));
+	if (err < 0)
+		return err;
+
+	return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK,
+				     nf_ct_l3num(ct));
+}
+
+static int
+ctnetlink_nfqueue_attach_expect(const struct nlattr *attr, struct nf_conn *ct,
+				u32 portid, u32 report)
+{
+	struct nlattr *cda[CTA_EXPECT_MAX+1];
+	struct nf_conntrack_tuple tuple, mask;
+	struct nf_conntrack_helper *helper;
+	struct nf_conntrack_expect *exp;
+	int err;
+
+	err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy);
+	if (err < 0)
+		return err;
+
+	err = ctnetlink_nfqueue_exp_parse((const struct nlattr * const *)cda,
+					  ct, &tuple, &mask);
+	if (err < 0)
+		return err;
+
+	if (cda[CTA_EXPECT_HELP_NAME]) {
+		const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]);
+
+		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
+						    nf_ct_protonum(ct));
+		if (helper == NULL)
+			return -EOPNOTSUPP;
+	}
+
+	exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct,
+				     helper, &tuple, &mask);
+	if (IS_ERR(exp))
+		return PTR_ERR(exp);
+
+	err = nf_ct_expect_related_report(exp, portid, report);
+	if (err < 0) {
+		nf_ct_expect_put(exp);
+		return err;
+	}
+
+	return 0;
+}
+
 static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build_size	= ctnetlink_nfqueue_build_size,
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
+	.attach_expect	= ctnetlink_nfqueue_attach_expect,
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
@@ -2498,21 +2578,6 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
 	return err;
 }
 
-static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
-	[CTA_EXPECT_MASTER]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_TUPLE]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_MASK]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_TIMEOUT]	= { .type = NLA_U32 },
-	[CTA_EXPECT_ID]		= { .type = NLA_U32 },
-	[CTA_EXPECT_HELP_NAME]	= { .type = NLA_NUL_STRING,
-				    .len = NF_CT_HELPER_NAME_LEN - 1 },
-	[CTA_EXPECT_ZONE]	= { .type = NLA_U16 },
-	[CTA_EXPECT_FLAGS]	= { .type = NLA_U32 },
-	[CTA_EXPECT_CLASS]	= { .type = NLA_U32 },
-	[CTA_EXPECT_NAT]	= { .type = NLA_NESTED },
-	[CTA_EXPECT_FN]		= { .type = NLA_NUL_STRING },
-};
-
 static int
 ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
 		     const struct nlmsghdr *nlh,
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index ec9de12aa48..e8c9f3bb779 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -859,6 +859,7 @@ static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
 	[NFQA_MARK]		= { .type = NLA_U32 },
 	[NFQA_PAYLOAD]		= { .type = NLA_UNSPEC },
 	[NFQA_CT]		= { .type = NLA_UNSPEC },
+	[NFQA_EXP]		= { .type = NLA_UNSPEC },
 };
 
 static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
@@ -987,8 +988,14 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
 	if (entry == NULL)
 		return -ENOENT;
 
-	if (nfqa[NFQA_CT])
+	if (nfqa[NFQA_CT]) {
 		ct = nfqnl_ct_parse(entry->skb, nfqa[NFQA_CT], &ctinfo);
+		if (ct && nfqa[NFQA_EXP]) {
+			nfqnl_attach_expect(ct, nfqa[NFQA_EXP],
+					    NETLINK_CB(skb).portid,
+					    nlmsg_report(nlh));
+		}
+	}
 
 	if (nfqa[NFQA_PAYLOAD]) {
 		u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
index ab61d66bc0b..be893039966 100644
--- a/net/netfilter/nfnetlink_queue_ct.c
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -96,3 +96,18 @@ void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 	if ((ct->status & IPS_NAT_MASK) && diff)
 		nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff);
 }
+
+int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
+			u32 portid, u32 report)
+{
+	struct nfq_ct_hook *nfq_ct;
+
+	if (nf_ct_is_untracked(ct))
+		return 0;
+
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
+		return -EOPNOTSUPP;
+
+	return nfq_ct->attach_expect(attr, ct, portid, report);
+}
-- 
cgit v1.2.3-70-g09d2


From ebd8b934e23f45ad3fc8a5a28bc5a96741a6a106 Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Sat, 10 Aug 2013 15:22:58 -0700
Subject: pptp: fix byte order warnings

Pptp driver has lots of byte order warnings from sparse.
This was because the on-the-wire header is in network byte order (obviously)
but the definition did not reflect that.

Also, the address structure to user space actually put the call id
in host order. Rather than break ABI compatibility, just acknowledge
the existing design.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ppp/pptp.c        | 10 +++++-----
 include/uapi/linux/if_pppox.h |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 9785a3ac02b..6fa5ae00039 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -83,11 +83,11 @@ static const struct proto_ops pptp_ops;
 struct pptp_gre_header {
 	u8  flags;
 	u8  ver;
-	u16 protocol;
-	u16 payload_len;
-	u16 call_id;
-	u32 seq;
-	u32 ack;
+	__be16 protocol;
+	__be16 payload_len;
+	__be16 call_id;
+	__be32 seq;
+	__be32 ack;
 } __packed;
 
 static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr)
diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h
index e36a4aecd31..e128769331b 100644
--- a/include/uapi/linux/if_pppox.h
+++ b/include/uapi/linux/if_pppox.h
@@ -46,7 +46,7 @@ struct pppoe_addr {
  * PPTP addressing definition
  */
 struct pptp_addr {
-	__be16		call_id;
+	__u16		call_id;
 	struct in_addr	sin_addr;
 };
 
-- 
cgit v1.2.3-70-g09d2


From 1972b5b3a66f1b6a9df04cc91faf401354919262 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Tue, 25 Jun 2013 12:42:54 +0200
Subject: NFC: Document secure element addition/removal netlink events

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 8137dd8d2ad..40ada984cb7 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -71,6 +71,11 @@
  * @NFC_CMD_DISABLE_SE: Disable the physical link to a specific secure element.
  * @NFC_CMD_FW_DOWNLOAD: Request to Load/flash firmware, or event to inform
  *	that some firmware was loaded
+ * @NFC_EVENT_SE_ADDED: Event emitted when a new secure element is discovered.
+ *	This typically will be sent whenever a new NFC controller with either
+ *	an embedded SE or an UICC one connected to it through SWP.
+ * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from
+ *	the system, as a consequence of e.g. an NFC controller being unplugged.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
-- 
cgit v1.2.3-70-g09d2


From 91a32269e31282405db405b9ec9ce1a30568e4b0 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Tue, 25 Jun 2013 16:22:08 +0200
Subject: NFC: Define secure element connectivity and transaction events

The SE_CONNECTIVITY event is for an SE to request connection to e.g. a
modem. The SE_TRANSACTION one is sent when an application running on a
specific SE wants to notify the host CPU about the end of a transaction.
Those events respectively map to the EVT_CONNECTIVITY and the
EVT_TRANSACTION HCI events.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 40ada984cb7..539d60494c0 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -76,6 +76,14 @@
  *	an embedded SE or an UICC one connected to it through SWP.
  * @NFC_EVENT_SE_REMOVED: Event emitted when a secure element is removed from
  *	the system, as a consequence of e.g. an NFC controller being unplugged.
+ * @NFC_EVENT_SE_CONNECTIVITY: This event is emitted whenever a secure element
+ *	is requesting connectivity access. For example a UICC SE may need to
+ *	talk with a sleeping modem and will notify this need by sending this
+ *	event. It is then up to userspace to decide if it will wake the modem
+ *	up or not.
+ * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on
+ *	a specific SE notifies us about the end of a transaction. The parameter
+ *	for this event is the application ID (AID).
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -102,6 +110,8 @@ enum nfc_commands {
 	NFC_CMD_FW_DOWNLOAD,
 	NFC_EVENT_SE_ADDED,
 	NFC_EVENT_SE_REMOVED,
+	NFC_EVENT_SE_CONNECTIVITY,
+	NFC_EVENT_SE_TRANSACTION,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
@@ -159,6 +169,7 @@ enum nfc_attrs {
 	NFC_ATTR_FIRMWARE_NAME,
 	NFC_ATTR_SE_INDEX,
 	NFC_ATTR_SE_TYPE,
+	NFC_ATTR_SE_AID,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
-- 
cgit v1.2.3-70-g09d2


From ac22ac466a659f1b2e02a2e2ee23fc5c42da2c95 Mon Sep 17 00:00:00 2001
From: Samuel Ortiz <sameo@linux.intel.com>
Date: Wed, 24 Jul 2013 18:10:50 +0200
Subject: NFC: Add a GET_SE netlink API

In order to fetch the discovered secure elements from an NFC controller,
we need to send a netlink command that will dump the list of available
SEs from NFC.

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/uapi/linux/nfc.h |  2 ++
 net/nfc/netlink.c        | 91 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 539d60494c0..029921b067f 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -84,6 +84,7 @@
  * @NFC_EVENT_SE_TRANSACTION: This event is sent when an application running on
  *	a specific SE notifies us about the end of a transaction. The parameter
  *	for this event is the application ID (AID).
+ * @NFC_CMD_GET_SE: Dump all discovered secure elements from an NFC controller.
  */
 enum nfc_commands {
 	NFC_CMD_UNSPEC,
@@ -112,6 +113,7 @@ enum nfc_commands {
 	NFC_EVENT_SE_REMOVED,
 	NFC_EVENT_SE_CONNECTIVITY,
 	NFC_EVENT_SE_TRANSACTION,
+	NFC_CMD_GET_SE,
 /* private: internal use only */
 	__NFC_CMD_AFTER_LAST
 };
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f16fd59d416..3b08ef90e04 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1191,6 +1191,91 @@ static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info)
 	return rc;
 }
 
+static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev,
+				u32 portid, u32 seq,
+				struct netlink_callback *cb,
+				int flags)
+{
+	void *hdr;
+	struct nfc_se *se, *n;
+
+	list_for_each_entry_safe(se, n, &dev->secure_elements, list) {
+		hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags,
+				  NFC_CMD_GET_SE);
+		if (!hdr)
+			goto nla_put_failure;
+
+		if (cb)
+			genl_dump_check_consistent(cb, hdr, &nfc_genl_family);
+
+		if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+		    nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) ||
+		    nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type))
+			goto nla_put_failure;
+
+		if (genlmsg_end(msg, hdr) < 0)
+			goto nla_put_failure;
+	}
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static int nfc_genl_dump_ses(struct sk_buff *skb,
+				 struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+	struct nfc_dev *dev = (struct nfc_dev *) cb->args[1];
+	bool first_call = false;
+
+	if (!iter) {
+		first_call = true;
+		iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL);
+		if (!iter)
+			return -ENOMEM;
+		cb->args[0] = (long) iter;
+	}
+
+	mutex_lock(&nfc_devlist_mutex);
+
+	cb->seq = nfc_devlist_generation;
+
+	if (first_call) {
+		nfc_device_iter_init(iter);
+		dev = nfc_device_iter_next(iter);
+	}
+
+	while (dev) {
+		int rc;
+
+		rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid,
+					  cb->nlh->nlmsg_seq, cb, NLM_F_MULTI);
+		if (rc < 0)
+			break;
+
+		dev = nfc_device_iter_next(iter);
+	}
+
+	mutex_unlock(&nfc_devlist_mutex);
+
+	cb->args[1] = (long) dev;
+
+	return skb->len;
+}
+
+static int nfc_genl_dump_ses_done(struct netlink_callback *cb)
+{
+	struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0];
+
+	nfc_device_iter_exit(iter);
+	kfree(iter);
+
+	return 0;
+}
+
 static struct genl_ops nfc_genl_ops[] = {
 	{
 		.cmd = NFC_CMD_GET_DEVICE,
@@ -1265,6 +1350,12 @@ static struct genl_ops nfc_genl_ops[] = {
 		.doit = nfc_genl_disable_se,
 		.policy = nfc_genl_policy,
 	},
+	{
+		.cmd = NFC_CMD_GET_SE,
+		.dumpit = nfc_genl_dump_ses,
+		.done = nfc_genl_dump_ses_done,
+		.policy = nfc_genl_policy,
+	},
 };
 
 
-- 
cgit v1.2.3-70-g09d2


From 352a5f5fb3ad8f829cfd4248fe6119895bda881f Mon Sep 17 00:00:00 2001
From: Eric Lapuyade <eric.lapuyade@linux.intel.com>
Date: Fri, 19 Jul 2013 14:57:55 +0200
Subject: NFC: netlink: Add result of firmware operation to completion event

Result is added as an NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS attribute
containing the standard errno positive value of the completion result.
This event will be sent when the firmare download operation is done and
will contain the operation result.

Signed-off-by: Eric Lapuyade <eric.lapuyade@intel.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 include/net/nfc/nfc.h    |  3 ++-
 include/uapi/linux/nfc.h |  2 ++
 net/nfc/core.c           | 12 ++++++++++--
 net/nfc/netlink.c        |  4 +++-
 net/nfc/nfc.h            |  3 ++-
 5 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 10059556058..f68ee68e4e3 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -224,7 +224,8 @@ int nfc_set_remote_general_bytes(struct nfc_dev *dev,
 				 u8 *gt, u8 gt_len);
 u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len);
 
-int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			 u32 result);
 
 int nfc_targets_found(struct nfc_dev *dev,
 		      struct nfc_target *targets, int ntargets);
diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h
index 029921b067f..29bed72a4ac 100644
--- a/include/uapi/linux/nfc.h
+++ b/include/uapi/linux/nfc.h
@@ -146,6 +146,7 @@ enum nfc_commands {
  * @NFC_ATTR_FIRMWARE_NAME: Free format firmware version
  * @NFC_ATTR_SE_INDEX: Secure element index
  * @NFC_ATTR_SE_TYPE: Secure element type (UICC or EMBEDDED)
+ * @NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS: Firmware download operation status
  */
 enum nfc_attrs {
 	NFC_ATTR_UNSPEC,
@@ -172,6 +173,7 @@ enum nfc_attrs {
 	NFC_ATTR_SE_INDEX,
 	NFC_ATTR_SE_TYPE,
 	NFC_ATTR_SE_AID,
+	NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS,
 /* private: internal use only */
 	__NFC_ATTR_AFTER_LAST
 };
diff --git a/net/nfc/core.c b/net/nfc/core.c
index aad7f8f5978..d252912b8de 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -77,11 +77,19 @@ error:
 	return rc;
 }
 
-int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
+/**
+ * nfc_fw_download_done - inform that a firmware download was completed
+ *
+ * @dev: The nfc device to which firmware was downloaded
+ * @firmware_name: The firmware filename
+ * @result: The positive value of a standard errno value
+ */
+int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			 u32 result)
 {
 	dev->fw_download_in_progress = false;
 
-	return nfc_genl_fw_download_done(dev, firmware_name);
+	return nfc_genl_fw_download_done(dev, firmware_name, result);
 }
 EXPORT_SYMBOL(nfc_fw_download_done);
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 3b08ef90e04..68063b2025d 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1114,7 +1114,8 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info)
 	return rc;
 }
 
-int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			      u32 result)
 {
 	struct sk_buff *msg;
 	void *hdr;
@@ -1129,6 +1130,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name)
 		goto free_msg;
 
 	if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) ||
+	    nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) ||
 	    nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx))
 		goto nla_put_failure;
 
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index 4e2e5a787c4..aaf606fc1fa 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -124,7 +124,8 @@ static inline void nfc_device_iter_exit(struct class_dev_iter *iter)
 }
 
 int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name);
-int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name);
+int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
+			      u32 result);
 
 int nfc_dev_up(struct nfc_dev *dev);
 
-- 
cgit v1.2.3-70-g09d2


From fc4eba58b4c1462ff3d6247b66fb47d6928db6d2 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 14 Aug 2013 01:03:46 +0200
Subject: ipv6: make unsolicited report intervals configurable for mld

Commit cab70040dfd95ee32144f02fade64f0cb94f31a0 ("net: igmp:
Reduce Unsolicited report interval to 1s when using IGMPv3") and
2690048c01f32bf45d1c1e1ab3079bc10ad2aea7 ("net: igmp: Allow user-space
configuration of igmp unsolicited report interval") by William Manley made
igmp unsolicited report intervals configurable per interface and corrected
the interval of unsolicited igmpv3 report messages resendings to 1s.

Same needs to be done for IPv6:

MLDv1 (RFC2710 7.10.): 10 seconds
MLDv2 (RFC3810 9.11.): 1 second

Both intervals are configurable via new procfs knobs
mldv1_unsolicited_report_interval and mldv2_unsolicited_report_interval.

(also added .force_mld_version to ipv6_devconf_dflt to bring structs in
line without semantic changes)

v2:
a) Joined documentation update for IPv4 and IPv6 MLD/IGMP
   unsolicited_report_interval procfs knobs.
b) incorporate stylistic feedback from William Manley

v3:
a) add new DEVCONF_* values to the end of the enum (thanks to David
   Miller)

Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: William Manley <william.manley@youview.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
 include/linux/ipv6.h                   |  2 ++
 include/uapi/linux/ipv6.h              |  2 ++
 net/ipv6/addrconf.c                    | 25 +++++++++++++++++++++++++
 net/ipv6/mcast.c                       | 17 ++++++++++++++---
 5 files changed, 61 insertions(+), 3 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 36be26b2ef7..debfe857d8f 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1039,7 +1039,15 @@ disable_policy - BOOLEAN
 disable_xfrm - BOOLEAN
 	Disable IPSEC encryption on this interface, whatever the policy
 
+igmpv2_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	IGMPv1 or IGMPv2 report retransmit will take place.
+	Default: 10000 (10 seconds)
 
+igmpv3_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	IGMPv3 report retransmit will take place.
+	Default: 1000 (1 seconds)
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
@@ -1331,6 +1339,16 @@ ndisc_notify - BOOLEAN
 	1 - Generate unsolicited neighbour advertisements when device is brought
 	    up or hardware address changes.
 
+mldv1_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	MLDv1 report retransmit will take place.
+	Default: 10000 (10 seconds)
+
+mldv2_unsolicited_report_interval - INTEGER
+	The interval in milliseconds in which the next unsolicited
+	MLDv2 report retransmit will take place.
+	Default: 1000 (1 second)
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 850e95bc766..77a478462d8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -19,6 +19,8 @@ struct ipv6_devconf {
 	__s32		rtr_solicit_interval;
 	__s32		rtr_solicit_delay;
 	__s32		force_mld_version;
+	__s32		mldv1_unsolicited_report_interval;
+	__s32		mldv2_unsolicited_report_interval;
 #ifdef CONFIG_IPV6_PRIVACY
 	__s32		use_tempaddr;
 	__s32		temp_valid_lft;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4bda4cf5b0f..d07ac6903e5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -160,6 +160,8 @@ enum {
 	DEVCONF_ACCEPT_DAD,
 	DEVCONF_FORCE_TLLAO,
 	DEVCONF_NDISC_NOTIFY,
+	DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
+	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7fd8572bac8..ad12f7c43f2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -177,6 +177,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_redirects	= 1,
 	.autoconf		= 1,
 	.force_mld_version	= 0,
+	.mldv1_unsolicited_report_interval = 10 * HZ,
+	.mldv2_unsolicited_report_interval = HZ,
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
@@ -211,6 +213,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_ra		= 1,
 	.accept_redirects	= 1,
 	.autoconf		= 1,
+	.force_mld_version	= 0,
+	.mldv1_unsolicited_report_interval = 10 * HZ,
+	.mldv2_unsolicited_report_interval = HZ,
 	.dad_transmits		= 1,
 	.rtr_solicits		= MAX_RTR_SOLICITATIONS,
 	.rtr_solicit_interval	= RTR_SOLICITATION_INTERVAL,
@@ -4179,6 +4184,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_RTR_SOLICIT_DELAY] =
 		jiffies_to_msecs(cnf->rtr_solicit_delay);
 	array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
+	array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
+		jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval);
+	array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
+		jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval);
 #ifdef CONFIG_IPV6_PRIVACY
 	array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
 	array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
@@ -4862,6 +4871,22 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "mldv1_unsolicited_report_interval",
+			.data		=
+				&ipv6_devconf.mldv1_unsolicited_report_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
+		{
+			.procname	= "mldv2_unsolicited_report_interval",
+			.data		=
+				&ipv6_devconf.mldv2_unsolicited_report_interval,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec_ms_jiffies,
+		},
 #ifdef CONFIG_IPV6_PRIVACY
 		{
 			.procname	= "use_tempaddr",
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index db25b8eb62b..6c76df9909b 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -108,7 +108,6 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 			    struct inet6_dev *idev);
 
 
-#define IGMP6_UNSOLICITED_IVAL	(10*HZ)
 #define MLD_QRV_DEFAULT		2
 
 #define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \
@@ -129,6 +128,18 @@ int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
 	     pmc != NULL;					\
 	     pmc = rcu_dereference(pmc->next))
 
+static int unsolicited_report_interval(struct inet6_dev *idev)
+{
+	int iv;
+
+	if (MLD_V1_SEEN(idev))
+		iv = idev->cnf.mldv1_unsolicited_report_interval;
+	else
+		iv = idev->cnf.mldv2_unsolicited_report_interval;
+
+	return iv > 0 ? iv : 1;
+}
+
 int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 {
 	struct net_device *dev = NULL;
@@ -2158,7 +2169,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma)
 
 	igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
 
-	delay = net_random() % IGMP6_UNSOLICITED_IVAL;
+	delay = net_random() % unsolicited_report_interval(ma->idev);
 
 	spin_lock_bh(&ma->mca_lock);
 	if (del_timer(&ma->mca_timer)) {
@@ -2325,7 +2336,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
 	setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
 		    (unsigned long)idev);
 	idev->mc_qrv = MLD_QRV_DEFAULT;
-	idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL;
+	idev->mc_maxdelay = unsolicited_report_interval(idev);
 	idev->mc_v1_seen = 0;
 	write_unlock_bh(&idev->lock);
 }
-- 
cgit v1.2.3-70-g09d2


From f0c03956ac40fdc4fbce7bed262ae74ac372e765 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 15:05:38 +0200
Subject: netfilter: export xt_rpfilter.h to userland

This file contains the API for the match "rpfilter", hence it should be exported
to userland.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/xt_rpfilter.h      | 23 -----------------------
 include/uapi/linux/netfilter/Kbuild        |  1 +
 include/uapi/linux/netfilter/xt_rpfilter.h | 23 +++++++++++++++++++++++
 3 files changed, 24 insertions(+), 23 deletions(-)
 delete mode 100644 include/linux/netfilter/xt_rpfilter.h
 create mode 100644 include/uapi/linux/netfilter/xt_rpfilter.h

(limited to 'include/uapi')

diff --git a/include/linux/netfilter/xt_rpfilter.h b/include/linux/netfilter/xt_rpfilter.h
deleted file mode 100644
index 8358d4f7195..00000000000
--- a/include/linux/netfilter/xt_rpfilter.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _XT_RPATH_H
-#define _XT_RPATH_H
-
-#include <linux/types.h>
-
-enum {
-	XT_RPFILTER_LOOSE = 1 << 0,
-	XT_RPFILTER_VALID_MARK = 1 << 1,
-	XT_RPFILTER_ACCEPT_LOCAL = 1 << 2,
-	XT_RPFILTER_INVERT = 1 << 3,
-#ifdef __KERNEL__
-	XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE |
-				  XT_RPFILTER_VALID_MARK |
-				  XT_RPFILTER_ACCEPT_LOCAL |
-				  XT_RPFILTER_INVERT,
-#endif
-};
-
-struct xt_rpfilter_info {
-	__u8 flags;
-};
-
-#endif
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 41115776d76..dc00927ffd6 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -68,6 +68,7 @@ header-y += xt_quota.h
 header-y += xt_rateest.h
 header-y += xt_realm.h
 header-y += xt_recent.h
+header-y += xt_rpfilter.h
 header-y += xt_sctp.h
 header-y += xt_set.h
 header-y += xt_socket.h
diff --git a/include/uapi/linux/netfilter/xt_rpfilter.h b/include/uapi/linux/netfilter/xt_rpfilter.h
new file mode 100644
index 00000000000..8358d4f7195
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_rpfilter.h
@@ -0,0 +1,23 @@
+#ifndef _XT_RPATH_H
+#define _XT_RPATH_H
+
+#include <linux/types.h>
+
+enum {
+	XT_RPFILTER_LOOSE = 1 << 0,
+	XT_RPFILTER_VALID_MARK = 1 << 1,
+	XT_RPFILTER_ACCEPT_LOCAL = 1 << 2,
+	XT_RPFILTER_INVERT = 1 << 3,
+#ifdef __KERNEL__
+	XT_RPFILTER_OPTION_MASK = XT_RPFILTER_LOOSE |
+				  XT_RPFILTER_VALID_MARK |
+				  XT_RPFILTER_ACCEPT_LOCAL |
+				  XT_RPFILTER_INVERT,
+#endif
+};
+
+struct xt_rpfilter_info {
+	__u8 flags;
+};
+
+#endif
-- 
cgit v1.2.3-70-g09d2


From 38c67328ac79cb9eaf61b5d4750fe3b9cff0dd15 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 13 Aug 2013 15:05:39 +0200
Subject: netfilter: export xt_HMARK.h to userland

This file contains the API for the target "HMARK", hence it should be exported
to userland.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/xt_HMARK.h      | 50 ---------------------------------
 include/uapi/linux/netfilter/Kbuild     |  1 +
 include/uapi/linux/netfilter/xt_HMARK.h | 50 +++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 50 deletions(-)
 delete mode 100644 include/linux/netfilter/xt_HMARK.h
 create mode 100644 include/uapi/linux/netfilter/xt_HMARK.h

(limited to 'include/uapi')

diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h
deleted file mode 100644
index 826fc580757..00000000000
--- a/include/linux/netfilter/xt_HMARK.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef XT_HMARK_H_
-#define XT_HMARK_H_
-
-#include <linux/types.h>
-
-enum {
-	XT_HMARK_SADDR_MASK,
-	XT_HMARK_DADDR_MASK,
-	XT_HMARK_SPI,
-	XT_HMARK_SPI_MASK,
-	XT_HMARK_SPORT,
-	XT_HMARK_DPORT,
-	XT_HMARK_SPORT_MASK,
-	XT_HMARK_DPORT_MASK,
-	XT_HMARK_PROTO_MASK,
-	XT_HMARK_RND,
-	XT_HMARK_MODULUS,
-	XT_HMARK_OFFSET,
-	XT_HMARK_CT,
-	XT_HMARK_METHOD_L3,
-	XT_HMARK_METHOD_L3_4,
-};
-#define XT_HMARK_FLAG(flag)	(1 << flag)
-
-union hmark_ports {
-	struct {
-		__u16	src;
-		__u16	dst;
-	} p16;
-	struct {
-		__be16	src;
-		__be16	dst;
-	} b16;
-	__u32	v32;
-	__be32	b32;
-};
-
-struct xt_hmark_info {
-	union nf_inet_addr	src_mask;
-	union nf_inet_addr	dst_mask;
-	union hmark_ports	port_mask;
-	union hmark_ports	port_set;
-	__u32			flags;
-	__u16			proto_mask;
-	__u32			hashrnd;
-	__u32			hmodulus;
-	__u32			hoffset;	/* Mark offset to start from */
-};
-
-#endif /* XT_HMARK_H_ */
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index dc00927ffd6..174915420d3 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -22,6 +22,7 @@ header-y += xt_CONNMARK.h
 header-y += xt_CONNSECMARK.h
 header-y += xt_CT.h
 header-y += xt_DSCP.h
+header-y += xt_HMARK.h
 header-y += xt_IDLETIMER.h
 header-y += xt_LED.h
 header-y += xt_LOG.h
diff --git a/include/uapi/linux/netfilter/xt_HMARK.h b/include/uapi/linux/netfilter/xt_HMARK.h
new file mode 100644
index 00000000000..826fc580757
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_HMARK.h
@@ -0,0 +1,50 @@
+#ifndef XT_HMARK_H_
+#define XT_HMARK_H_
+
+#include <linux/types.h>
+
+enum {
+	XT_HMARK_SADDR_MASK,
+	XT_HMARK_DADDR_MASK,
+	XT_HMARK_SPI,
+	XT_HMARK_SPI_MASK,
+	XT_HMARK_SPORT,
+	XT_HMARK_DPORT,
+	XT_HMARK_SPORT_MASK,
+	XT_HMARK_DPORT_MASK,
+	XT_HMARK_PROTO_MASK,
+	XT_HMARK_RND,
+	XT_HMARK_MODULUS,
+	XT_HMARK_OFFSET,
+	XT_HMARK_CT,
+	XT_HMARK_METHOD_L3,
+	XT_HMARK_METHOD_L3_4,
+};
+#define XT_HMARK_FLAG(flag)	(1 << flag)
+
+union hmark_ports {
+	struct {
+		__u16	src;
+		__u16	dst;
+	} p16;
+	struct {
+		__be16	src;
+		__be16	dst;
+	} b16;
+	__u32	v32;
+	__be32	b32;
+};
+
+struct xt_hmark_info {
+	union nf_inet_addr	src_mask;
+	union nf_inet_addr	dst_mask;
+	union hmark_ports	port_mask;
+	union hmark_ports	port_set;
+	__u32			flags;
+	__u16			proto_mask;
+	__u32			hashrnd;
+	__u32			hmodulus;
+	__u32			hoffset;	/* Mark offset to start from */
+};
+
+#endif /* XT_HMARK_H_ */
-- 
cgit v1.2.3-70-g09d2


From 58264848a5a7b91195f43c4729072e8cc980288d Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Mon, 19 Aug 2013 11:23:34 -0700
Subject: openvswitch: Add vxlan tunneling support.

Following patch adds vxlan vport type for openvswitch using
vxlan api. So now there is vxlan dependency for openvswitch.

CC: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  11 +++
 net/openvswitch/Kconfig          |  13 +++
 net/openvswitch/Makefile         |   4 +
 net/openvswitch/vport-vxlan.c    | 204 +++++++++++++++++++++++++++++++++++++++
 net/openvswitch/vport.c          |   3 +
 net/openvswitch/vport.h          |   1 +
 6 files changed, 236 insertions(+)
 create mode 100644 net/openvswitch/vport-vxlan.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index c55efaaa9bb..52490b0e62b 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -165,6 +165,7 @@ enum ovs_vport_type {
 	OVS_VPORT_TYPE_NETDEV,   /* network device */
 	OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
 	OVS_VPORT_TYPE_GRE,      /* GRE tunnel. */
+	OVS_VPORT_TYPE_VXLAN,	 /* VXLAN tunnel. */
 	__OVS_VPORT_TYPE_MAX
 };
 
@@ -211,6 +212,16 @@ enum ovs_vport_attr {
 
 #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1)
 
+/* OVS_VPORT_ATTR_OPTIONS attributes for tunnels.
+ */
+enum {
+	OVS_TUNNEL_ATTR_UNSPEC,
+	OVS_TUNNEL_ATTR_DST_PORT, /* 16-bit UDP port, used by L4 tunnels. */
+	__OVS_TUNNEL_ATTR_MAX
+};
+
+#define OVS_TUNNEL_ATTR_MAX (__OVS_TUNNEL_ATTR_MAX - 1)
+
 /* Flows. */
 
 #define OVS_FLOW_FAMILY  "ovs_flow"
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 27ee56b688a..bed30e69baa 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -40,3 +40,16 @@ config OPENVSWITCH_GRE
 	  Say N to exclude this support and reduce the binary size.
 
 	  If unsure, say Y.
+
+config OPENVSWITCH_VXLAN
+	bool "Open vSwitch VXLAN tunneling support"
+	depends on INET
+	depends on OPENVSWITCH
+	depends on VXLAN && !(OPENVSWITCH=y && VXLAN=m)
+	default y
+	---help---
+	  If you say Y here, then the Open vSwitch will be able create vxlan vport.
+
+	  Say N to exclude this support and reduce the binary size.
+
+	  If unsure, say Y.
diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile
index 01bddb2991e..82e4ee54a44 100644
--- a/net/openvswitch/Makefile
+++ b/net/openvswitch/Makefile
@@ -13,3 +13,7 @@ openvswitch-y := \
 	vport-gre.o \
 	vport-internal_dev.o \
 	vport-netdev.o
+
+ifneq ($(CONFIG_OPENVSWITCH_VXLAN),)
+openvswitch-y += vport-vxlan.o
+endif
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
new file mode 100644
index 00000000000..36848bd54a7
--- /dev/null
+++ b/net/openvswitch/vport-vxlan.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ * Copyright (c) 2013 Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/udp.h>
+#include <net/ip_tunnels.h>
+#include <net/udp.h>
+#include <net/rtnetlink.h>
+#include <net/route.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/vxlan.h>
+
+#include "datapath.h"
+#include "vport.h"
+
+/**
+ * struct vxlan_port - Keeps track of open UDP ports
+ * @vs: vxlan_sock created for the port.
+ * @name: vport name.
+ */
+struct vxlan_port {
+	struct vxlan_sock *vs;
+	char name[IFNAMSIZ];
+};
+
+static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
+{
+	return vport_priv(vport);
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+{
+	struct ovs_key_ipv4_tunnel tun_key;
+	struct vport *vport = vs->data;
+	struct iphdr *iph;
+	__be64 key;
+
+	/* Save outer tunnel values */
+	iph = ip_hdr(skb);
+	key = cpu_to_be64(ntohl(vx_vni) >> 8);
+	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
+
+	ovs_vport_receive(vport, skb, &tun_key);
+}
+
+static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+
+	if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static void vxlan_tnl_destroy(struct vport *vport)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+
+	vxlan_sock_release(vxlan_port->vs);
+
+	ovs_vport_deferred_free(vport);
+}
+
+static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
+{
+	struct net *net = ovs_dp_get_net(parms->dp);
+	struct nlattr *options = parms->options;
+	struct vxlan_port *vxlan_port;
+	struct vxlan_sock *vs;
+	struct vport *vport;
+	struct nlattr *a;
+	u16 dst_port;
+	int err;
+
+	if (!options) {
+		err = -EINVAL;
+		goto error;
+	}
+	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
+	if (a && nla_len(a) == sizeof(u16)) {
+		dst_port = nla_get_u16(a);
+	} else {
+		/* Require destination port from userspace. */
+		err = -EINVAL;
+		goto error;
+	}
+
+	vport = ovs_vport_alloc(sizeof(struct vxlan_port),
+				&ovs_vxlan_vport_ops, parms);
+	if (IS_ERR(vport))
+		return vport;
+
+	vxlan_port = vxlan_vport(vport);
+	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
+
+	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true);
+	if (IS_ERR(vs)) {
+		ovs_vport_free(vport);
+		return (void *)vs;
+	}
+	vxlan_port->vs = vs;
+
+	return vport;
+
+error:
+	return ERR_PTR(err);
+}
+
+static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
+{
+	struct net *net = ovs_dp_get_net(vport->dp);
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	__be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport;
+	struct rtable *rt;
+	struct flowi4 fl;
+	__be16 src_port;
+	int port_min;
+	int port_max;
+	__be16 df;
+	int err;
+
+	if (unlikely(!OVS_CB(skb)->tun_key)) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	/* Route lookup */
+	memset(&fl, 0, sizeof(fl));
+	fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
+	fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
+	fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
+	fl.flowi4_mark = skb->mark;
+	fl.flowi4_proto = IPPROTO_UDP;
+
+	rt = ip_route_output_key(net, &fl);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		goto error;
+	}
+
+	df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
+		htons(IP_DF) : 0;
+
+	skb->local_df = 1;
+
+	inet_get_local_port_range(&port_min, &port_max);
+	src_port = vxlan_src_port(port_min, port_max, skb);
+
+	err = vxlan_xmit_skb(net, vxlan_port->vs, rt, skb,
+			     fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst,
+			     OVS_CB(skb)->tun_key->ipv4_tos,
+			     OVS_CB(skb)->tun_key->ipv4_ttl, df,
+			     src_port, dst_port,
+			     htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
+	if (err < 0)
+		ip_rt_put(rt);
+error:
+	return err;
+}
+
+static const char *vxlan_get_name(const struct vport *vport)
+{
+	struct vxlan_port *vxlan_port = vxlan_vport(vport);
+	return vxlan_port->name;
+}
+
+const struct vport_ops ovs_vxlan_vport_ops = {
+	.type		= OVS_VPORT_TYPE_VXLAN,
+	.create		= vxlan_tnl_create,
+	.destroy	= vxlan_tnl_destroy,
+	.get_name	= vxlan_get_name,
+	.get_options	= vxlan_get_options,
+	.send		= vxlan_tnl_send,
+};
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index d4c7fa04ce0..d69e0c06dfd 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -42,6 +42,9 @@ static const struct vport_ops *vport_ops_list[] = {
 #ifdef CONFIG_OPENVSWITCH_GRE
 	&ovs_gre_vport_ops,
 #endif
+#ifdef CONFIG_OPENVSWITCH_VXLAN
+	&ovs_vxlan_vport_ops,
+#endif
 };
 
 /* Protected by RCU read lock for reading, ovs_mutex for writing. */
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index 376045c42f8..1a9fbcec6e1 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -199,6 +199,7 @@ void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
 extern const struct vport_ops ovs_netdev_vport_ops;
 extern const struct vport_ops ovs_internal_vport_ops;
 extern const struct vport_ops ovs_gre_vport_ops;
+extern const struct vport_ops ovs_vxlan_vport_ops;
 
 static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
-- 
cgit v1.2.3-70-g09d2


From fb7589a162162223e6bb6422dde3fb1ce07d9a78 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 21 Aug 2013 14:31:38 +0400
Subject: tun: Add ability to create tun device with given index

Tun devices cannot be created with ifidex user wants, but it's
required by checkpoint-restore project.

Long time ago such ability was implemented for rtnl_ops-based
interface for creating links (9c7dafbf net: Allow to create links
with given ifindex), but the only API for creating and managing
tuntap devices is ioctl-based and is evolving with adding new ones
(cde8b15f tuntap: add ioctl to attach or detach a file form tuntap
device).

Following that trend, here's how a new ioctl that sets the ifindex
for device, that _will_ be created by TUNSETIFF ioctl looks like.
So those who want a tuntap device with the ifindex N, should open
the tun device, call ioctl(fd, TUNSETIFINDEX, &N), then call TUNSETIFF.
If the index N is busy, then the register_netdev will find this out
and the ioctl would be failed with -EBUSY.

If setifindex is not called, then it will be generated as before.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 21 ++++++++++++++++++++-
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 7ed13cc0dcb..4b65fbcc490 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -138,7 +138,10 @@ struct tun_file {
 	struct fasync_struct *fasync;
 	/* only used for fasnyc */
 	unsigned int flags;
-	u16 queue_index;
+	union {
+		u16 queue_index;
+		unsigned int ifindex;
+	};
 	struct list_head next;
 	struct tun_struct *detached;
 };
@@ -1601,6 +1604,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		dev_net_set(dev, net);
 		dev->rtnl_link_ops = &tun_link_ops;
+		dev->ifindex = tfile->ifindex;
 
 		tun = netdev_priv(dev);
 		tun->dev = dev;
@@ -1817,6 +1821,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	kgid_t group;
 	int sndbuf;
 	int vnet_hdr_sz;
+	unsigned int ifindex;
 	int ret;
 
 	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
@@ -1851,6 +1856,19 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			ret = -EFAULT;
 		goto unlock;
 	}
+	if (cmd == TUNSETIFINDEX) {
+		ret = -EPERM;
+		if (tun)
+			goto unlock;
+
+		ret = -EFAULT;
+		if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
+			goto unlock;
+
+		ret = 0;
+		tfile->ifindex = ifindex;
+		goto unlock;
+	}
 
 	ret = -EBADFD;
 	if (!tun)
@@ -2099,6 +2117,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	rcu_assign_pointer(tfile->tun, NULL);
 	tfile->net = get_net(current->nsproxy->net_ns);
 	tfile->flags = 0;
+	tfile->ifindex = 0;
 
 	rcu_assign_pointer(tfile->socket.wq, &tfile->wq);
 	init_waitqueue_head(&tfile->wq.wait);
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 1870ee29bb3..c58d023c482 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -56,6 +56,7 @@
 #define TUNGETVNETHDRSZ _IOR('T', 215, int)
 #define TUNSETVNETHDRSZ _IOW('T', 216, int)
 #define TUNSETQUEUE  _IOW('T', 217, int)
+#define TUNSETIFINDEX	_IOW('T', 218, unsigned int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3-70-g09d2


From 849c9b6f93cc4cb5eb59301b6380a7a81b43f414 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 21 Aug 2013 14:32:21 +0400
Subject: tun: Allow to skip filter on attach

There's a small problem with sk-filters on tun devices. Consider
an application doing this sequence of steps:

fd = open("/dev/net/tun");
ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" });
ioctl(fd, TUNATTACHFILTER, &my_filter);
ioctl(fd, TUNSETPERSIST, 1);
close(fd);

At that point the tun0 will remain in the system and will keep in
mind that there should be a socket filter at address '&my_filter'.

If after that we do

fd = open("/dev/net/tun");
ioctl(fd, TUNSETIFF, { .ifr_name = "tun0" });

we most likely receive the -EFAULT error, since tun_attach() would
try to connect the filter back. But (!) if we provide a filter at
address &my_filter, then tun0 will be created and the "new" filter
would be attached, but application may not know about that.

This may create certain problems to anyone using tun-s, but it's
critical problem for c/r -- if we meet a persistent tun device
with a filter in mind, we will not be able to attach to it to dump
its state (flags, owner, address, vnethdr size, etc.).

The proposal is to allow to attach to tun device (with TUNSETIFF)
w/o attaching the filter to the tun-file's socket. After this
attach app may e.g clean the device by dropping the filter, it
doesn't want to have one, or (in case of c/r) get information
about the device with tun ioctls.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 12 +++++++-----
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db43a240973..6acbdbccebc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -501,7 +501,7 @@ static void tun_detach_all(struct net_device *dev)
 		module_put(THIS_MODULE);
 }
 
-static int tun_attach(struct tun_struct *tun, struct file *file)
+static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 {
 	struct tun_file *tfile = file->private_data;
 	int err;
@@ -526,7 +526,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
 	err = 0;
 
 	/* Re-attach the filter to presist device */
-	if (tun->filter_attached == true) {
+	if (!skip_filter && (tun->filter_attached == true)) {
 		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
 		if (!err)
 			goto out;
@@ -1557,7 +1557,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		if (err < 0)
 			return err;
 
-		err = tun_attach(tun, file);
+		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER);
 		if (err < 0)
 			return err;
 
@@ -1631,7 +1631,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		dev->vlan_features = dev->features;
 
 		INIT_LIST_HEAD(&tun->disabled);
-		err = tun_attach(tun, file);
+		err = tun_attach(tun, file, false);
 		if (err < 0)
 			goto err_free_dev;
 
@@ -1795,7 +1795,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 		ret = security_tun_dev_attach_queue(tun->security);
 		if (ret < 0)
 			goto unlock;
-		ret = tun_attach(tun, file);
+		ret = tun_attach(tun, file, false);
 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
 		tun = rtnl_dereference(tfile->tun);
 		if (!tun || !(tun->flags & TUN_TAP_MQ) || tfile->detached)
@@ -1883,6 +1883,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 		if (tfile->detached)
 			ifr.ifr_flags |= IFF_DETACH_QUEUE;
+		if (!tfile->socket.sk->sk_filter)
+			ifr.ifr_flags |= IFF_NOFILTER;
 
 		if (copy_to_user(argp, &ifr, ifreq_len))
 			ret = -EFAULT;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index c58d023c482..cc127b2b4c3 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -71,6 +71,7 @@
 #define IFF_DETACH_QUEUE 0x0400
 /* read-only flag */
 #define IFF_PERSIST	0x0800
+#define IFF_NOFILTER	0x1000
 
 /* Socket options */
 #define TUN_TX_TIMESTAMP 1
-- 
cgit v1.2.3-70-g09d2


From 76975e9cb4a7c6fe39478a3dc4dd292a5c6c8c74 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 21 Aug 2013 14:32:39 +0400
Subject: tun: Get skfilter layout

The only thing we may have from tun device is the fprog, whic contains
the number of filter elements and a pointer to (user-space) memory
where the elements are. The program itself may not be available if the
device is persistent and detached.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c           | 10 ++++++++++
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/uapi')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6acbdbccebc..60a1e93e9d3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2042,6 +2042,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		tun_detach_filter(tun, tun->numqueues);
 		break;
 
+	case TUNGETFILTER:
+		ret = -EINVAL;
+		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
+			break;
+		ret = -EFAULT;
+		if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
+			break;
+		ret = 0;
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index cc127b2b4c3..e9502dd1ee2 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,7 @@
 #define TUNSETVNETHDRSZ _IOW('T', 216, int)
 #define TUNSETQUEUE  _IOW('T', 217, int)
 #define TUNSETIFINDEX	_IOW('T', 218, unsigned int)
+#define TUNGETFILTER _IOR('T', 219, struct sock_fprog)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3-70-g09d2


From 19504cf5f35fbe85db811fce9f4392a0cbdada2f Mon Sep 17 00:00:00 2001
From: Vladimir Kondratiev <qca_vkondrat@qca.qualcomm.com>
Date: Thu, 15 Aug 2013 14:51:28 +0300
Subject: cfg80211: add flags to cfg80211_rx_mgmt()

Add flags intended to report various auxiliary information
and introduce the NL80211_RXMGMT_FLAG_ANSWERED flag to report
that the frame was already answered by the device.

Signed-off-by: Vladimir Kondratiev <qca_vkondrat@qca.qualcomm.com>
[REPLIED->ANSWERED, reword commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath6kl/wmi.c         |  7 +++----
 drivers/net/wireless/ath/wil6210/wmi.c        |  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/p2p.c |  4 ++--
 drivers/net/wireless/mwifiex/util.c           |  4 ++--
 include/net/cfg80211.h                        |  3 ++-
 include/uapi/linux/nl80211.h                  | 16 ++++++++++++++++
 net/mac80211/rx.c                             |  3 +--
 net/wireless/mlme.c                           |  4 ++--
 net/wireless/nl80211.c                        |  6 ++++--
 net/wireless/nl80211.h                        |  2 +-
 10 files changed, 34 insertions(+), 17 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c
index 87aefb4c4c2..546d5da0b89 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -568,8 +568,8 @@ static int ath6kl_wmi_rx_probe_req_event_rx(struct wmi *wmi, u8 *datap, int len,
 		   dlen, freq, vif->probe_req_report);
 
 	if (vif->probe_req_report || vif->nw_type == AP_NETWORK)
-		cfg80211_rx_mgmt(&vif->wdev, freq, 0,
-				 ev->data, dlen, GFP_ATOMIC);
+		cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0,
+				 GFP_ATOMIC);
 
 	return 0;
 }
@@ -608,8 +608,7 @@ static int ath6kl_wmi_rx_action_event_rx(struct wmi *wmi, u8 *datap, int len,
 		return -EINVAL;
 	}
 	ath6kl_dbg(ATH6KL_DBG_WMI, "rx_action: len=%u freq=%u\n", dlen, freq);
-	cfg80211_rx_mgmt(&vif->wdev, freq, 0,
-			 ev->data, dlen, GFP_ATOMIC);
+	cfg80211_rx_mgmt(&vif->wdev, freq, 0, ev->data, dlen, 0, GFP_ATOMIC);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/ath/wil6210/wmi.c b/drivers/net/wireless/ath/wil6210/wmi.c
index dc8059ad4ba..21c791ee817 100644
--- a/drivers/net/wireless/ath/wil6210/wmi.c
+++ b/drivers/net/wireless/ath/wil6210/wmi.c
@@ -339,7 +339,7 @@ static void wmi_evt_rx_mgmt(struct wil6210_priv *wil, int id, void *d, int len)
 		}
 	} else {
 		cfg80211_rx_mgmt(wil->wdev, freq, signal,
-				 (void *)rx_mgmt_frame, d_len, GFP_KERNEL);
+				 (void *)rx_mgmt_frame, d_len, 0, GFP_KERNEL);
 	}
 }
 
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
index 79555f006d5..d7a97453290 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/p2p.c
@@ -1430,7 +1430,7 @@ int brcmf_p2p_notify_action_frame_rx(struct brcmf_if *ifp,
 					      IEEE80211_BAND_5GHZ);
 
 	wdev = &ifp->vif->wdev;
-	cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len,
+	cfg80211_rx_mgmt(wdev, freq, 0, (u8 *)mgmt_frame, mgmt_frame_len, 0,
 			 GFP_ATOMIC);
 
 	kfree(mgmt_frame);
@@ -1895,7 +1895,7 @@ s32 brcmf_p2p_notify_rx_mgmt_p2p_probereq(struct brcmf_if *ifp,
 					      IEEE80211_BAND_2GHZ :
 					      IEEE80211_BAND_5GHZ);
 
-	cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len,
+	cfg80211_rx_mgmt(&vif->wdev, freq, 0, mgmt_frame, mgmt_frame_len, 0,
 			 GFP_ATOMIC);
 
 	brcmf_dbg(INFO, "mgmt_frame_len (%d) , e->datalen (%d), chanspec (%04x), freq (%d)\n",
diff --git a/drivers/net/wireless/mwifiex/util.c b/drivers/net/wireless/mwifiex/util.c
index e57ac0dd3ab..5d9e150f411 100644
--- a/drivers/net/wireless/mwifiex/util.c
+++ b/drivers/net/wireless/mwifiex/util.c
@@ -171,8 +171,8 @@ mwifiex_process_mgmt_packet(struct mwifiex_private *priv,
 	rx_pd->rx_pkt_length = cpu_to_le16(pkt_len);
 
 	cfg80211_rx_mgmt(priv->wdev, priv->roc_cfg.chan.center_freq,
-			 CAL_RSSI(rx_pd->snr, rx_pd->nf),
-			 skb->data, pkt_len, GFP_ATOMIC);
+			 CAL_RSSI(rx_pd->snr, rx_pd->nf), skb->data, pkt_len,
+			 0, GFP_ATOMIC);
 
 	return 0;
 }
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9ab7a0690d9..d530c54a366 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4056,6 +4056,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * @sig_dbm: signal strength in mBm, or 0 if unknown
  * @buf: Management frame (header + body)
  * @len: length of the frame data
+ * @flags: flags, as defined in enum nl80211_rxmgmt_flags
  * @gfp: context flags
  *
  * This function is called whenever an Action frame is received for a station
@@ -4067,7 +4068,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
  * driver is responsible for rejecting the frame.
  */
 bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp);
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp);
 
 /**
  * cfg80211_mgmt_tx_status - notification of TX status for management frame
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1f42bc3dcb9..fde2c021b26 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1493,6 +1493,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_CSA_C_OFF_PRESP: Offset of the channel switch counter
  *	field in the probe response (%NL80211_ATTR_PROBE_RESP).
  *
+ * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32.
+ *	As specified in the &enum nl80211_rxmgmt_flags.
+ *
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
  */
@@ -1801,6 +1804,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_CSA_C_OFF_BEACON,
 	NL80211_ATTR_CSA_C_OFF_PRESP,
 
+	NL80211_ATTR_RXMGMT_FLAGS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -3901,4 +3906,15 @@ enum nl80211_crit_proto_id {
 /* maximum duration for critical protocol measures */
 #define NL80211_CRIT_PROTO_MAX_DURATION		5000 /* msec */
 
+/**
+ * enum nl80211_rxmgmt_flags - flags for received management frame.
+ *
+ * Used by cfg80211_rx_mgmt()
+ *
+ * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver.
+ */
+enum nl80211_rxmgmt_flags {
+	NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0,
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index ffad155316a..07901050812 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2678,8 +2678,7 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
 		sig = status->signal;
 
 	if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
-			     rx->skb->data, rx->skb->len,
-			     GFP_ATOMIC)) {
+			     rx->skb->data, rx->skb->len, 0, GFP_ATOMIC)) {
 		if (rx->sta)
 			rx->sta->rx_packets++;
 		dev_kfree_skb(rx->skb);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index bfac5e186f5..8d49c1ce3de 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -621,7 +621,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 }
 
 bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
-		      const u8 *buf, size_t len, gfp_t gfp)
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp)
 {
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy);
@@ -664,7 +664,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm,
 		/* Indicate the received Action frame to user space */
 		if (nl80211_send_mgmt(rdev, wdev, reg->nlportid,
 				      freq, sig_mbm,
-				      buf, len, gfp))
+				      buf, len, flags, gfp))
 			continue;
 
 		result = true;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 334697de5cc..a51269d2d48 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10446,7 +10446,7 @@ EXPORT_SYMBOL(cfg80211_rx_unexpected_4addr_frame);
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlportid,
 		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp)
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp)
 {
 	struct net_device *netdev = wdev->netdev;
 	struct sk_buff *msg;
@@ -10469,7 +10469,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	    nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) ||
 	    (sig_dbm &&
 	     nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
-	    nla_put(msg, NL80211_ATTR_FRAME, len, buf))
+	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
+	    (flags &&
+	     nla_put_u32(msg, NL80211_ATTR_RXMGMT_FLAGS, flags)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index 44341bf53cf..2c0f2b3c07c 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -66,7 +66,7 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 		      struct wireless_dev *wdev, u32 nlpid,
 		      int freq, int sig_dbm,
-		      const u8 *buf, size_t len, gfp_t gfp);
+		      const u8 *buf, size_t len, u32 flags, gfp_t gfp);
 
 void
 nl80211_radar_notify(struct cfg80211_registered_device *rdev,
-- 
cgit v1.2.3-70-g09d2


From 03f0d916aa0317592dda11bd17c7357858719b6c Mon Sep 17 00:00:00 2001
From: Andy Zhou <azhou@nicira.com>
Date: Wed, 7 Aug 2013 20:01:00 -0700
Subject: openvswitch: Mega flow implementation

Add wildcarded flow support in kernel datapath.

Wildcarded flow can improve OVS flow set up performance by avoid sending
matching new flows to the user space program. The exact performance boost
will largely dependent on wildcarded flow hit rate.

In case all new flows hits wildcard flows, the flow set up rate is
within 5% of that of linux bridge module.

Pravin has made significant contributions to this patch. Including API
clean ups and bug fixes.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 Documentation/networking/openvswitch.txt |   40 +
 include/uapi/linux/openvswitch.h         |    9 +-
 net/openvswitch/actions.c                |    6 +-
 net/openvswitch/datapath.c               |  140 ++-
 net/openvswitch/datapath.h               |    6 +
 net/openvswitch/flow.c                   | 1387 ++++++++++++++++++++----------
 net/openvswitch/flow.h                   |   96 ++-
 7 files changed, 1171 insertions(+), 513 deletions(-)

(limited to 'include/uapi')

diff --git a/Documentation/networking/openvswitch.txt b/Documentation/networking/openvswitch.txt
index 8fa2dd1e792..37c20ee2455 100644
--- a/Documentation/networking/openvswitch.txt
+++ b/Documentation/networking/openvswitch.txt
@@ -91,6 +91,46 @@ Often we ellipsize arguments not important to the discussion, e.g.:
     in_port(1), eth(...), eth_type(0x0800), ipv4(...), tcp(...)
 
 
+Wildcarded flow key format
+--------------------------
+
+A wildcarded flow is described with two sequences of Netlink attributes
+passed over the Netlink socket. A flow key, exactly as described above, and an
+optional corresponding flow mask.
+
+A wildcarded flow can represent a group of exact match flows. Each '1' bit
+in the mask specifies a exact match with the corresponding bit in the flow key.
+A '0' bit specifies a don't care bit, which will match either a '1' or '0' bit
+of a incoming packet. Using wildcarded flow can improve the flow set up rate
+by reduce the number of new flows need to be processed by the user space program.
+
+Support for the mask Netlink attribute is optional for both the kernel and user
+space program. The kernel can ignore the mask attribute, installing an exact
+match flow, or reduce the number of don't care bits in the kernel to less than
+what was specified by the user space program. In this case, variations in bits
+that the kernel does not implement will simply result in additional flow setups.
+The kernel module will also work with user space programs that neither support
+nor supply flow mask attributes.
+
+Since the kernel may ignore or modify wildcard bits, it can be difficult for
+the userspace program to know exactly what matches are installed. There are
+two possible approaches: reactively install flows as they miss the kernel
+flow table (and therefore not attempt to determine wildcard changes at all)
+or use the kernel's response messages to determine the installed wildcards.
+
+When interacting with userspace, the kernel should maintain the match portion
+of the key exactly as originally installed. This will provides a handle to
+identify the flow for all future operations. However, when reporting the
+mask of an installed flow, the mask should include any restrictions imposed
+by the kernel.
+
+The behavior when using overlapping wildcarded flows is undefined. It is the
+responsibility of the user space program to ensure that any incoming packet
+can match at most one flow, wildcarded or not. The current implementation
+performs best-effort detection of overlapping wildcarded flows and may reject
+some but not all of them. However, this behavior may change in future versions.
+
+
 Basic rule for evolving flow keys
 ---------------------------------
 
diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 52490b0e62b..de1fa5d3780 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2007-2011 Nicira Networks.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -379,6 +379,12 @@ struct ovs_key_nd {
  * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the
  * last-used time, accumulated TCP flags, and statistics for this flow.
  * Otherwise ignored in requests.  Never present in notifications.
+ * @OVS_FLOW_ATTR_MASK: Nested %OVS_KEY_ATTR_* attributes specifying the
+ * mask bits for wildcarded flow match. Mask bit value '1' specifies exact
+ * match with corresponding flow key bit, while mask bit value '0' specifies
+ * a wildcarded match. Omitting attribute is treated as wildcarding all
+ * corresponding fields. Optional for all requests. If not present,
+ * all flow key bits are exact match bits.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_FLOW_* commands.
@@ -391,6 +397,7 @@ enum ovs_flow_attr {
 	OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */
 	OVS_FLOW_ATTR_USED,      /* u64 msecs last used in monotonic time. */
 	OVS_FLOW_ATTR_CLEAR,     /* Flag to clear stats, tcp_flags, used. */
+	OVS_FLOW_ATTR_MASK,      /* Sequence of OVS_KEY_ATTR_* attributes. */
 	__OVS_FLOW_ATTR_MAX
 };
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ab101f71544..1f680222f4f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -376,8 +376,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 	const struct nlattr *a;
 	int rem;
 
+	BUG_ON(!OVS_CB(skb)->pkt_key);
+
 	upcall.cmd = OVS_PACKET_CMD_ACTION;
-	upcall.key = &OVS_CB(skb)->flow->key;
+	upcall.key = OVS_CB(skb)->pkt_key;
 	upcall.userdata = NULL;
 	upcall.portid = 0;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9d97ef3c983..d29cd9aa4a6 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -165,7 +165,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 {
 	struct datapath *dp = container_of(rcu, struct datapath, rcu);
 
-	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table);
+	ovs_flow_tbl_destroy((__force struct flow_table *)dp->table, false);
 	free_percpu(dp->stats_percpu);
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp->ports);
@@ -226,19 +226,18 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	struct sw_flow_key key;
 	u64 *stats_counter;
 	int error;
-	int key_len;
 
 	stats = this_cpu_ptr(dp->stats_percpu);
 
 	/* Extract flow from 'skb' into 'key'. */
-	error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
+	error = ovs_flow_extract(skb, p->port_no, &key);
 	if (unlikely(error)) {
 		kfree_skb(skb);
 		return;
 	}
 
 	/* Look up flow. */
-	flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len);
+	flow = ovs_flow_lookup(rcu_dereference(dp->table), &key);
 	if (unlikely(!flow)) {
 		struct dp_upcall_info upcall;
 
@@ -253,6 +252,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
 	}
 
 	OVS_CB(skb)->flow = flow;
+	OVS_CB(skb)->pkt_key = &key;
 
 	stats_counter = &stats->n_hit;
 	ovs_flow_used(OVS_CB(skb)->flow, skb);
@@ -435,7 +435,7 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
 	upcall->dp_ifindex = dp_ifindex;
 
 	nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY);
-	ovs_flow_to_nlattrs(upcall_info->key, user_skb);
+	ovs_flow_to_nlattrs(upcall_info->key, upcall_info->key, user_skb);
 	nla_nest_end(user_skb, nla);
 
 	if (upcall_info->userdata)
@@ -468,7 +468,7 @@ static int flush_flows(struct datapath *dp)
 
 	rcu_assign_pointer(dp->table, new_table);
 
-	ovs_flow_tbl_deferred_destroy(old_table);
+	ovs_flow_tbl_destroy(old_table, true);
 	return 0;
 }
 
@@ -611,10 +611,12 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
 static int validate_and_copy_set_tun(const struct nlattr *attr,
 				     struct sw_flow_actions **sfa)
 {
-	struct ovs_key_ipv4_tunnel tun_key;
+	struct sw_flow_match match;
+	struct sw_flow_key key;
 	int err, start;
 
-	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key);
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &match, false);
 	if (err)
 		return err;
 
@@ -622,7 +624,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	if (start < 0)
 		return start;
 
-	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
+	err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key,
+			sizeof(match.key->tun_key));
 	add_nested_action_end(*sfa, start);
 
 	return err;
@@ -857,7 +860,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct ethhdr *eth;
 	int len;
 	int err;
-	int key_len;
 
 	err = -EINVAL;
 	if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] ||
@@ -890,11 +892,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	if (IS_ERR(flow))
 		goto err_kfree_skb;
 
-	err = ovs_flow_extract(packet, -1, &flow->key, &key_len);
+	err = ovs_flow_extract(packet, -1, &flow->key);
 	if (err)
 		goto err_flow_free;
 
-	err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]);
+	err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
 	if (err)
 		goto err_flow_free;
 	acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
@@ -908,6 +910,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 		goto err_flow_free;
 
 	OVS_CB(packet)->flow = flow;
+	OVS_CB(packet)->pkt_key = &flow->key;
 	packet->priority = flow->key.phy.priority;
 	packet->mark = flow->key.phy.skb_mark;
 
@@ -922,13 +925,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	local_bh_enable();
 	rcu_read_unlock();
 
-	ovs_flow_free(flow);
+	ovs_flow_free(flow, false);
 	return err;
 
 err_unlock:
 	rcu_read_unlock();
 err_flow_free:
-	ovs_flow_free(flow);
+	ovs_flow_free(flow, false);
 err_kfree_skb:
 	kfree_skb(packet);
 err:
@@ -1045,7 +1048,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key));
+		err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key),
+					     nla_data(ovs_key));
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
@@ -1093,6 +1097,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
 {
 	return NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */
+		+ nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_MASK */
 		+ nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */
 		+ nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */
 		+ nla_total_size(8) /* OVS_FLOW_ATTR_USED */
@@ -1119,12 +1124,25 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
 
 	ovs_header->dp_ifindex = get_dpifindex(dp);
 
+	/* Fill flow key. */
 	nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY);
 	if (!nla)
 		goto nla_put_failure;
-	err = ovs_flow_to_nlattrs(&flow->key, skb);
+
+	err = ovs_flow_to_nlattrs(&flow->unmasked_key,
+			&flow->unmasked_key, skb);
+	if (err)
+		goto error;
+	nla_nest_end(skb, nla);
+
+	nla = nla_nest_start(skb, OVS_FLOW_ATTR_MASK);
+	if (!nla)
+		goto nla_put_failure;
+
+	err = ovs_flow_to_nlattrs(&flow->key, &flow->mask->key, skb);
 	if (err)
 		goto error;
+
 	nla_nest_end(skb, nla);
 
 	spin_lock_bh(&flow->lock);
@@ -1214,20 +1232,24 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr **a = info->attrs;
 	struct ovs_header *ovs_header = info->userhdr;
-	struct sw_flow_key key;
-	struct sw_flow *flow;
+	struct sw_flow_key key, masked_key;
+	struct sw_flow *flow = NULL;
+	struct sw_flow_mask mask;
 	struct sk_buff *reply;
 	struct datapath *dp;
 	struct flow_table *table;
 	struct sw_flow_actions *acts = NULL;
+	struct sw_flow_match match;
 	int error;
-	int key_len;
 
 	/* Extract key. */
 	error = -EINVAL;
 	if (!a[OVS_FLOW_ATTR_KEY])
 		goto error;
-	error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+	ovs_match_init(&match, &key, &mask);
+	error = ovs_match_from_nlattrs(&match,
+			a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]);
 	if (error)
 		goto error;
 
@@ -1238,9 +1260,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		if (IS_ERR(acts))
 			goto error;
 
-		error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key,  0, &acts);
-		if (error)
+		ovs_flow_key_mask(&masked_key, &key, &mask);
+		error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS],
+						  &masked_key, 0, &acts);
+		if (error) {
+			OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
 			goto err_kfree;
+		}
 	} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
 		error = -EINVAL;
 		goto error;
@@ -1253,8 +1279,11 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_ovs;
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+
+	/* Check if this is a duplicate flow */
+	flow = ovs_flow_lookup(table, &key);
 	if (!flow) {
+		struct sw_flow_mask *mask_p;
 		/* Bail out if we're not allowed to create a new flow. */
 		error = -ENOENT;
 		if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
@@ -1267,7 +1296,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 			new_table = ovs_flow_tbl_expand(table);
 			if (!IS_ERR(new_table)) {
 				rcu_assign_pointer(dp->table, new_table);
-				ovs_flow_tbl_deferred_destroy(table);
+				ovs_flow_tbl_destroy(table, true);
 				table = ovsl_dereference(dp->table);
 			}
 		}
@@ -1280,14 +1309,30 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		}
 		clear_stats(flow);
 
+		flow->key = masked_key;
+		flow->unmasked_key = key;
+
+		/* Make sure mask is unique in the system */
+		mask_p = ovs_sw_flow_mask_find(table, &mask);
+		if (!mask_p) {
+			/* Allocate a new mask if none exsits. */
+			mask_p = ovs_sw_flow_mask_alloc();
+			if (!mask_p)
+				goto err_flow_free;
+			mask_p->key = mask.key;
+			mask_p->range = mask.range;
+			ovs_sw_flow_mask_insert(table, mask_p);
+		}
+
+		ovs_sw_flow_mask_add_ref(mask_p);
+		flow->mask = mask_p;
 		rcu_assign_pointer(flow->sf_acts, acts);
 
 		/* Put flow in bucket. */
-		ovs_flow_tbl_insert(table, flow, &key, key_len);
+		ovs_flow_insert(table, flow);
 
 		reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
-						info->snd_seq,
-						OVS_FLOW_CMD_NEW);
+						info->snd_seq, OVS_FLOW_CMD_NEW);
 	} else {
 		/* We found a matching flow. */
 		struct sw_flow_actions *old_acts;
@@ -1303,6 +1348,13 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 		    info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))
 			goto err_unlock_ovs;
 
+		/* The unmasked key has to be the same for flow updates. */
+		error = -EINVAL;
+		if (!ovs_flow_cmp_unmasked_key(flow, &key, match.range.end)) {
+			OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
+			goto err_unlock_ovs;
+		}
+
 		/* Update actions. */
 		old_acts = ovsl_dereference(flow->sf_acts);
 		rcu_assign_pointer(flow->sf_acts, acts);
@@ -1327,6 +1379,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
 				ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
 	return 0;
 
+err_flow_free:
+	ovs_flow_free(flow, false);
 err_unlock_ovs:
 	ovs_unlock();
 err_kfree:
@@ -1344,12 +1398,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow *flow;
 	struct datapath *dp;
 	struct flow_table *table;
+	struct sw_flow_match match;
 	int err;
-	int key_len;
 
-	if (!a[OVS_FLOW_ATTR_KEY])
+	if (!a[OVS_FLOW_ATTR_KEY]) {
+		OVS_NLERR("Flow get message rejected, Key attribute missing.\n");
 		return -EINVAL;
-	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+	}
+
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		return err;
 
@@ -1361,7 +1419,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+	flow = ovs_flow_lookup_unmasked_key(table, &match);
 	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
@@ -1390,8 +1448,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	struct sw_flow *flow;
 	struct datapath *dp;
 	struct flow_table *table;
+	struct sw_flow_match match;
 	int err;
-	int key_len;
 
 	ovs_lock();
 	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
@@ -1404,12 +1462,14 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		err = flush_flows(dp);
 		goto unlock;
 	}
-	err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]);
+
+	ovs_match_init(&match, &key, NULL);
+	err = ovs_match_from_nlattrs(&match, a[OVS_FLOW_ATTR_KEY], NULL);
 	if (err)
 		goto unlock;
 
 	table = ovsl_dereference(dp->table);
-	flow = ovs_flow_tbl_lookup(table, &key, key_len);
+	flow = ovs_flow_lookup_unmasked_key(table, &match);
 	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
@@ -1421,13 +1481,13 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	ovs_flow_tbl_remove(table, flow);
+	ovs_flow_remove(table, flow);
 
 	err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid,
 				     info->snd_seq, 0, OVS_FLOW_CMD_DEL);
 	BUG_ON(err < 0);
 
-	ovs_flow_deferred_free(flow);
+	ovs_flow_free(flow, true);
 	ovs_unlock();
 
 	ovs_notify(reply, info, &ovs_dp_flow_multicast_group);
@@ -1457,7 +1517,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		bucket = cb->args[0];
 		obj = cb->args[1];
-		flow = ovs_flow_tbl_next(table, &bucket, &obj);
+		flow = ovs_flow_dump_next(table, &bucket, &obj);
 		if (!flow)
 			break;
 
@@ -1680,7 +1740,7 @@ err_destroy_ports_array:
 err_destroy_percpu:
 	free_percpu(dp->stats_percpu);
 err_destroy_table:
-	ovs_flow_tbl_destroy(ovsl_dereference(dp->table));
+	ovs_flow_tbl_destroy(ovsl_dereference(dp->table), false);
 err_free_dp:
 	release_net(ovs_dp_get_net(dp));
 	kfree(dp);
@@ -2287,7 +2347,7 @@ static void rehash_flow_table(struct work_struct *work)
 			new_table = ovs_flow_tbl_rehash(old_table);
 			if (!IS_ERR(new_table)) {
 				rcu_assign_pointer(dp->table, new_table);
-				ovs_flow_tbl_deferred_destroy(old_table);
+				ovs_flow_tbl_destroy(old_table, true);
 			}
 		}
 	}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index a9148648491..4d109c176ef 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -88,11 +88,13 @@ struct datapath {
 /**
  * struct ovs_skb_cb - OVS data in skb CB
  * @flow: The flow associated with this packet.  May be %NULL if no flow.
+ * @pkt_key: The flow information extracted from the packet.  Must be nonnull.
  * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
  * packet is not being tunneled.
  */
 struct ovs_skb_cb {
 	struct sw_flow		*flow;
+	struct sw_flow_key	*pkt_key;
 	struct ovs_key_ipv4_tunnel  *tun_key;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -183,4 +185,8 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq,
 
 int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb);
 void ovs_dp_notify_wq(struct work_struct *work);
+
+#define OVS_NLERR(fmt, ...) \
+	pr_info_once("netlink: " fmt, ##__VA_ARGS__)
+
 #endif /* datapath.h */
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index fca282520ce..1fceb965359 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -46,6 +46,184 @@
 
 static struct kmem_cache *flow_cache;
 
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+		struct sw_flow_key_range *range, u8 val);
+
+static void update_range__(struct sw_flow_match *match,
+			  size_t offset, size_t size, bool is_mask)
+{
+	struct sw_flow_key_range *range = NULL;
+	size_t start = offset;
+	size_t end = offset + size;
+
+	if (!is_mask)
+		range = &match->range;
+	else if (match->mask)
+		range = &match->mask->range;
+
+	if (!range)
+		return;
+
+	if (range->start == range->end) {
+		range->start = start;
+		range->end = end;
+		return;
+	}
+
+	if (range->start > start)
+		range->start = start;
+
+	if (range->end < end)
+		range->end = end;
+}
+
+#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \
+	do { \
+		update_range__(match, offsetof(struct sw_flow_key, field),  \
+				     sizeof((match)->key->field), is_mask); \
+		if (is_mask) {						    \
+			if ((match)->mask)				    \
+				(match)->mask->key.field = value;	    \
+		} else {                                                    \
+			(match)->key->field = value;		            \
+		}                                                           \
+	} while (0)
+
+#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \
+	do { \
+		update_range__(match, offsetof(struct sw_flow_key, field),  \
+				len, is_mask);                              \
+		if (is_mask) {						    \
+			if ((match)->mask)				    \
+				memcpy(&(match)->mask->key.field, value_p, len);\
+		} else {                                                    \
+			memcpy(&(match)->key->field, value_p, len);         \
+		}                                                           \
+	} while (0)
+
+void ovs_match_init(struct sw_flow_match *match,
+		    struct sw_flow_key *key,
+		    struct sw_flow_mask *mask)
+{
+	memset(match, 0, sizeof(*match));
+	match->key = key;
+	match->mask = mask;
+
+	memset(key, 0, sizeof(*key));
+
+	if (mask) {
+		memset(&mask->key, 0, sizeof(mask->key));
+		mask->range.start = mask->range.end = 0;
+	}
+}
+
+static bool ovs_match_validate(const struct sw_flow_match *match,
+		u64 key_attrs, u64 mask_attrs)
+{
+	u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET;
+	u64 mask_allowed = key_attrs;  /* At most allow all key attributes */
+
+	/* The following mask attributes allowed only if they
+	 * pass the validation tests. */
+	mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4)
+			| (1 << OVS_KEY_ATTR_IPV6)
+			| (1 << OVS_KEY_ATTR_TCP)
+			| (1 << OVS_KEY_ATTR_UDP)
+			| (1 << OVS_KEY_ATTR_ICMP)
+			| (1 << OVS_KEY_ATTR_ICMPV6)
+			| (1 << OVS_KEY_ATTR_ARP)
+			| (1 << OVS_KEY_ATTR_ND));
+
+	/* Always allowed mask fields. */
+	mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
+		       | (1 << OVS_KEY_ATTR_IN_PORT)
+		       | (1 << OVS_KEY_ATTR_ETHERTYPE));
+
+	/* Check key attributes. */
+	if (match->key->eth.type == htons(ETH_P_ARP)
+			|| match->key->eth.type == htons(ETH_P_RARP)) {
+		key_expected |= 1 << OVS_KEY_ATTR_ARP;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
+	}
+
+	if (match->key->eth.type == htons(ETH_P_IP)) {
+		key_expected |= 1 << OVS_KEY_ATTR_IPV4;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_IPV4;
+
+		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+			if (match->key->ip.proto == IPPROTO_UDP) {
+				key_expected |= 1 << OVS_KEY_ATTR_UDP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_TCP) {
+				key_expected |= 1 << OVS_KEY_ATTR_TCP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_ICMP) {
+				key_expected |= 1 << OVS_KEY_ATTR_ICMP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_ICMP;
+			}
+		}
+	}
+
+	if (match->key->eth.type == htons(ETH_P_IPV6)) {
+		key_expected |= 1 << OVS_KEY_ATTR_IPV6;
+		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+			mask_allowed |= 1 << OVS_KEY_ATTR_IPV6;
+
+		if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) {
+			if (match->key->ip.proto == IPPROTO_UDP) {
+				key_expected |= 1 << OVS_KEY_ATTR_UDP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_TCP) {
+				key_expected |= 1 << OVS_KEY_ATTR_TCP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_TCP;
+			}
+
+			if (match->key->ip.proto == IPPROTO_ICMPV6) {
+				key_expected |= 1 << OVS_KEY_ATTR_ICMPV6;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6;
+
+				if (match->key->ipv6.tp.src ==
+						htons(NDISC_NEIGHBOUR_SOLICITATION) ||
+				    match->key->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
+					key_expected |= 1 << OVS_KEY_ATTR_ND;
+					if (match->mask && (match->mask->key.ipv6.tp.src == htons(0xffff)))
+						mask_allowed |= 1 << OVS_KEY_ATTR_ND;
+				}
+			}
+		}
+	}
+
+	if ((key_attrs & key_expected) != key_expected) {
+		/* Key attributes check failed. */
+		OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n",
+				key_attrs, key_expected);
+		return false;
+	}
+
+	if ((mask_attrs & mask_allowed) != mask_attrs) {
+		/* Mask attributes check failed. */
+		OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n",
+				mask_attrs, mask_allowed);
+		return false;
+	}
+
+	return true;
+}
+
 static int check_header(struct sk_buff *skb, int len)
 {
 	if (unlikely(skb->len < len))
@@ -121,12 +299,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies)
 	return cur_ms - idle_ms;
 }
 
-#define SW_FLOW_KEY_OFFSET(field)		\
-	(offsetof(struct sw_flow_key, field) +	\
-	 FIELD_SIZEOF(struct sw_flow_key, field))
-
-static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
-			 int *key_lenp)
+static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
 {
 	unsigned int nh_ofs = skb_network_offset(skb);
 	unsigned int nh_len;
@@ -136,8 +309,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key,
 	__be16 frag_off;
 	int err;
 
-	*key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label);
-
 	err = check_header(skb, nh_ofs + sizeof(*nh));
 	if (unlikely(err))
 		return err;
@@ -176,6 +347,21 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
 				  sizeof(struct icmp6hdr));
 }
 
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+		       const struct sw_flow_mask *mask)
+{
+	u8 *m = (u8 *)&mask->key + mask->range.start;
+	u8 *s = (u8 *)src + mask->range.start;
+	u8 *d = (u8 *)dst + mask->range.start;
+	int i;
+
+	memset(dst, 0, sizeof(*dst));
+	for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) {
+		*d = *s & *m;
+		d++, s++, m++;
+	}
+}
+
 #define TCP_FLAGS_OFFSET 13
 #define TCP_FLAG_MASK 0x3f
 
@@ -224,6 +410,7 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	spin_lock_init(&flow->lock);
 	flow->sf_acts = NULL;
+	flow->mask = NULL;
 
 	return flow;
 }
@@ -263,7 +450,7 @@ static void free_buckets(struct flex_array *buckets)
 	flex_array_free(buckets);
 }
 
-struct flow_table *ovs_flow_tbl_alloc(int new_size)
+static struct flow_table *__flow_tbl_alloc(int new_size)
 {
 	struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL);
 
@@ -281,17 +468,15 @@ struct flow_table *ovs_flow_tbl_alloc(int new_size)
 	table->node_ver = 0;
 	table->keep_flows = false;
 	get_random_bytes(&table->hash_seed, sizeof(u32));
+	table->mask_list = NULL;
 
 	return table;
 }
 
-void ovs_flow_tbl_destroy(struct flow_table *table)
+static void __flow_tbl_destroy(struct flow_table *table)
 {
 	int i;
 
-	if (!table)
-		return;
-
 	if (table->keep_flows)
 		goto skip_flows;
 
@@ -303,31 +488,55 @@ void ovs_flow_tbl_destroy(struct flow_table *table)
 
 		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
 			hlist_del(&flow->hash_node[ver]);
-			ovs_flow_free(flow);
+			ovs_flow_free(flow, false);
 		}
 	}
 
+	BUG_ON(!list_empty(table->mask_list));
+	kfree(table->mask_list);
+
 skip_flows:
 	free_buckets(table->buckets);
 	kfree(table);
 }
 
+struct flow_table *ovs_flow_tbl_alloc(int new_size)
+{
+	struct flow_table *table = __flow_tbl_alloc(new_size);
+
+	if (!table)
+		return NULL;
+
+	table->mask_list = kmalloc(sizeof(struct list_head), GFP_KERNEL);
+	if (!table->mask_list) {
+		table->keep_flows = true;
+		__flow_tbl_destroy(table);
+		return NULL;
+	}
+	INIT_LIST_HEAD(table->mask_list);
+
+	return table;
+}
+
 static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
 {
 	struct flow_table *table = container_of(rcu, struct flow_table, rcu);
 
-	ovs_flow_tbl_destroy(table);
+	__flow_tbl_destroy(table);
 }
 
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table)
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
 {
 	if (!table)
 		return;
 
-	call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+	if (deferred)
+		call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb);
+	else
+		__flow_tbl_destroy(table);
 }
 
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last)
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *last)
 {
 	struct sw_flow *flow;
 	struct hlist_head *head;
@@ -353,11 +562,13 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
 	return NULL;
 }
 
-static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
+static void __tbl_insert(struct flow_table *table, struct sw_flow *flow)
 {
 	struct hlist_head *head;
+
 	head = find_bucket(table, flow->hash);
 	hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
+
 	table->count++;
 }
 
@@ -377,8 +588,10 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
 		head = flex_array_get(old->buckets, i);
 
 		hlist_for_each_entry(flow, head, hash_node[old_ver])
-			__flow_tbl_insert(new, flow);
+			__tbl_insert(new, flow);
 	}
+
+	new->mask_list = old->mask_list;
 	old->keep_flows = true;
 }
 
@@ -386,7 +599,7 @@ static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buck
 {
 	struct flow_table *new_table;
 
-	new_table = ovs_flow_tbl_alloc(n_buckets);
+	new_table = __flow_tbl_alloc(n_buckets);
 	if (!new_table)
 		return ERR_PTR(-ENOMEM);
 
@@ -405,28 +618,30 @@ struct flow_table *ovs_flow_tbl_expand(struct flow_table *table)
 	return __flow_tbl_rehash(table, table->n_buckets * 2);
 }
 
-void ovs_flow_free(struct sw_flow *flow)
+static void __flow_free(struct sw_flow *flow)
 {
-	if (unlikely(!flow))
-		return;
-
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
 	kmem_cache_free(flow_cache, flow);
 }
 
-/* RCU callback used by ovs_flow_deferred_free. */
 static void rcu_free_flow_callback(struct rcu_head *rcu)
 {
 	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
 
-	ovs_flow_free(flow);
+	__flow_free(flow);
 }
 
-/* Schedules 'flow' to be freed after the next RCU grace period.
- * The caller must hold rcu_read_lock for this to be sensible. */
-void ovs_flow_deferred_free(struct sw_flow *flow)
+void ovs_flow_free(struct sw_flow *flow, bool deferred)
 {
-	call_rcu(&flow->rcu, rcu_free_flow_callback);
+	if (!flow)
+		return;
+
+	ovs_sw_flow_mask_del_ref(flow->mask, deferred);
+
+	if (deferred)
+		call_rcu(&flow->rcu, rcu_free_flow_callback);
+	else
+		__flow_free(flow);
 }
 
 /* Schedules 'sf_acts' to be freed after the next RCU grace period.
@@ -497,18 +712,15 @@ static __be16 parse_ethertype(struct sk_buff *skb)
 }
 
 static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
-			int *key_lenp, int nh_len)
+			int nh_len)
 {
 	struct icmp6hdr *icmp = icmp6_hdr(skb);
-	int error = 0;
-	int key_len;
 
 	/* The ICMPv6 type and code fields use the 16-bit transport port
 	 * fields, so we need to store them in 16-bit network byte order.
 	 */
 	key->ipv6.tp.src = htons(icmp->icmp6_type);
 	key->ipv6.tp.dst = htons(icmp->icmp6_code);
-	key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 
 	if (icmp->icmp6_code == 0 &&
 	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
@@ -517,21 +729,17 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 		struct nd_msg *nd;
 		int offset;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-
 		/* In order to process neighbor discovery options, we need the
 		 * entire packet.
 		 */
 		if (unlikely(icmp_len < sizeof(*nd)))
-			goto out;
-		if (unlikely(skb_linearize(skb))) {
-			error = -ENOMEM;
-			goto out;
-		}
+			return 0;
+
+		if (unlikely(skb_linearize(skb)))
+			return -ENOMEM;
 
 		nd = (struct nd_msg *)skb_transport_header(skb);
 		key->ipv6.nd.target = nd->target;
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
 
 		icmp_len -= sizeof(*nd);
 		offset = 0;
@@ -541,7 +749,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 			int opt_len = nd_opt->nd_opt_len * 8;
 
 			if (unlikely(!opt_len || opt_len > icmp_len))
-				goto invalid;
+				return 0;
 
 			/* Store the link layer address if the appropriate
 			 * option is provided.  It is considered an error if
@@ -566,16 +774,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
 		}
 	}
 
-	goto out;
+	return 0;
 
 invalid:
 	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
 	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
 	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));
 
-out:
-	*key_lenp = key_len;
-	return error;
+	return 0;
 }
 
 /**
@@ -584,7 +790,6 @@ out:
  * Ethernet header
  * @in_port: port number on which @skb was received.
  * @key: output flow key
- * @key_lenp: length of output flow key
  *
  * The caller must ensure that skb->len >= ETH_HLEN.
  *
@@ -602,11 +807,9 @@ out:
  *      of a correct length, otherwise the same as skb->network_header.
  *      For other key->eth.type values it is left untouched.
  */
-int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
-		 int *key_lenp)
+int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 {
-	int error = 0;
-	int key_len = SW_FLOW_KEY_OFFSET(eth);
+	int error;
 	struct ethhdr *eth;
 
 	memset(key, 0, sizeof(*key));
@@ -649,15 +852,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		struct iphdr *nh;
 		__be16 offset;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
-
 		error = check_iphdr(skb);
 		if (unlikely(error)) {
 			if (error == -EINVAL) {
 				skb->transport_header = skb->network_header;
 				error = 0;
 			}
-			goto out;
+			return error;
 		}
 
 		nh = ip_hdr(skb);
@@ -671,7 +872,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 		offset = nh->frag_off & htons(IP_OFFSET);
 		if (offset) {
 			key->ip.frag = OVS_FRAG_TYPE_LATER;
-			goto out;
+			return 0;
 		}
 		if (nh->frag_off & htons(IP_MF) ||
 			 skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
@@ -679,21 +880,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 
 		/* Transport layer. */
 		if (key->ip.proto == IPPROTO_TCP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (tcphdr_ok(skb)) {
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv4.tp.src = tcp->source;
 				key->ipv4.tp.dst = tcp->dest;
 			}
 		} else if (key->ip.proto == IPPROTO_UDP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->ipv4.tp.src = udp->source;
 				key->ipv4.tp.dst = udp->dest;
 			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
 			if (icmphdr_ok(skb)) {
 				struct icmphdr *icmp = icmp_hdr(skb);
 				/* The ICMP type and code fields use the 16-bit
@@ -722,53 +920,49 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
 			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
 			memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN);
 			memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
-			key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
 		}
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		int nh_len;             /* IPv6 Header + Extensions */
 
-		nh_len = parse_ipv6hdr(skb, key, &key_len);
+		nh_len = parse_ipv6hdr(skb, key);
 		if (unlikely(nh_len < 0)) {
-			if (nh_len == -EINVAL)
+			if (nh_len == -EINVAL) {
 				skb->transport_header = skb->network_header;
-			else
+				error = 0;
+			} else {
 				error = nh_len;
-			goto out;
+			}
+			return error;
 		}
 
 		if (key->ip.frag == OVS_FRAG_TYPE_LATER)
-			goto out;
+			return 0;
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
 			key->ip.frag = OVS_FRAG_TYPE_FIRST;
 
 		/* Transport layer. */
 		if (key->ip.proto == NEXTHDR_TCP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (tcphdr_ok(skb)) {
 				struct tcphdr *tcp = tcp_hdr(skb);
 				key->ipv6.tp.src = tcp->source;
 				key->ipv6.tp.dst = tcp->dest;
 			}
 		} else if (key->ip.proto == NEXTHDR_UDP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (udphdr_ok(skb)) {
 				struct udphdr *udp = udp_hdr(skb);
 				key->ipv6.tp.src = udp->source;
 				key->ipv6.tp.dst = udp->dest;
 			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
-			key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
 			if (icmp6hdr_ok(skb)) {
-				error = parse_icmpv6(skb, key, &key_len, nh_len);
-				if (error < 0)
-					goto out;
+				error = parse_icmpv6(skb, key, nh_len);
+				if (error)
+					return error;
 			}
 		}
 	}
 
-out:
-	*key_lenp = key_len;
-	return error;
+	return 0;
 }
 
 static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
@@ -777,7 +971,7 @@ static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_l
 		      DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
 }
 
-static int flow_key_start(struct sw_flow_key *key)
+static int flow_key_start(const struct sw_flow_key *key)
 {
 	if (key->tun_key.ipv4_dst)
 		return 0;
@@ -785,39 +979,95 @@ static int flow_key_start(struct sw_flow_key *key)
 		return offsetof(struct sw_flow_key, phy);
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-				struct sw_flow_key *key, int key_len)
+static bool __cmp_key(const struct sw_flow_key *key1,
+		const struct sw_flow_key *key2,  int key_start, int key_len)
+{
+	return !memcmp((u8 *)key1 + key_start,
+			(u8 *)key2 + key_start, (key_len - key_start));
+}
+
+static bool __flow_cmp_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_start, int key_len)
+{
+	return __cmp_key(&flow->key, key, key_start, key_len);
+}
+
+static bool __flow_cmp_unmasked_key(const struct sw_flow *flow,
+		  const struct sw_flow_key *key, int key_start, int key_len)
+{
+	return __cmp_key(&flow->unmasked_key, key, key_start, key_len);
+}
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_len)
+{
+	int key_start;
+	key_start = flow_key_start(key);
+
+	return __flow_cmp_unmasked_key(flow, key, key_start, key_len);
+
+}
+
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+				       struct sw_flow_match *match)
+{
+	struct sw_flow_key *unmasked = match->key;
+	int key_len = match->range.end;
+	struct sw_flow *flow;
+
+	flow = ovs_flow_lookup(table, unmasked);
+	if (flow && (!ovs_flow_cmp_unmasked_key(flow, unmasked, key_len)))
+		flow = NULL;
+
+	return flow;
+}
+
+static struct sw_flow *ovs_masked_flow_lookup(struct flow_table *table,
+				    const struct sw_flow_key *flow_key,
+				    struct sw_flow_mask *mask)
 {
 	struct sw_flow *flow;
 	struct hlist_head *head;
-	u8 *_key;
-	int key_start;
+	int key_start = mask->range.start;
+	int key_len = mask->range.end;
 	u32 hash;
+	struct sw_flow_key masked_key;
 
-	key_start = flow_key_start(key);
-	hash = ovs_flow_hash(key, key_start, key_len);
-
-	_key = (u8 *) key + key_start;
+	ovs_flow_key_mask(&masked_key, flow_key, mask);
+	hash = ovs_flow_hash(&masked_key, key_start, key_len);
 	head = find_bucket(table, hash);
 	hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
-
-		if (flow->hash == hash &&
-		    !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) {
+		if (flow->mask == mask &&
+		    __flow_cmp_key(flow, &masked_key, key_start, key_len))
 			return flow;
-		}
 	}
 	return NULL;
 }
 
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-			 struct sw_flow_key *key, int key_len)
+struct sw_flow *ovs_flow_lookup(struct flow_table *tbl,
+				const struct sw_flow_key *key)
 {
-	flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len);
-	memcpy(&flow->key, key, sizeof(flow->key));
-	__flow_tbl_insert(table, flow);
+	struct sw_flow *flow = NULL;
+	struct sw_flow_mask *mask;
+
+	list_for_each_entry_rcu(mask, tbl->mask_list, list) {
+		flow = ovs_masked_flow_lookup(tbl, key, mask);
+		if (flow)  /* Found */
+			break;
+	}
+
+	return flow;
 }
 
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
+
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow)
+{
+	flow->hash = ovs_flow_hash(&flow->key, flow->mask->range.start,
+			flow->mask->range.end);
+	__tbl_insert(table, flow);
+}
+
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow)
 {
 	BUG_ON(table->count == 0);
 	hlist_del_rcu(&flow->hash_node[table->node_ver]);
@@ -844,149 +1094,84 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_TUNNEL] = -1,
 };
 
-static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-				  const struct nlattr *a[], u32 *attrs)
-{
-	const struct ovs_key_icmp *icmp_key;
-	const struct ovs_key_tcp *tcp_key;
-	const struct ovs_key_udp *udp_key;
-
-	switch (swkey->ip.proto) {
-	case IPPROTO_TCP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-		swkey->ipv4.tp.src = tcp_key->tcp_src;
-		swkey->ipv4.tp.dst = tcp_key->tcp_dst;
-		break;
-
-	case IPPROTO_UDP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-		swkey->ipv4.tp.src = udp_key->udp_src;
-		swkey->ipv4.tp.dst = udp_key->udp_dst;
-		break;
-
-	case IPPROTO_ICMP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv4.tp);
-		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
-		swkey->ipv4.tp.src = htons(icmp_key->icmp_type);
-		swkey->ipv4.tp.dst = htons(icmp_key->icmp_code);
-		break;
-	}
-
-	return 0;
-}
-
-static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
-				  const struct nlattr *a[], u32 *attrs)
+static bool is_all_zero(const u8 *fp, size_t size)
 {
-	const struct ovs_key_icmpv6 *icmpv6_key;
-	const struct ovs_key_tcp *tcp_key;
-	const struct ovs_key_udp *udp_key;
-
-	switch (swkey->ip.proto) {
-	case IPPROTO_TCP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_TCP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_TCP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
-		swkey->ipv6.tp.src = tcp_key->tcp_src;
-		swkey->ipv6.tp.dst = tcp_key->tcp_dst;
-		break;
-
-	case IPPROTO_UDP:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_UDP)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_UDP);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
-		swkey->ipv6.tp.src = udp_key->udp_src;
-		swkey->ipv6.tp.dst = udp_key->udp_dst;
-		break;
-
-	case IPPROTO_ICMPV6:
-		if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6)))
-			return -EINVAL;
-		*attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
-
-		*key_len = SW_FLOW_KEY_OFFSET(ipv6.tp);
-		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
-		swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type);
-		swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code);
+	int i;
 
-		if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) ||
-		    swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) {
-			const struct ovs_key_nd *nd_key;
+	if (!fp)
+		return false;
 
-			if (!(*attrs & (1 << OVS_KEY_ATTR_ND)))
-				return -EINVAL;
-			*attrs &= ~(1 << OVS_KEY_ATTR_ND);
-
-			*key_len = SW_FLOW_KEY_OFFSET(ipv6.nd);
-			nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
-			memcpy(&swkey->ipv6.nd.target, nd_key->nd_target,
-			       sizeof(swkey->ipv6.nd.target));
-			memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN);
-			memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN);
-		}
-		break;
-	}
+	for (i = 0; i < size; i++)
+		if (fp[i])
+			return false;
 
-	return 0;
+	return true;
 }
 
-static int parse_flow_nlattrs(const struct nlattr *attr,
-			      const struct nlattr *a[], u32 *attrsp)
+static int __parse_flow_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[],
+			      u64 *attrsp, bool nz)
 {
 	const struct nlattr *nla;
 	u32 attrs;
 	int rem;
 
-	attrs = 0;
+	attrs = *attrsp;
 	nla_for_each_nested(nla, attr, rem) {
 		u16 type = nla_type(nla);
 		int expected_len;
 
-		if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type))
+		if (type > OVS_KEY_ATTR_MAX) {
+			OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n",
+				  type, OVS_KEY_ATTR_MAX);
+		}
+
+		if (attrs & (1 << type)) {
+			OVS_NLERR("Duplicate key attribute (type %d).\n", type);
 			return -EINVAL;
+		}
 
 		expected_len = ovs_key_lens[type];
-		if (nla_len(nla) != expected_len && expected_len != -1)
+		if (nla_len(nla) != expected_len && expected_len != -1) {
+			OVS_NLERR("Key attribute has unexpected length (type=%d"
+				  ", length=%d, expected=%d).\n", type,
+				  nla_len(nla), expected_len);
 			return -EINVAL;
+		}
 
-		attrs |= 1 << type;
-		a[type] = nla;
+		if (!nz || !is_all_zero(nla_data(nla), expected_len)) {
+			attrs |= 1 << type;
+			a[type] = nla;
+		}
 	}
-	if (rem)
+	if (rem) {
+		OVS_NLERR("Message has %d unknown bytes.\n", rem);
 		return -EINVAL;
+	}
 
 	*attrsp = attrs;
 	return 0;
 }
 
+static int parse_flow_mask_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[], u64 *attrsp)
+{
+	return __parse_flow_nlattrs(attr, a, attrsp, true);
+}
+
+static int parse_flow_nlattrs(const struct nlattr *attr,
+			      const struct nlattr *a[], u64 *attrsp)
+{
+	return __parse_flow_nlattrs(attr, a, attrsp, false);
+}
+
 int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
-			     struct ovs_key_ipv4_tunnel *tun_key)
+			     struct sw_flow_match *match, bool is_mask)
 {
 	struct nlattr *a;
 	int rem;
 	bool ttl = false;
-
-	memset(tun_key, 0, sizeof(*tun_key));
+	__be16 tun_flags = 0;
 
 	nla_for_each_nested(a, attr, rem) {
 		int type = nla_type(a);
@@ -1000,53 +1185,78 @@ int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
 			[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
 		};
 
-		if (type > OVS_TUNNEL_KEY_ATTR_MAX ||
-			ovs_tunnel_key_lens[type] != nla_len(a))
+		if (type > OVS_TUNNEL_KEY_ATTR_MAX) {
+			OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n",
+			type, OVS_TUNNEL_KEY_ATTR_MAX);
 			return -EINVAL;
+		}
+
+		if (ovs_tunnel_key_lens[type] != nla_len(a)) {
+			OVS_NLERR("IPv4 tunnel attribute type has unexpected "
+				  " length (type=%d, length=%d, expected=%d).\n",
+				  type, nla_len(a), ovs_tunnel_key_lens[type]);
+			return -EINVAL;
+		}
 
 		switch (type) {
 		case OVS_TUNNEL_KEY_ATTR_ID:
-			tun_key->tun_id = nla_get_be64(a);
-			tun_key->tun_flags |= TUNNEL_KEY;
+			SW_FLOW_KEY_PUT(match, tun_key.tun_id,
+					nla_get_be64(a), is_mask);
+			tun_flags |= TUNNEL_KEY;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
-			tun_key->ipv4_src = nla_get_be32(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+					nla_get_be32(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
-			tun_key->ipv4_dst = nla_get_be32(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+					nla_get_be32(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TOS:
-			tun_key->ipv4_tos = nla_get_u8(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+					nla_get_u8(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TTL:
-			tun_key->ipv4_ttl = nla_get_u8(a);
+			SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+					nla_get_u8(a), is_mask);
 			ttl = true;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
-			tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT;
+			tun_flags |= TUNNEL_DONT_FRAGMENT;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_CSUM:
-			tun_key->tun_flags |= TUNNEL_CSUM;
+			tun_flags |= TUNNEL_CSUM;
 			break;
 		default:
 			return -EINVAL;
-
 		}
 	}
-	if (rem > 0)
-		return -EINVAL;
 
-	if (!tun_key->ipv4_dst)
-		return -EINVAL;
+	SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
 
-	if (!ttl)
+	if (rem > 0) {
+		OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem);
 		return -EINVAL;
+	}
+
+	if (!is_mask) {
+		if (!match->key->tun_key.ipv4_dst) {
+			OVS_NLERR("IPv4 tunnel destination address is zero.\n");
+			return -EINVAL;
+		}
+
+		if (!ttl) {
+			OVS_NLERR("IPv4 tunnel TTL not specified.\n");
+			return -EINVAL;
+		}
+	}
 
 	return 0;
 }
 
 int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
-			   const struct ovs_key_ipv4_tunnel *tun_key)
+			   const struct ovs_key_ipv4_tunnel *tun_key,
+			   const struct ovs_key_ipv4_tunnel *output)
 {
 	struct nlattr *nla;
 
@@ -1054,23 +1264,24 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if (!nla)
 		return -EMSGSIZE;
 
-	if (tun_key->tun_flags & TUNNEL_KEY &&
-	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id))
+	if (output->tun_flags & TUNNEL_KEY &&
+	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
 		return -EMSGSIZE;
-	if (tun_key->ipv4_src &&
-	    nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src))
+	if (output->ipv4_src &&
+		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
 		return -EMSGSIZE;
-	if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst))
+	if (output->ipv4_dst &&
+		nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
 		return -EMSGSIZE;
-	if (tun_key->ipv4_tos &&
-	    nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos))
+	if (output->ipv4_tos &&
+		nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
 		return -EMSGSIZE;
-	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl))
+	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
 		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_CSUM) &&
+	if ((output->tun_flags & TUNNEL_CSUM) &&
 		nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
 		return -EMSGSIZE;
 
@@ -1078,176 +1289,372 @@ int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
 	return 0;
 }
 
-/**
- * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
- * @swkey: receives the extracted flow key.
- * @key_lenp: number of bytes used in @swkey.
- * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
- * sequence.
- */
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
-		      const struct nlattr *attr)
+static int metadata_from_nlattrs(struct sw_flow_match *match,  u64 *attrs,
+		const struct nlattr **a, bool is_mask)
 {
-	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
-	const struct ovs_key_ethernet *eth_key;
-	int key_len;
-	u32 attrs;
-	int err;
+	if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
+		SW_FLOW_KEY_PUT(match, phy.priority,
+			  nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+	}
 
-	memset(swkey, 0, sizeof(struct sw_flow_key));
-	key_len = SW_FLOW_KEY_OFFSET(eth);
+	if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
+		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
 
-	err = parse_flow_nlattrs(attr, a, &attrs);
-	if (err)
-		return err;
+		if (is_mask)
+			in_port = 0xffffffff; /* Always exact match in_port. */
+		else if (in_port >= DP_MAX_PORTS)
+			return -EINVAL;
 
-	/* Metadata attributes. */
-	if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) {
-		swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]);
-		attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY);
+		SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
+	} else if (!is_mask) {
+		SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask);
 	}
-	if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) {
-		u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]);
-		if (in_port >= DP_MAX_PORTS)
-			return -EINVAL;
-		swkey->phy.in_port = in_port;
-		attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT);
-	} else {
-		swkey->phy.in_port = DP_MAX_PORTS;
+
+	if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
+		uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
+
+		SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask);
+		*attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
 	}
-	if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) {
-		swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]);
-		attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
+	if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
+		if (ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match,
+					is_mask))
+			return -EINVAL;
+		*attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
 	}
+	return 0;
+}
 
-	if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
-		err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key);
-		if (err)
-			return err;
+static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
+		const struct nlattr **a, bool is_mask)
+{
+	int err;
+	u64 orig_attrs = attrs;
 
-		attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
-	}
+	err = metadata_from_nlattrs(match, &attrs, a, is_mask);
+	if (err)
+		return err;
 
-	/* Data attributes. */
-	if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
-		return -EINVAL;
-	attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+	if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) {
+		const struct ovs_key_ethernet *eth_key;
 
-	eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
-	memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN);
-	memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN);
+		eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]);
+		SW_FLOW_KEY_MEMCPY(match, eth.src,
+				eth_key->eth_src, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, eth.dst,
+				eth_key->eth_dst, ETH_ALEN, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET);
+	}
 
-	if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) &&
-	    nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) {
-		const struct nlattr *encap;
+	if (attrs & (1 << OVS_KEY_ATTR_VLAN)) {
 		__be16 tci;
 
-		if (attrs != ((1 << OVS_KEY_ATTR_VLAN) |
-			      (1 << OVS_KEY_ATTR_ETHERTYPE) |
-			      (1 << OVS_KEY_ATTR_ENCAP)))
-			return -EINVAL;
-
-		encap = a[OVS_KEY_ATTR_ENCAP];
 		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
-		if (tci & htons(VLAN_TAG_PRESENT)) {
-			swkey->eth.tci = tci;
-
-			err = parse_flow_nlattrs(encap, a, &attrs);
-			if (err)
-				return err;
-		} else if (!tci) {
-			/* Corner case for truncated 802.1Q header. */
-			if (nla_len(encap))
-				return -EINVAL;
+		if (!(tci & htons(VLAN_TAG_PRESENT))) {
+			if (is_mask)
+				OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n");
+			else
+				OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n");
 
-			swkey->eth.type = htons(ETH_P_8021Q);
-			*key_lenp = key_len;
-			return 0;
-		} else {
 			return -EINVAL;
 		}
-	}
+
+		SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_VLAN);
+	} else if (!is_mask)
+		SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true);
 
 	if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) {
-		swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
-		if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN)
+		__be16 eth_type;
+
+		eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+		if (is_mask) {
+			/* Always exact match EtherType. */
+			eth_type = htons(0xffff);
+		} else if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+			OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n",
+					ntohs(eth_type), ETH_P_802_3_MIN);
 			return -EINVAL;
+		}
+
+		SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask);
 		attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
-	} else {
-		swkey->eth.type = htons(ETH_P_802_2);
+	} else if (!is_mask) {
+		SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
 	}
 
-	if (swkey->eth.type == htons(ETH_P_IP)) {
+	if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
 		const struct ovs_key_ipv4 *ipv4_key;
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_IPV4)))
-			return -EINVAL;
-		attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
-
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.addr);
 		ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
-		if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX)
+		if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) {
+			OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n",
+				ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
-		swkey->ip.proto = ipv4_key->ipv4_proto;
-		swkey->ip.tos = ipv4_key->ipv4_tos;
-		swkey->ip.ttl = ipv4_key->ipv4_ttl;
-		swkey->ip.frag = ipv4_key->ipv4_frag;
-		swkey->ipv4.addr.src = ipv4_key->ipv4_src;
-		swkey->ipv4.addr.dst = ipv4_key->ipv4_dst;
-
-		if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-			err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs);
-			if (err)
-				return err;
 		}
-	} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-		const struct ovs_key_ipv6 *ipv6_key;
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ipv4_key->ipv4_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.tos,
+				ipv4_key->ipv4_tos, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.ttl,
+				ipv4_key->ipv4_ttl, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.frag,
+				ipv4_key->ipv4_frag, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+				ipv4_key->ipv4_src, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+				ipv4_key->ipv4_dst, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
+	}
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_IPV6)))
-			return -EINVAL;
-		attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+	if (attrs & (1 << OVS_KEY_ATTR_IPV6)) {
+		const struct ovs_key_ipv6 *ipv6_key;
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv6.label);
 		ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]);
-		if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX)
+		if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) {
+			OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n",
+				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
-		swkey->ipv6.label = ipv6_key->ipv6_label;
-		swkey->ip.proto = ipv6_key->ipv6_proto;
-		swkey->ip.tos = ipv6_key->ipv6_tclass;
-		swkey->ip.ttl = ipv6_key->ipv6_hlimit;
-		swkey->ip.frag = ipv6_key->ipv6_frag;
-		memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src,
-		       sizeof(swkey->ipv6.addr.src));
-		memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst,
-		       sizeof(swkey->ipv6.addr.dst));
-
-		if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) {
-			err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs);
-			if (err)
-				return err;
 		}
-	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
-		   swkey->eth.type == htons(ETH_P_RARP)) {
+		SW_FLOW_KEY_PUT(match, ipv6.label,
+				ipv6_key->ipv6_label, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ipv6_key->ipv6_proto, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.tos,
+				ipv6_key->ipv6_tclass, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.ttl,
+				ipv6_key->ipv6_hlimit, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.frag,
+				ipv6_key->ipv6_frag, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src,
+				ipv6_key->ipv6_src,
+				sizeof(match->key->ipv6.addr.src),
+				is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst,
+				ipv6_key->ipv6_dst,
+				sizeof(match->key->ipv6.addr.dst),
+				is_mask);
+
+		attrs &= ~(1 << OVS_KEY_ATTR_IPV6);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ARP)) {
 		const struct ovs_key_arp *arp_key;
 
-		if (!(attrs & (1 << OVS_KEY_ATTR_ARP)))
+		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
+		if (!is_mask && (arp_key->arp_op & htons(0xff00))) {
+			OVS_NLERR("Unknown ARP opcode (opcode=%d).\n",
+				  arp_key->arp_op);
 			return -EINVAL;
+		}
+
+		SW_FLOW_KEY_PUT(match, ipv4.addr.src,
+				arp_key->arp_sip, is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
+			arp_key->arp_tip, is_mask);
+		SW_FLOW_KEY_PUT(match, ip.proto,
+				ntohs(arp_key->arp_op), is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha,
+				arp_key->arp_sha, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha,
+				arp_key->arp_tha, ETH_ALEN, is_mask);
+
 		attrs &= ~(1 << OVS_KEY_ATTR_ARP);
+	}
 
-		key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
-		arp_key = nla_data(a[OVS_KEY_ATTR_ARP]);
-		swkey->ipv4.addr.src = arp_key->arp_sip;
-		swkey->ipv4.addr.dst = arp_key->arp_tip;
-		if (arp_key->arp_op & htons(0xff00))
+	if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
+		const struct ovs_key_tcp *tcp_key;
+
+		tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					tcp_key->tcp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					tcp_key->tcp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					tcp_key->tcp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					tcp_key->tcp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_TCP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_UDP)) {
+		const struct ovs_key_udp *udp_key;
+
+		udp_key = nla_data(a[OVS_KEY_ATTR_UDP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					udp_key->udp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					udp_key->udp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					udp_key->udp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					udp_key->udp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_UDP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
+		const struct ovs_key_icmp *icmp_key;
+
+		icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]);
+		SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+				htons(icmp_key->icmp_type), is_mask);
+		SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+				htons(icmp_key->icmp_code), is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ICMP);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) {
+		const struct ovs_key_icmpv6 *icmpv6_key;
+
+		icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]);
+		SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+				htons(icmpv6_key->icmpv6_type), is_mask);
+		SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+				htons(icmpv6_key->icmpv6_code), is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6);
+	}
+
+	if (attrs & (1 << OVS_KEY_ATTR_ND)) {
+		const struct ovs_key_nd *nd_key;
+
+		nd_key = nla_data(a[OVS_KEY_ATTR_ND]);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target,
+			nd_key->nd_target,
+			sizeof(match->key->ipv6.nd.target),
+			is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll,
+			nd_key->nd_sll, ETH_ALEN, is_mask);
+		SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll,
+				nd_key->nd_tll, ETH_ALEN, is_mask);
+		attrs &= ~(1 << OVS_KEY_ATTR_ND);
+	}
+
+	if (attrs != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * ovs_match_from_nlattrs - parses Netlink attributes into a flow key and
+ * mask. In case the 'mask' is NULL, the flow is treated as exact match
+ * flow. Otherwise, it is treated as a wildcarded flow, except the mask
+ * does not include any don't care bit.
+ * @match: receives the extracted flow match information.
+ * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
+ * sequence. The fields should of the packet that triggered the creation
+ * of this flow.
+ * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink
+ * attribute specifies the mask field of the wildcarded flow.
+ */
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+			   const struct nlattr *key,
+			   const struct nlattr *mask)
+{
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	const struct nlattr *encap;
+	u64 key_attrs = 0;
+	u64 mask_attrs = 0;
+	bool encap_valid = false;
+	int err;
+
+	err = parse_flow_nlattrs(key, a, &key_attrs);
+	if (err)
+		return err;
+
+	if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) &&
+	    (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) &&
+	    (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) {
+		__be16 tci;
+
+		if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) &&
+		      (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) {
+			OVS_NLERR("Invalid Vlan frame.\n");
 			return -EINVAL;
-		swkey->ip.proto = ntohs(arp_key->arp_op);
-		memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
-		memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
+		}
+
+		key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+		tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+		encap = a[OVS_KEY_ATTR_ENCAP];
+		key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+		encap_valid = true;
+
+		if (tci & htons(VLAN_TAG_PRESENT)) {
+			err = parse_flow_nlattrs(encap, a, &key_attrs);
+			if (err)
+				return err;
+		} else if (!tci) {
+			/* Corner case for truncated 802.1Q header. */
+			if (nla_len(encap)) {
+				OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n");
+				return -EINVAL;
+			}
+		} else {
+			OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n");
+			return  -EINVAL;
+		}
 	}
 
-	if (attrs)
+	err = ovs_key_from_nlattrs(match, key_attrs, a, false);
+	if (err)
+		return err;
+
+	if (mask) {
+		err = parse_flow_mask_nlattrs(mask, a, &mask_attrs);
+		if (err)
+			return err;
+
+		if (mask_attrs & 1ULL << OVS_KEY_ATTR_ENCAP)  {
+			__be16 eth_type = 0;
+			__be16 tci = 0;
+
+			if (!encap_valid) {
+				OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n");
+				return  -EINVAL;
+			}
+
+			mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP);
+			if (a[OVS_KEY_ATTR_ETHERTYPE])
+				eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]);
+
+			if (eth_type == htons(0xffff)) {
+				mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE);
+				encap = a[OVS_KEY_ATTR_ENCAP];
+				err = parse_flow_mask_nlattrs(encap, a, &mask_attrs);
+			} else {
+				OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n",
+						ntohs(eth_type));
+				return -EINVAL;
+			}
+
+			if (a[OVS_KEY_ATTR_VLAN])
+				tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]);
+
+			if (!(tci & htons(VLAN_TAG_PRESENT))) {
+				OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci));
+				return -EINVAL;
+			}
+		}
+
+		err = ovs_key_from_nlattrs(match, mask_attrs, a, true);
+		if (err)
+			return err;
+	} else {
+		/* Populate exact match flow's key mask. */
+		if (match->mask)
+			ovs_sw_flow_mask_set(match->mask, &match->range, 0xff);
+	}
+
+	if (!ovs_match_validate(match, key_attrs, mask_attrs))
 		return -EINVAL;
-	*key_lenp = key_len;
 
 	return 0;
 }
@@ -1255,7 +1662,6 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
 /**
  * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
  * @flow: Receives extracted in_port, priority, tun_key and skb_mark.
- * @key_len: Length of key in @flow.  Used for calculating flow hash.
  * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
  * sequence.
  *
@@ -1264,102 +1670,100 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
  * get the metadata, that is, the parts of the flow key that cannot be
  * extracted from the packet itself.
  */
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
-				   const struct nlattr *attr)
+
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+		const struct nlattr *attr)
 {
 	struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
-	const struct nlattr *nla;
-	int rem;
+	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
+	u64 attrs = 0;
+	int err;
+	struct sw_flow_match match;
 
 	flow->key.phy.in_port = DP_MAX_PORTS;
 	flow->key.phy.priority = 0;
 	flow->key.phy.skb_mark = 0;
 	memset(tun_key, 0, sizeof(flow->key.tun_key));
 
-	nla_for_each_nested(nla, attr, rem) {
-		int type = nla_type(nla);
-
-		if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
-			int err;
-
-			if (nla_len(nla) != ovs_key_lens[type])
-				return -EINVAL;
-
-			switch (type) {
-			case OVS_KEY_ATTR_PRIORITY:
-				flow->key.phy.priority = nla_get_u32(nla);
-				break;
-
-			case OVS_KEY_ATTR_TUNNEL:
-				err = ovs_ipv4_tun_from_nlattr(nla, tun_key);
-				if (err)
-					return err;
-				break;
-
-			case OVS_KEY_ATTR_IN_PORT:
-				if (nla_get_u32(nla) >= DP_MAX_PORTS)
-					return -EINVAL;
-				flow->key.phy.in_port = nla_get_u32(nla);
-				break;
-
-			case OVS_KEY_ATTR_SKB_MARK:
-				flow->key.phy.skb_mark = nla_get_u32(nla);
-				break;
-			}
-		}
-	}
-	if (rem)
+	err = parse_flow_nlattrs(attr, a, &attrs);
+	if (err)
 		return -EINVAL;
 
-	flow->hash = ovs_flow_hash(&flow->key,
-				   flow_key_start(&flow->key), key_len);
+	memset(&match, 0, sizeof(match));
+	match.key = &flow->key;
+
+	err = metadata_from_nlattrs(&match, &attrs, a, false);
+	if (err)
+		return err;
 
 	return 0;
 }
 
-int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
+int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
+		const struct sw_flow_key *output, struct sk_buff *skb)
 {
 	struct ovs_key_ethernet *eth_key;
 	struct nlattr *nla, *encap;
+	bool is_mask = (swkey != output);
 
-	if (swkey->phy.priority &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
+	if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
 		goto nla_put_failure;
 
-	if (swkey->tun_key.ipv4_dst &&
-	    ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key))
+	if ((swkey->tun_key.ipv4_dst || is_mask) &&
+	    ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key))
 		goto nla_put_failure;
 
-	if (swkey->phy.in_port != DP_MAX_PORTS &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
-		goto nla_put_failure;
+	if (swkey->phy.in_port == DP_MAX_PORTS) {
+		if (is_mask && (output->phy.in_port == 0xffff))
+			if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff))
+				goto nla_put_failure;
+	} else {
+		u16 upper_u16;
+		upper_u16 = !is_mask ? 0 : 0xffff;
 
-	if (swkey->phy.skb_mark &&
-	    nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark))
+		if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT,
+				(upper_u16 << 16) | output->phy.in_port))
+			goto nla_put_failure;
+	}
+
+	if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark))
 		goto nla_put_failure;
 
 	nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key));
 	if (!nla)
 		goto nla_put_failure;
+
 	eth_key = nla_data(nla);
-	memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN);
-	memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN);
+	memcpy(eth_key->eth_src, output->eth.src, ETH_ALEN);
+	memcpy(eth_key->eth_dst, output->eth.dst, ETH_ALEN);
 
 	if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) {
-		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) ||
-		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci))
+		__be16 eth_type;
+		eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff);
+		if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) ||
+		    nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci))
 			goto nla_put_failure;
 		encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP);
 		if (!swkey->eth.tci)
 			goto unencap;
-	} else {
+	} else
 		encap = NULL;
-	}
 
-	if (swkey->eth.type == htons(ETH_P_802_2))
+	if (swkey->eth.type == htons(ETH_P_802_2)) {
+		/*
+		 * Ethertype 802.2 is represented in the netlink with omitted
+		 * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and
+		 * 0xffff in the mask attribute.  Ethertype can also
+		 * be wildcarded.
+		 */
+		if (is_mask && output->eth.type)
+			if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE,
+						output->eth.type))
+				goto nla_put_failure;
 		goto unencap;
+	}
 
-	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type))
+	if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type))
 		goto nla_put_failure;
 
 	if (swkey->eth.type == htons(ETH_P_IP)) {
@@ -1369,12 +1773,12 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 		if (!nla)
 			goto nla_put_failure;
 		ipv4_key = nla_data(nla);
-		ipv4_key->ipv4_src = swkey->ipv4.addr.src;
-		ipv4_key->ipv4_dst = swkey->ipv4.addr.dst;
-		ipv4_key->ipv4_proto = swkey->ip.proto;
-		ipv4_key->ipv4_tos = swkey->ip.tos;
-		ipv4_key->ipv4_ttl = swkey->ip.ttl;
-		ipv4_key->ipv4_frag = swkey->ip.frag;
+		ipv4_key->ipv4_src = output->ipv4.addr.src;
+		ipv4_key->ipv4_dst = output->ipv4.addr.dst;
+		ipv4_key->ipv4_proto = output->ip.proto;
+		ipv4_key->ipv4_tos = output->ip.tos;
+		ipv4_key->ipv4_ttl = output->ip.ttl;
+		ipv4_key->ipv4_frag = output->ip.frag;
 	} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
 		struct ovs_key_ipv6 *ipv6_key;
 
@@ -1382,15 +1786,15 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 		if (!nla)
 			goto nla_put_failure;
 		ipv6_key = nla_data(nla);
-		memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src,
+		memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src,
 				sizeof(ipv6_key->ipv6_src));
-		memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst,
+		memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst,
 				sizeof(ipv6_key->ipv6_dst));
-		ipv6_key->ipv6_label = swkey->ipv6.label;
-		ipv6_key->ipv6_proto = swkey->ip.proto;
-		ipv6_key->ipv6_tclass = swkey->ip.tos;
-		ipv6_key->ipv6_hlimit = swkey->ip.ttl;
-		ipv6_key->ipv6_frag = swkey->ip.frag;
+		ipv6_key->ipv6_label = output->ipv6.label;
+		ipv6_key->ipv6_proto = output->ip.proto;
+		ipv6_key->ipv6_tclass = output->ip.tos;
+		ipv6_key->ipv6_hlimit = output->ip.ttl;
+		ipv6_key->ipv6_frag = output->ip.frag;
 	} else if (swkey->eth.type == htons(ETH_P_ARP) ||
 		   swkey->eth.type == htons(ETH_P_RARP)) {
 		struct ovs_key_arp *arp_key;
@@ -1400,11 +1804,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			goto nla_put_failure;
 		arp_key = nla_data(nla);
 		memset(arp_key, 0, sizeof(struct ovs_key_arp));
-		arp_key->arp_sip = swkey->ipv4.addr.src;
-		arp_key->arp_tip = swkey->ipv4.addr.dst;
-		arp_key->arp_op = htons(swkey->ip.proto);
-		memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
-		memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
+		arp_key->arp_sip = output->ipv4.addr.src;
+		arp_key->arp_tip = output->ipv4.addr.dst;
+		arp_key->arp_op = htons(output->ip.proto);
+		memcpy(arp_key->arp_sha, output->ipv4.arp.sha, ETH_ALEN);
+		memcpy(arp_key->arp_tha, output->ipv4.arp.tha, ETH_ALEN);
 	}
 
 	if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1419,11 +1823,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				goto nla_put_failure;
 			tcp_key = nla_data(nla);
 			if (swkey->eth.type == htons(ETH_P_IP)) {
-				tcp_key->tcp_src = swkey->ipv4.tp.src;
-				tcp_key->tcp_dst = swkey->ipv4.tp.dst;
+				tcp_key->tcp_src = output->ipv4.tp.src;
+				tcp_key->tcp_dst = output->ipv4.tp.dst;
 			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-				tcp_key->tcp_src = swkey->ipv6.tp.src;
-				tcp_key->tcp_dst = swkey->ipv6.tp.dst;
+				tcp_key->tcp_src = output->ipv6.tp.src;
+				tcp_key->tcp_dst = output->ipv6.tp.dst;
 			}
 		} else if (swkey->ip.proto == IPPROTO_UDP) {
 			struct ovs_key_udp *udp_key;
@@ -1433,11 +1837,11 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				goto nla_put_failure;
 			udp_key = nla_data(nla);
 			if (swkey->eth.type == htons(ETH_P_IP)) {
-				udp_key->udp_src = swkey->ipv4.tp.src;
-				udp_key->udp_dst = swkey->ipv4.tp.dst;
+				udp_key->udp_src = output->ipv4.tp.src;
+				udp_key->udp_dst = output->ipv4.tp.dst;
 			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
-				udp_key->udp_src = swkey->ipv6.tp.src;
-				udp_key->udp_dst = swkey->ipv6.tp.dst;
+				udp_key->udp_src = output->ipv6.tp.src;
+				udp_key->udp_dst = output->ipv6.tp.dst;
 			}
 		} else if (swkey->eth.type == htons(ETH_P_IP) &&
 			   swkey->ip.proto == IPPROTO_ICMP) {
@@ -1447,8 +1851,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			if (!nla)
 				goto nla_put_failure;
 			icmp_key = nla_data(nla);
-			icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src);
-			icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst);
+			icmp_key->icmp_type = ntohs(output->ipv4.tp.src);
+			icmp_key->icmp_code = ntohs(output->ipv4.tp.dst);
 		} else if (swkey->eth.type == htons(ETH_P_IPV6) &&
 			   swkey->ip.proto == IPPROTO_ICMPV6) {
 			struct ovs_key_icmpv6 *icmpv6_key;
@@ -1458,8 +1862,8 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 			if (!nla)
 				goto nla_put_failure;
 			icmpv6_key = nla_data(nla);
-			icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src);
-			icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst);
+			icmpv6_key->icmpv6_type = ntohs(output->ipv6.tp.src);
+			icmpv6_key->icmpv6_code = ntohs(output->ipv6.tp.dst);
 
 			if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION ||
 			    icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) {
@@ -1469,10 +1873,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
 				if (!nla)
 					goto nla_put_failure;
 				nd_key = nla_data(nla);
-				memcpy(nd_key->nd_target, &swkey->ipv6.nd.target,
+				memcpy(nd_key->nd_target, &output->ipv6.nd.target,
 							sizeof(nd_key->nd_target));
-				memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN);
-				memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN);
+				memcpy(nd_key->nd_sll, output->ipv6.nd.sll, ETH_ALEN);
+				memcpy(nd_key->nd_tll, output->ipv6.nd.tll, ETH_ALEN);
 			}
 		}
 	}
@@ -1504,3 +1908,84 @@ void ovs_flow_exit(void)
 {
 	kmem_cache_destroy(flow_cache);
 }
+
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void)
+{
+	struct sw_flow_mask *mask;
+
+	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
+	if (mask)
+		mask->ref_count = 0;
+
+	return mask;
+}
+
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *mask)
+{
+	mask->ref_count++;
+}
+
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
+{
+	if (!mask)
+		return;
+
+	BUG_ON(!mask->ref_count);
+	mask->ref_count--;
+
+	if (!mask->ref_count) {
+		list_del_rcu(&mask->list);
+		if (deferred)
+			kfree_rcu(mask, rcu);
+		else
+			kfree(mask);
+	}
+}
+
+static bool ovs_sw_flow_mask_equal(const struct sw_flow_mask *a,
+		const struct sw_flow_mask *b)
+{
+	u8 *a_ = (u8 *)&a->key + a->range.start;
+	u8 *b_ = (u8 *)&b->key + b->range.start;
+
+	return  (a->range.end == b->range.end)
+		&& (a->range.start == b->range.start)
+		&& (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0);
+}
+
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
+                                           const struct sw_flow_mask *mask)
+{
+	struct list_head *ml;
+
+	list_for_each(ml, tbl->mask_list) {
+		struct sw_flow_mask *m;
+		m = container_of(ml, struct sw_flow_mask, list);
+		if (ovs_sw_flow_mask_equal(mask, m))
+			return m;
+	}
+
+	return NULL;
+}
+
+/**
+ * add a new mask into the mask list.
+ * The caller needs to make sure that 'mask' is not the same
+ * as any masks that are already on the list.
+ */
+void ovs_sw_flow_mask_insert(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+	list_add_rcu(&mask->list, tbl->mask_list);
+}
+
+/**
+ * Set 'range' fields in the mask to the value of 'val'.
+ */
+static void ovs_sw_flow_mask_set(struct sw_flow_mask *mask,
+		struct sw_flow_key_range *range, u8 val)
+{
+	u8 *m = (u8 *)&mask->key + range->start;
+
+	mask->range = *range;
+	memset(m, val, ovs_sw_flow_mask_size_roundup(mask));
+}
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 66ef7220293..9674e45f696 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2013 Nicira, Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -33,6 +33,8 @@
 #include <net/inet_ecn.h>
 
 struct sk_buff;
+struct sw_flow_mask;
+struct flow_table;
 
 struct sw_flow_actions {
 	struct rcu_head rcu;
@@ -131,6 +133,8 @@ struct sw_flow {
 	u32 hash;
 
 	struct sw_flow_key key;
+	struct sw_flow_key unmasked_key;
+	struct sw_flow_mask *mask;
 	struct sw_flow_actions __rcu *sf_acts;
 
 	spinlock_t lock;	/* Lock for values below. */
@@ -140,6 +144,25 @@ struct sw_flow {
 	u8 tcp_flags;		/* Union of seen TCP flags. */
 };
 
+struct sw_flow_key_range {
+	size_t start;
+	size_t end;
+};
+
+static inline u16 ovs_sw_flow_key_range_actual_size(const struct sw_flow_key_range *range)
+{
+	return range->end - range->start;
+}
+
+struct sw_flow_match {
+	struct sw_flow_key *key;
+	struct sw_flow_key_range range;
+	struct sw_flow_mask *mask;
+};
+
+void ovs_match_init(struct sw_flow_match *match,
+		struct sw_flow_key *key, struct sw_flow_mask *mask);
+
 struct arp_eth_header {
 	__be16      ar_hrd;	/* format of hardware address   */
 	__be16      ar_pro;	/* format of protocol address   */
@@ -159,21 +182,21 @@ void ovs_flow_exit(void);
 
 struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_deferred_free(struct sw_flow *);
-void ovs_flow_free(struct sw_flow *flow);
+void ovs_flow_free(struct sw_flow *, bool deferred);
 
 struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
 void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
 
-int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
-		     int *key_lenp);
+int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *);
 void ovs_flow_used(struct sw_flow *, struct sk_buff *);
 u64 ovs_flow_used_time(unsigned long flow_jiffies);
-
-int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
-int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
+int ovs_flow_to_nlattrs(const struct sw_flow_key *,
+		const struct sw_flow_key *, struct sk_buff *);
+int ovs_match_from_nlattrs(struct sw_flow_match *match,
+		      const struct nlattr *,
 		      const struct nlattr *);
-int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
-				  const struct nlattr *attr);
+int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
+		const struct nlattr *attr);
 
 #define MAX_ACTIONS_BUFSIZE    (32 * 1024)
 #define TBL_MIN_BUCKETS		1024
@@ -182,6 +205,7 @@ struct flow_table {
 	struct flex_array *buckets;
 	unsigned int count, n_buckets;
 	struct rcu_head rcu;
+	struct list_head *mask_list;
 	int node_ver;
 	u32 hash_seed;
 	bool keep_flows;
@@ -197,22 +221,56 @@ static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table)
 	return (table->count > table->n_buckets);
 }
 
-struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
-				    struct sw_flow_key *key, int len);
-void ovs_flow_tbl_destroy(struct flow_table *table);
-void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
+struct sw_flow *ovs_flow_lookup(struct flow_table *,
+				const struct sw_flow_key *);
+struct sw_flow *ovs_flow_lookup_unmasked_key(struct flow_table *table,
+				    struct sw_flow_match *match);
+
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
 struct flow_table *ovs_flow_tbl_alloc(int new_size);
 struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
 struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
-void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
-			 struct sw_flow_key *key, int key_len);
-void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
 
-struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
+void ovs_flow_insert(struct flow_table *table, struct sw_flow *flow);
+void ovs_flow_remove(struct flow_table *table, struct sw_flow *flow);
+
+struct sw_flow *ovs_flow_dump_next(struct flow_table *table, u32 *bucket, u32 *idx);
 extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
 int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
-			 struct ovs_key_ipv4_tunnel *tun_key);
+			     struct sw_flow_match *match, bool is_mask);
 int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
-			const struct ovs_key_ipv4_tunnel *tun_key);
+			   const struct ovs_key_ipv4_tunnel *tun_key,
+			   const struct ovs_key_ipv4_tunnel *output);
+
+bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
+		const struct sw_flow_key *key, int key_len);
+
+struct sw_flow_mask {
+	int ref_count;
+	struct rcu_head rcu;
+	struct list_head list;
+	struct sw_flow_key_range range;
+	struct sw_flow_key key;
+};
+
+static inline u16
+ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask)
+{
+	return ovs_sw_flow_key_range_actual_size(&mask->range);
+}
+
+static inline u16
+ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask)
+{
+	return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32));
+}
 
+struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
+void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
+void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
+void ovs_sw_flow_mask_insert(struct flow_table *, struct sw_flow_mask *);
+struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *,
+		const struct sw_flow_mask *);
+void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
+		       const struct sw_flow_mask *mask);
 #endif /* flow.h */
-- 
cgit v1.2.3-70-g09d2


From a175a723301a8a4a9fedf9ce5b8ca586e7a97b40 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joe@wand.net.nz>
Date: Thu, 22 Aug 2013 12:30:48 -0700
Subject: openvswitch: Add SCTP support

This patch adds support for rewriting SCTP src,dst ports similar to the
functionality already available for TCP/UDP.

Rewriting SCTP ports is expensive due to double-recalculation of the
SCTP checksums; this is performed to ensure that packets traversing OVS
with invalid checksums will continue to the destination with any
checksum corruption intact.

Reviewed-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Ben Pfaff <blp@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 include/uapi/linux/openvswitch.h |  6 ++++
 net/openvswitch/Kconfig          |  1 +
 net/openvswitch/actions.c        | 39 ++++++++++++++++++++++++
 net/openvswitch/datapath.c       |  6 ++++
 net/openvswitch/flow.c           | 65 ++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/flow.h           |  8 ++---
 6 files changed, 121 insertions(+), 4 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index de1fa5d3780..a74d375b439 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -259,6 +259,7 @@ enum ovs_key_attr {
 	OVS_KEY_ATTR_ND,        /* struct ovs_key_nd */
 	OVS_KEY_ATTR_SKB_MARK,  /* u32 skb mark */
 	OVS_KEY_ATTR_TUNNEL,    /* Nested set of ovs_tunnel attributes */
+	OVS_KEY_ATTR_SCTP,      /* struct ovs_key_sctp */
 
 #ifdef __KERNEL__
 	OVS_KEY_ATTR_IPV4_TUNNEL,  /* struct ovs_key_ipv4_tunnel */
@@ -333,6 +334,11 @@ struct ovs_key_udp {
 	__be16 udp_dst;
 };
 
+struct ovs_key_sctp {
+	__be16 sctp_src;
+	__be16 sctp_dst;
+};
+
 struct ovs_key_icmp {
 	__u8 icmp_type;
 	__u8 icmp_code;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index bed30e69baa..6ecf491ad50 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -4,6 +4,7 @@
 
 config OPENVSWITCH
 	tristate "Open vSwitch"
+	select LIBCRC32C
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1f680222f4f..65cfaa81607 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -22,6 +22,7 @@
 #include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/openvswitch.h>
+#include <linux/sctp.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/in6.h>
@@ -31,6 +32,7 @@
 #include <net/ipv6.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
+#include <net/sctp/checksum.h>
 
 #include "datapath.h"
 #include "vport.h"
@@ -352,6 +354,39 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key)
 	return 0;
 }
 
+static int set_sctp(struct sk_buff *skb,
+		     const struct ovs_key_sctp *sctp_port_key)
+{
+	struct sctphdr *sh;
+	int err;
+	unsigned int sctphoff = skb_transport_offset(skb);
+
+	err = make_writable(skb, sctphoff + sizeof(struct sctphdr));
+	if (unlikely(err))
+		return err;
+
+	sh = sctp_hdr(skb);
+	if (sctp_port_key->sctp_src != sh->source ||
+	    sctp_port_key->sctp_dst != sh->dest) {
+		__le32 old_correct_csum, new_csum, old_csum;
+
+		old_csum = sh->checksum;
+		old_correct_csum = sctp_compute_cksum(skb, sctphoff);
+
+		sh->source = sctp_port_key->sctp_src;
+		sh->dest = sctp_port_key->sctp_dst;
+
+		new_csum = sctp_compute_cksum(skb, sctphoff);
+
+		/* Carry any checksum errors through. */
+		sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
+
+		skb->rxhash = 0;
+	}
+
+	return 0;
+}
+
 static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
 {
 	struct vport *vport;
@@ -461,6 +496,10 @@ static int execute_set_action(struct sk_buff *skb,
 	case OVS_KEY_ATTR_UDP:
 		err = set_udp(skb, nla_data(nested_attr));
 		break;
+
+	case OVS_KEY_ATTR_SCTP:
+		err = set_sctp(skb, nla_data(nested_attr));
+		break;
 	}
 
 	return err;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index d29cd9aa4a6..2aa13bd7f2b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -712,6 +712,12 @@ static int validate_set(const struct nlattr *a,
 
 		return validate_tp_port(flow_key);
 
+	case OVS_KEY_ATTR_SCTP:
+		if (flow_key->ip.proto != IPPROTO_SCTP)
+			return -EINVAL;
+
+		return validate_tp_port(flow_key);
+
 	default:
 		return -EINVAL;
 	}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 1fceb965359..2b4785590b5 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -34,6 +34,7 @@
 #include <linux/if_arp.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
+#include <linux/sctp.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmp.h>
@@ -129,6 +130,7 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 			| (1 << OVS_KEY_ATTR_IPV6)
 			| (1 << OVS_KEY_ATTR_TCP)
 			| (1 << OVS_KEY_ATTR_UDP)
+			| (1 << OVS_KEY_ATTR_SCTP)
 			| (1 << OVS_KEY_ATTR_ICMP)
 			| (1 << OVS_KEY_ATTR_ICMPV6)
 			| (1 << OVS_KEY_ATTR_ARP)
@@ -159,6 +161,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
 			}
 
+			if (match->key->ip.proto == IPPROTO_SCTP) {
+				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+			}
+
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
 				if (match->mask && (match->mask->key.ip.proto == 0xff))
@@ -185,6 +193,12 @@ static bool ovs_match_validate(const struct sw_flow_match *match,
 					mask_allowed |= 1 << OVS_KEY_ATTR_UDP;
 			}
 
+			if (match->key->ip.proto == IPPROTO_SCTP) {
+				key_expected |= 1 << OVS_KEY_ATTR_SCTP;
+				if (match->mask && (match->mask->key.ip.proto == 0xff))
+					mask_allowed |= 1 << OVS_KEY_ATTR_SCTP;
+			}
+
 			if (match->key->ip.proto == IPPROTO_TCP) {
 				key_expected |= 1 << OVS_KEY_ATTR_TCP;
 				if (match->mask && (match->mask->key.ip.proto == 0xff))
@@ -280,6 +294,12 @@ static bool udphdr_ok(struct sk_buff *skb)
 				  sizeof(struct udphdr));
 }
 
+static bool sctphdr_ok(struct sk_buff *skb)
+{
+	return pskb_may_pull(skb, skb_transport_offset(skb) +
+				  sizeof(struct sctphdr));
+}
+
 static bool icmphdr_ok(struct sk_buff *skb)
 {
 	return pskb_may_pull(skb, skb_transport_offset(skb) +
@@ -891,6 +911,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				key->ipv4.tp.src = udp->source;
 				key->ipv4.tp.dst = udp->dest;
 			}
+		} else if (key->ip.proto == IPPROTO_SCTP) {
+			if (sctphdr_ok(skb)) {
+				struct sctphdr *sctp = sctp_hdr(skb);
+				key->ipv4.tp.src = sctp->source;
+				key->ipv4.tp.dst = sctp->dest;
+			}
 		} else if (key->ip.proto == IPPROTO_ICMP) {
 			if (icmphdr_ok(skb)) {
 				struct icmphdr *icmp = icmp_hdr(skb);
@@ -953,6 +979,12 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key)
 				key->ipv6.tp.src = udp->source;
 				key->ipv6.tp.dst = udp->dest;
 			}
+		} else if (key->ip.proto == NEXTHDR_SCTP) {
+			if (sctphdr_ok(skb)) {
+				struct sctphdr *sctp = sctp_hdr(skb);
+				key->ipv6.tp.src = sctp->source;
+				key->ipv6.tp.dst = sctp->dest;
+			}
 		} else if (key->ip.proto == NEXTHDR_ICMP) {
 			if (icmp6hdr_ok(skb)) {
 				error = parse_icmpv6(skb, key, nh_len);
@@ -1087,6 +1119,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
 	[OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6),
 	[OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp),
 	[OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp),
+	[OVS_KEY_ATTR_SCTP] = sizeof(struct ovs_key_sctp),
 	[OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp),
 	[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
 	[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
@@ -1500,6 +1533,24 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match,  u64 attrs,
 		attrs &= ~(1 << OVS_KEY_ATTR_UDP);
 	}
 
+	if (attrs & (1 << OVS_KEY_ATTR_SCTP)) {
+		const struct ovs_key_sctp *sctp_key;
+
+		sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]);
+		if (orig_attrs & (1 << OVS_KEY_ATTR_IPV4)) {
+			SW_FLOW_KEY_PUT(match, ipv4.tp.src,
+					sctp_key->sctp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv4.tp.dst,
+					sctp_key->sctp_dst, is_mask);
+		} else {
+			SW_FLOW_KEY_PUT(match, ipv6.tp.src,
+					sctp_key->sctp_src, is_mask);
+			SW_FLOW_KEY_PUT(match, ipv6.tp.dst,
+					sctp_key->sctp_dst, is_mask);
+		}
+		attrs &= ~(1 << OVS_KEY_ATTR_SCTP);
+	}
+
 	if (attrs & (1 << OVS_KEY_ATTR_ICMP)) {
 		const struct ovs_key_icmp *icmp_key;
 
@@ -1843,6 +1894,20 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey,
 				udp_key->udp_src = output->ipv6.tp.src;
 				udp_key->udp_dst = output->ipv6.tp.dst;
 			}
+		} else if (swkey->ip.proto == IPPROTO_SCTP) {
+			struct ovs_key_sctp *sctp_key;
+
+			nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key));
+			if (!nla)
+				goto nla_put_failure;
+			sctp_key = nla_data(nla);
+			if (swkey->eth.type == htons(ETH_P_IP)) {
+				sctp_key->sctp_src = swkey->ipv4.tp.src;
+				sctp_key->sctp_dst = swkey->ipv4.tp.dst;
+			} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
+				sctp_key->sctp_src = swkey->ipv6.tp.src;
+				sctp_key->sctp_dst = swkey->ipv6.tp.dst;
+			}
 		} else if (swkey->eth.type == htons(ETH_P_IP) &&
 			   swkey->ip.proto == IPPROTO_ICMP) {
 			struct ovs_key_icmp *icmp_key;
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 9674e45f696..d08dcf78dbf 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -99,8 +99,8 @@ struct sw_flow_key {
 			} addr;
 			union {
 				struct {
-					__be16 src;		/* TCP/UDP source port. */
-					__be16 dst;		/* TCP/UDP destination port. */
+					__be16 src;		/* TCP/UDP/SCTP source port. */
+					__be16 dst;		/* TCP/UDP/SCTP destination port. */
 				} tp;
 				struct {
 					u8 sha[ETH_ALEN];	/* ARP source hardware address. */
@@ -115,8 +115,8 @@ struct sw_flow_key {
 			} addr;
 			__be32 label;			/* IPv6 flow label. */
 			struct {
-				__be16 src;		/* TCP/UDP source port. */
-				__be16 dst;		/* TCP/UDP destination port. */
+				__be16 src;		/* TCP/UDP/SCTP source port. */
+				__be16 dst;		/* TCP/UDP/SCTP destination port. */
 			} tp;
 			struct {
 				struct in6_addr target;	/* ND target address. */
-- 
cgit v1.2.3-70-g09d2


From 41d73ec053d2424599c4ed8452b889374d523ade Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:12 +0200
Subject: netfilter: nf_conntrack: make sequence number adjustments usuable
 without NAT

Split out sequence number adjustments from NAT and move them to the conntrack
core to make them usable for SYN proxying. The sequence number adjustment
information is moved to a seperate extend. The extend is added to new
conntracks when a NAT mapping is set up for a connection using a helper.

As a side effect, this saves 24 bytes per connection with NAT in the common
case that a connection does not have a helper assigned.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                          |   9 +-
 include/net/netfilter/nf_conntrack_extend.h        |   2 +
 include/net/netfilter/nf_conntrack_seqadj.h        |  49 +++++
 include/net/netfilter/nf_nat.h                     |  10 -
 include/net/netfilter/nf_nat_helper.h              |  19 --
 include/uapi/linux/netfilter/nf_conntrack_common.h |   3 +-
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |  15 +-
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c     |   7 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c     |   7 +-
 net/netfilter/Makefile                             |   2 +-
 net/netfilter/nf_conntrack_core.c                  |  16 +-
 net/netfilter/nf_conntrack_netlink.c               | 115 +++++------
 net/netfilter/nf_conntrack_proto_tcp.c             |  18 +-
 net/netfilter/nf_conntrack_seqadj.c                | 218 ++++++++++++++++++++
 net/netfilter/nf_nat_core.c                        |  16 +-
 net/netfilter/nf_nat_helper.c                      | 228 +--------------------
 net/netfilter/nf_nat_sip.c                         |   3 +-
 net/netfilter/nfnetlink_queue_ct.c                 |   8 +-
 18 files changed, 369 insertions(+), 376 deletions(-)
 create mode 100644 include/net/netfilter/nf_conntrack_seqadj.h
 create mode 100644 net/netfilter/nf_conntrack_seqadj.c

(limited to 'include/uapi')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e2cf786be22..708fe72ab91 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -319,6 +319,7 @@ extern void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
 
 struct nf_conn;
+enum ip_conntrack_info;
 struct nlattr;
 
 struct nfq_ct_hook {
@@ -327,14 +328,10 @@ struct nfq_ct_hook {
 	int (*parse)(const struct nlattr *attr, struct nf_conn *ct);
 	int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct,
 			     u32 portid, u32 report);
-};
-extern struct nfq_ct_hook __rcu *nfq_ct_hook;
-
-struct nfq_ct_nat_hook {
 	void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, s32 off);
+			   enum ip_conntrack_info ctinfo, s32 off);
 };
-extern struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook;
+extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 977bc8a4644..2a22bcbfe6e 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -9,6 +9,7 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_HELPER,
 #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	NF_CT_EXT_NAT,
+	NF_CT_EXT_SEQADJ,
 #endif
 	NF_CT_EXT_ACCT,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -31,6 +32,7 @@ enum nf_ct_ext_id {
 
 #define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
 #define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
+#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
 #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
 #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
 #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h
new file mode 100644
index 00000000000..30bfbbed9f4
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_seqadj.h
@@ -0,0 +1,49 @@
+#ifndef _NF_CONNTRACK_SEQADJ_H
+#define _NF_CONNTRACK_SEQADJ_H
+
+#include <net/netfilter/nf_conntrack_extend.h>
+
+/**
+ * struct nf_ct_seqadj - sequence number adjustment information
+ *
+ * @correction_pos: position of the last TCP sequence number modification
+ * @offset_before: sequence number offset before last modification
+ * @offset_after: sequence number offset after last modification
+ */
+struct nf_ct_seqadj {
+	u32		correction_pos;
+	s32		offset_before;
+	s32		offset_after;
+};
+
+struct nf_conn_seqadj {
+	struct nf_ct_seqadj	seq[IP_CT_DIR_MAX];
+};
+
+static inline struct nf_conn_seqadj *nfct_seqadj(const struct nf_conn *ct)
+{
+	return nf_ct_ext_find(ct, NF_CT_EXT_SEQADJ);
+}
+
+static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct)
+{
+	return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC);
+}
+
+extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			    __be32 seq, s32 off);
+extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+				 struct nf_conn *ct,
+				 enum ip_conntrack_info ctinfo,
+				 s32 off);
+
+extern int nf_ct_seq_adjust(struct sk_buff *skb,
+			    struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			    unsigned int protoff);
+extern s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir,
+			    u32 seq);
+
+extern int nf_conntrack_seqadj_init(void);
+extern void nf_conntrack_seqadj_fini(void);
+
+#endif /* _NF_CONNTRACK_SEQADJ_H */
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index e2441413675..59a19242005 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -13,15 +13,6 @@ enum nf_nat_manip_type {
 #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \
 			     (hooknum) != NF_INET_LOCAL_IN)
 
-/* NAT sequence number modifications */
-struct nf_nat_seq {
-	/* position of the last TCP sequence number modification (if any) */
-	u_int32_t correction_pos;
-
-	/* sequence number offset before and after last modification */
-	int32_t offset_before, offset_after;
-};
-
 #include <linux/list.h>
 #include <linux/netfilter/nf_conntrack_pptp.h>
 #include <net/netfilter/nf_conntrack_extend.h>
@@ -39,7 +30,6 @@ struct nf_conn;
 /* The structure embedded in the conntrack structure. */
 struct nf_conn_nat {
 	struct hlist_node bysource;
-	struct nf_nat_seq seq[IP_CT_DIR_MAX];
 	struct nf_conn *ct;
 	union nf_conntrack_nat_help help;
 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h
index 194c3479492..404324d1d0c 100644
--- a/include/net/netfilter/nf_nat_helper.h
+++ b/include/net/netfilter/nf_nat_helper.h
@@ -39,28 +39,9 @@ extern int nf_nat_mangle_udp_packet(struct sk_buff *skb,
 				    const char *rep_buffer,
 				    unsigned int rep_len);
 
-extern void nf_nat_set_seq_adjust(struct nf_conn *ct,
-				  enum ip_conntrack_info ctinfo,
-				  __be32 seq, s32 off);
-extern int nf_nat_seq_adjust(struct sk_buff *skb,
-			     struct nf_conn *ct,
-			     enum ip_conntrack_info ctinfo,
-			     unsigned int protoff);
-extern int (*nf_nat_seq_adjust_hook)(struct sk_buff *skb,
-				     struct nf_conn *ct,
-				     enum ip_conntrack_info ctinfo,
-				     unsigned int protoff);
-
 /* Setup NAT on this expected conntrack so it follows master, but goes
  * to port ct->master->saved_proto. */
 extern void nf_nat_follow_master(struct nf_conn *ct,
 				 struct nf_conntrack_expect *this);
 
-extern s32 nf_nat_get_offset(const struct nf_conn *ct,
-			     enum ip_conntrack_dir dir,
-			     u32 seq);
-
-extern void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-				  u32 dir, s32 off);
-
 #endif
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index d69483fb382..8dd803818eb 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -99,7 +99,8 @@ enum ip_conntrack_events {
 	IPCT_PROTOINFO,		/* protocol information has changed */
 	IPCT_HELPER,		/* new helper has been set */
 	IPCT_MARK,		/* new mark has been set */
-	IPCT_NATSEQADJ,		/* NAT is doing sequence adjustment */
+	IPCT_SEQADJ,		/* sequence adjustment has changed */
+	IPCT_NATSEQADJ = IPCT_SEQADJ,
 	IPCT_SECMARK,		/* new security mark has been set */
 	IPCT_LABEL,		/* new connlabel has been set */
 };
diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
index 08fabc6c93f..acad6c52a65 100644
--- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h
@@ -42,8 +42,10 @@ enum ctattr_type {
 	CTA_ID,
 	CTA_NAT_DST,
 	CTA_TUPLE_MASTER,
-	CTA_NAT_SEQ_ADJ_ORIG,
-	CTA_NAT_SEQ_ADJ_REPLY,
+	CTA_SEQ_ADJ_ORIG,
+	CTA_NAT_SEQ_ADJ_ORIG	= CTA_SEQ_ADJ_ORIG,
+	CTA_SEQ_ADJ_REPLY,
+	CTA_NAT_SEQ_ADJ_REPLY	= CTA_SEQ_ADJ_REPLY,
 	CTA_SECMARK,		/* obsolete */
 	CTA_ZONE,
 	CTA_SECCTX,
@@ -165,6 +167,15 @@ enum ctattr_protonat {
 };
 #define CTA_PROTONAT_MAX (__CTA_PROTONAT_MAX - 1)
 
+enum ctattr_seqadj {
+	CTA_SEQADJ_UNSPEC,
+	CTA_SEQADJ_CORRECTION_POS,
+	CTA_SEQADJ_OFFSET_BEFORE,
+	CTA_SEQADJ_OFFSET_AFTER,
+	__CTA_SEQADJ_MAX
+};
+#define CTA_SEQADJ_MAX (__CTA_SEQADJ_MAX - 1)
+
 enum ctattr_natseq {
 	CTA_NAT_SEQ_UNSPEC,
 	CTA_NAT_SEQ_CORRECTION_POS,
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 0a2e0e3e95b..86f5b34a4ed 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
@@ -136,11 +137,7 @@ static unsigned int ipv4_confirm(unsigned int hooknum,
 	/* adjust seqs for loopback traffic only in outgoing direction */
 	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
 	    !nf_is_loopback_packet(skb)) {
-		typeof(nf_nat_seq_adjust_hook) seq_adjust;
-
-		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust ||
-		    !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index c9b6a6e6a1e..d6e4dd8b58d 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -28,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
@@ -158,11 +159,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
 	/* adjust seqs for loopback traffic only in outgoing direction */
 	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
 	    !nf_is_loopback_packet(skb)) {
-		typeof(nf_nat_seq_adjust_hook) seq_adjust;
-
-		seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
-		if (!seq_adjust ||
-		    !seq_adjust(skb, ct, ctinfo, protoff)) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
 			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
 			return NF_DROP;
 		}
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ebfa7dc747c..89a9c1658f5 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,6 @@
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
 
-nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
+nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index da6f1787a10..00a7a94d413 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -39,6 +39,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
@@ -1326,6 +1327,7 @@ void nf_conntrack_cleanup_end(void)
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
 #endif
 	nf_conntrack_proto_fini();
+	nf_conntrack_seqadj_fini();
 	nf_conntrack_labels_fini();
 	nf_conntrack_helper_fini();
 	nf_conntrack_timeout_fini();
@@ -1531,6 +1533,10 @@ int nf_conntrack_init_start(void)
 	if (ret < 0)
 		goto err_labels;
 
+	ret = nf_conntrack_seqadj_init();
+	if (ret < 0)
+		goto err_seqadj;
+
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	ret = nf_ct_extend_register(&nf_ct_zone_extend);
 	if (ret < 0)
@@ -1555,6 +1561,8 @@ err_proto:
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
 err_extend:
 #endif
+	nf_conntrack_seqadj_fini();
+err_seqadj:
 	nf_conntrack_labels_fini();
 err_labels:
 	nf_conntrack_helper_fini();
@@ -1577,9 +1585,6 @@ void nf_conntrack_init_end(void)
 	/* For use by REJECT target */
 	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
 	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
-
-	/* Howto get NAT offsets */
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
 }
 
 /*
@@ -1666,8 +1671,3 @@ err_slabname:
 err_stat:
 	return ret;
 }
-
-s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
-			enum ip_conntrack_dir dir,
-			u32 seq);
-EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index fa61fea6323..7c55745ecec 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -37,6 +37,7 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
@@ -381,9 +382,8 @@ nla_put_failure:
 	return -1;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
 static int
-dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
+dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type)
 {
 	struct nlattr *nest_parms;
 
@@ -391,12 +391,12 @@ dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type)
 	if (!nest_parms)
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_NAT_SEQ_CORRECTION_POS,
-			 htonl(natseq->correction_pos)) ||
-	    nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_BEFORE,
-			 htonl(natseq->offset_before)) ||
-	    nla_put_be32(skb, CTA_NAT_SEQ_OFFSET_AFTER,
-			 htonl(natseq->offset_after)))
+	if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
+			 htonl(seq->correction_pos)) ||
+	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
+			 htonl(seq->offset_before)) ||
+	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
+			 htonl(seq->offset_after)))
 		goto nla_put_failure;
 
 	nla_nest_end(skb, nest_parms);
@@ -408,27 +408,24 @@ nla_put_failure:
 }
 
 static inline int
-ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct)
 {
-	struct nf_nat_seq *natseq;
-	struct nf_conn_nat *nat = nfct_nat(ct);
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *seq;
 
-	if (!(ct->status & IPS_SEQ_ADJUST) || !nat)
+	if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
 		return 0;
 
-	natseq = &nat->seq[IP_CT_DIR_ORIGINAL];
-	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1)
+	seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
+	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
 		return -1;
 
-	natseq = &nat->seq[IP_CT_DIR_REPLY];
-	if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1)
+	seq = &seqadj->seq[IP_CT_DIR_REPLY];
+	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
 		return -1;
 
 	return 0;
 }
-#else
-#define ctnetlink_dump_nat_seq_adj(a, b) (0)
-#endif
 
 static inline int
 ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
@@ -502,7 +499,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
-	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -707,8 +704,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		    ctnetlink_dump_master(skb, ct) < 0)
 			goto nla_put_failure;
 
-		if (events & (1 << IPCT_NATSEQADJ) &&
-		    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+		if (events & (1 << IPCT_SEQADJ) &&
+		    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 			goto nla_put_failure;
 	}
 
@@ -1439,66 +1436,65 @@ ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]
 	return err;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = {
-	[CTA_NAT_SEQ_CORRECTION_POS]	= { .type = NLA_U32 },
-	[CTA_NAT_SEQ_OFFSET_BEFORE]	= { .type = NLA_U32 },
-	[CTA_NAT_SEQ_OFFSET_AFTER]	= { .type = NLA_U32 },
+static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = {
+	[CTA_SEQADJ_CORRECTION_POS]	= { .type = NLA_U32 },
+	[CTA_SEQADJ_OFFSET_BEFORE]	= { .type = NLA_U32 },
+	[CTA_SEQADJ_OFFSET_AFTER]	= { .type = NLA_U32 },
 };
 
 static inline int
-change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr)
+change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr)
 {
 	int err;
-	struct nlattr *cda[CTA_NAT_SEQ_MAX+1];
+	struct nlattr *cda[CTA_SEQADJ_MAX+1];
 
-	err = nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy);
+	err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy);
 	if (err < 0)
 		return err;
 
-	if (!cda[CTA_NAT_SEQ_CORRECTION_POS])
+	if (!cda[CTA_SEQADJ_CORRECTION_POS])
 		return -EINVAL;
 
-	natseq->correction_pos =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS]));
+	seq->correction_pos =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS]));
 
-	if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE])
+	if (!cda[CTA_SEQADJ_OFFSET_BEFORE])
 		return -EINVAL;
 
-	natseq->offset_before =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE]));
+	seq->offset_before =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE]));
 
-	if (!cda[CTA_NAT_SEQ_OFFSET_AFTER])
+	if (!cda[CTA_SEQADJ_OFFSET_AFTER])
 		return -EINVAL;
 
-	natseq->offset_after =
-		ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER]));
+	seq->offset_after =
+		ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER]));
 
 	return 0;
 }
 
 static int
-ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
-			     const struct nlattr * const cda[])
+ctnetlink_change_seq_adj(struct nf_conn *ct,
+			 const struct nlattr * const cda[])
 {
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
 	int ret = 0;
-	struct nf_conn_nat *nat = nfct_nat(ct);
 
-	if (!nat)
+	if (!seqadj)
 		return 0;
 
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG]) {
-		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL],
-					 cda[CTA_NAT_SEQ_ADJ_ORIG]);
+	if (cda[CTA_SEQ_ADJ_ORIG]) {
+		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
+				     cda[CTA_SEQ_ADJ_ORIG]);
 		if (ret < 0)
 			return ret;
 
 		ct->status |= IPS_SEQ_ADJUST;
 	}
 
-	if (cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY],
-					 cda[CTA_NAT_SEQ_ADJ_REPLY]);
+	if (cda[CTA_SEQ_ADJ_REPLY]) {
+		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
+				     cda[CTA_SEQ_ADJ_REPLY]);
 		if (ret < 0)
 			return ret;
 
@@ -1507,7 +1503,6 @@ ctnetlink_change_nat_seq_adj(struct nf_conn *ct,
 
 	return 0;
 }
-#endif
 
 static int
 ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
@@ -1573,13 +1568,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
 #endif
 
-#ifdef CONFIG_NF_NAT_NEEDED
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		err = ctnetlink_change_nat_seq_adj(ct, cda);
+	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+		err = ctnetlink_change_seq_adj(ct, cda);
 		if (err < 0)
 			return err;
 	}
-#endif
+
 	if (cda[CTA_LABELS]) {
 		err = ctnetlink_attach_labels(ct, cda);
 		if (err < 0)
@@ -1684,13 +1678,11 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
 			goto err2;
 	}
 
-#ifdef CONFIG_NF_NAT_NEEDED
-	if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) {
-		err = ctnetlink_change_nat_seq_adj(ct, cda);
+	if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) {
+		err = ctnetlink_change_seq_adj(ct, cda);
 		if (err < 0)
 			goto err2;
 	}
-#endif
 
 	memset(&ct->proto, 0, sizeof(ct->proto));
 	if (cda[CTA_PROTOINFO]) {
@@ -1804,7 +1796,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 						      (1 << IPCT_ASSURED) |
 						      (1 << IPCT_HELPER) |
 						      (1 << IPCT_PROTOINFO) |
-						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_SEQADJ) |
 						      (1 << IPCT_MARK) | events,
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
@@ -1827,7 +1819,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 						      (1 << IPCT_HELPER) |
 						      (1 << IPCT_LABEL) |
 						      (1 << IPCT_PROTOINFO) |
-						      (1 << IPCT_NATSEQADJ) |
+						      (1 << IPCT_SEQADJ) |
 						      (1 << IPCT_MARK),
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
@@ -2082,7 +2074,7 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct)
 		goto nla_put_failure;
 
 	if ((ct->status & IPS_SEQ_ADJUST) &&
-	    ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
+	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 		goto nla_put_failure;
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
@@ -2211,6 +2203,7 @@ static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
 	.build		= ctnetlink_nfqueue_build,
 	.parse		= ctnetlink_nfqueue_parse,
 	.attach_expect	= ctnetlink_nfqueue_attach_expect,
+	.seq_adjust	= nf_ct_tcp_seqadj_set,
 };
 #endif /* CONFIG_NETFILTER_NETLINK_QUEUE_CT */
 
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index d224e001f14..984a8d1a335 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -27,6 +27,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -495,21 +496,6 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
 	}
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-static inline s32 nat_offset(const struct nf_conn *ct,
-			     enum ip_conntrack_dir dir,
-			     u32 seq)
-{
-	typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset);
-
-	return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
-}
-#define NAT_OFFSET(ct, dir, seq) \
-	(nat_offset(ct, dir, seq))
-#else
-#define NAT_OFFSET(ct, dir, seq)	0
-#endif
-
 static bool tcp_in_window(const struct nf_conn *ct,
 			  struct ip_ct_tcp *state,
 			  enum ip_conntrack_dir dir,
@@ -540,7 +526,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
 		tcp_sack(skb, dataoff, tcph, &sack);
 
 	/* Take into account NAT sequence number mangling */
-	receiver_offset = NAT_OFFSET(ct, !dir, ack - 1);
+	receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
 	ack -= receiver_offset;
 	sack -= receiver_offset;
 
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
new file mode 100644
index 00000000000..483eb9ce321
--- /dev/null
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -0,0 +1,218 @@
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+
+int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		     __be32 seq, s32 off)
+{
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_ct_seqadj *this_way;
+
+	if (off == 0)
+		return 0;
+
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+	spin_lock_bh(&ct->lock);
+	this_way = &seqadj->seq[dir];
+	if (this_way->offset_before == this_way->offset_after ||
+	    before(this_way->correction_pos, seq)) {
+		this_way->correction_pos = seq;
+		this_way->offset_before	 = this_way->offset_after;
+		this_way->offset_after	+= off;
+	}
+	spin_unlock_bh(&ct->lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_set);
+
+void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
+			  struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			  s32 off)
+{
+	const struct tcphdr *th;
+
+	if (nf_ct_protonum(ct) != IPPROTO_TCP)
+		return;
+
+	th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb));
+	nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
+}
+EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set);
+
+/* Adjust one found SACK option including checksum correction */
+static void nf_ct_sack_block_adjust(struct sk_buff *skb,
+				    struct tcphdr *tcph,
+				    unsigned int sackoff,
+				    unsigned int sackend,
+				    struct nf_ct_seqadj *seq)
+{
+	while (sackoff < sackend) {
+		struct tcp_sack_block_wire *sack;
+		__be32 new_start_seq, new_end_seq;
+
+		sack = (void *)skb->data + sackoff;
+		if (after(ntohl(sack->start_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					seq->offset_after);
+		else
+			new_start_seq = htonl(ntohl(sack->start_seq) -
+					seq->offset_before);
+
+		if (after(ntohl(sack->end_seq) - seq->offset_before,
+			  seq->correction_pos))
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+				      seq->offset_after);
+		else
+			new_end_seq = htonl(ntohl(sack->end_seq) -
+				      seq->offset_before);
+
+		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+			 ntohl(sack->start_seq), new_start_seq,
+			 ntohl(sack->end_seq), new_end_seq);
+
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->start_seq, new_start_seq, 0);
+		inet_proto_csum_replace4(&tcph->check, skb,
+					 sack->end_seq, new_end_seq, 0);
+		sack->start_seq = new_start_seq;
+		sack->end_seq = new_end_seq;
+		sackoff += sizeof(*sack);
+	}
+}
+
+/* TCP SACK sequence number adjustment */
+static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
+				      unsigned int protoff,
+				      struct tcphdr *tcph,
+				      struct nf_conn *ct,
+				      enum ip_conntrack_info ctinfo)
+{
+	unsigned int dir, optoff, optend;
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+
+	optoff = protoff + sizeof(struct tcphdr);
+	optend = protoff + tcph->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	dir = CTINFO2DIR(ctinfo);
+
+	while (optoff < optend) {
+		/* Usually: option, length. */
+		unsigned char *op = skb->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			/* no partial options */
+			if (optoff + 1 == optend ||
+			    optoff + op[1] > optend ||
+			    op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_SACK &&
+			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
+			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+				nf_ct_sack_block_adjust(skb, tcph, optoff + 2,
+							optoff+op[1],
+							&seqadj->seq[!dir]);
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
+int nf_ct_seq_adjust(struct sk_buff *skb,
+		     struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		     unsigned int protoff)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct tcphdr *tcph;
+	__be32 newseq, newack;
+	s32 seqoff, ackoff;
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *this_way, *other_way;
+	int res;
+
+	this_way  = &seqadj->seq[dir];
+	other_way = &seqadj->seq[!dir];
+
+	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
+		return 0;
+
+	tcph = (void *)skb->data + protoff;
+	spin_lock_bh(&ct->lock);
+	if (after(ntohl(tcph->seq), this_way->correction_pos))
+		seqoff = this_way->offset_after;
+	else
+		seqoff = this_way->offset_before;
+
+	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+		  other_way->correction_pos))
+		ackoff = other_way->offset_after;
+	else
+		ackoff = other_way->offset_before;
+
+	newseq = htonl(ntohl(tcph->seq) + seqoff);
+	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
+
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
+	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
+
+	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+		 ntohl(newack));
+
+	tcph->seq = newseq;
+	tcph->ack_seq = newack;
+
+	res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo);
+	spin_unlock_bh(&ct->lock);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_adjust);
+
+s32 nf_ct_seq_offset(const struct nf_conn *ct,
+		     enum ip_conntrack_dir dir,
+		     u32 seq)
+{
+	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
+	struct nf_ct_seqadj *this_way;
+
+	if (!seqadj)
+		return 0;
+
+	this_way = &seqadj->seq[dir];
+	return after(seq, this_way->correction_pos) ?
+		 this_way->offset_after : this_way->offset_before;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seq_offset);
+
+static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_seqadj),
+	.align	= __alignof__(struct nf_conn_seqadj),
+	.id	= NF_CT_EXT_SEQADJ,
+};
+
+int nf_conntrack_seqadj_init(void)
+{
+	return nf_ct_extend_register(&nf_ct_seqadj_extend);
+}
+
+void nf_conntrack_seqadj_fini(void)
+{
+	nf_ct_extend_unregister(&nf_ct_seqadj_extend);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6ff808375b5..6f0f4f7f68a 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -25,6 +25,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
@@ -402,6 +403,9 @@ nf_nat_setup_info(struct nf_conn *ct,
 			ct->status |= IPS_SRC_NAT;
 		else
 			ct->status |= IPS_DST_NAT;
+
+		if (nfct_help(ct))
+			nfct_seqadj_ext_add(ct);
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
@@ -764,10 +768,6 @@ static struct nf_ct_helper_expectfn follow_master_nat = {
 	.expectfn	= nf_nat_follow_master,
 };
 
-static struct nfq_ct_nat_hook nfq_ct_nat = {
-	.seq_adjust	= nf_nat_tcp_seq_adjust,
-};
-
 static int __init nf_nat_init(void)
 {
 	int ret;
@@ -787,14 +787,9 @@ static int __init nf_nat_init(void)
 	/* Initialize fake conntrack so that NAT will skip it */
 	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
 
-	BUG_ON(nf_nat_seq_adjust_hook != NULL);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, nf_nat_seq_adjust);
 	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
 			   nfnetlink_parse_nat_setup);
-	BUG_ON(nf_ct_nat_offset != NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, nf_nat_get_offset);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, &nfq_ct_nat);
 #ifdef CONFIG_XFRM
 	BUG_ON(nf_nat_decode_session_hook != NULL);
 	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
@@ -813,10 +808,7 @@ static void __exit nf_nat_cleanup(void)
 	unregister_pernet_subsys(&nf_nat_net_ops);
 	nf_ct_extend_unregister(&nat_extend);
 	nf_ct_helper_expectfn_unregister(&follow_master_nat);
-	RCU_INIT_POINTER(nf_nat_seq_adjust_hook, NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
-	RCU_INIT_POINTER(nf_ct_nat_offset, NULL);
-	RCU_INIT_POINTER(nfq_ct_nat_hook, NULL);
 #ifdef CONFIG_XFRM
 	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
 #endif
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 46b9baa845a..2840abb5bb9 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -20,67 +20,13 @@
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/netfilter/nf_nat_l4proto.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 
-#define DUMP_OFFSET(x) \
-	pr_debug("offset_before=%d, offset_after=%d, correction_pos=%u\n", \
-		 x->offset_before, x->offset_after, x->correction_pos);
-
-/* Setup TCP sequence correction given this change at this sequence */
-static inline void
-adjust_tcp_sequence(u32 seq,
-		    int sizediff,
-		    struct nf_conn *ct,
-		    enum ip_conntrack_info ctinfo)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way = &nat->seq[dir];
-
-	pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
-		 seq, sizediff);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset before: ");
-	DUMP_OFFSET(this_way);
-
-	spin_lock_bh(&ct->lock);
-
-	/* SYN adjust. If it's uninitialized, or this is after last
-	 * correction, record it: we don't handle more than one
-	 * adjustment in the window, but do deal with common case of a
-	 * retransmit */
-	if (this_way->offset_before == this_way->offset_after ||
-	    before(this_way->correction_pos, seq)) {
-		this_way->correction_pos = seq;
-		this_way->offset_before = this_way->offset_after;
-		this_way->offset_after += sizediff;
-	}
-	spin_unlock_bh(&ct->lock);
-
-	pr_debug("adjust_tcp_sequence: Seq_offset after: ");
-	DUMP_OFFSET(this_way);
-}
-
-/* Get the offset value, for conntrack. Caller must have the conntrack locked */
-s32 nf_nat_get_offset(const struct nf_conn *ct,
-		      enum ip_conntrack_dir dir,
-		      u32 seq)
-{
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way;
-
-	if (!nat)
-		return 0;
-
-	this_way = &nat->seq[dir];
-	return after(seq, this_way->correction_pos)
-		 ? this_way->offset_after : this_way->offset_before;
-}
-
 /* Frobs data inside this packet, which is linear. */
 static void mangle_contents(struct sk_buff *skb,
 			    unsigned int dataoff,
@@ -135,30 +81,6 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
 	return 1;
 }
 
-void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
-			   __be32 seq, s32 off)
-{
-	if (!off)
-		return;
-	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-	adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
-	nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
-}
-EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
-
-void nf_nat_tcp_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
-			   u32 ctinfo, int off)
-{
-	const struct tcphdr *th;
-
-	if (nf_ct_protonum(ct) != IPPROTO_TCP)
-		return;
-
-	th = (struct tcphdr *)(skb_network_header(skb)+ ip_hdrlen(skb));
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
-}
-EXPORT_SYMBOL_GPL(nf_nat_tcp_seq_adjust);
-
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -203,8 +125,8 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			     datalen, oldlen);
 
 	if (adjust && rep_len != match_len)
-		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
-				      (int)rep_len - (int)match_len);
+		nf_ct_seqadj_set(ct, ctinfo, tcph->seq,
+				 (int)rep_len - (int)match_len);
 
 	return 1;
 }
@@ -264,150 +186,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(nf_nat_mangle_udp_packet);
 
-/* Adjust one found SACK option including checksum correction */
-static void
-sack_adjust(struct sk_buff *skb,
-	    struct tcphdr *tcph,
-	    unsigned int sackoff,
-	    unsigned int sackend,
-	    struct nf_nat_seq *natseq)
-{
-	while (sackoff < sackend) {
-		struct tcp_sack_block_wire *sack;
-		__be32 new_start_seq, new_end_seq;
-
-		sack = (void *)skb->data + sackoff;
-		if (after(ntohl(sack->start_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_after);
-		else
-			new_start_seq = htonl(ntohl(sack->start_seq)
-					- natseq->offset_before);
-
-		if (after(ntohl(sack->end_seq) - natseq->offset_before,
-			  natseq->correction_pos))
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_after);
-		else
-			new_end_seq = htonl(ntohl(sack->end_seq)
-				      - natseq->offset_before);
-
-		pr_debug("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
-			 ntohl(sack->start_seq), new_start_seq,
-			 ntohl(sack->end_seq), new_end_seq);
-
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->start_seq, new_start_seq, 0);
-		inet_proto_csum_replace4(&tcph->check, skb,
-					 sack->end_seq, new_end_seq, 0);
-		sack->start_seq = new_start_seq;
-		sack->end_seq = new_end_seq;
-		sackoff += sizeof(*sack);
-	}
-}
-
-/* TCP SACK sequence number adjustment */
-static inline unsigned int
-nf_nat_sack_adjust(struct sk_buff *skb,
-		   unsigned int protoff,
-		   struct tcphdr *tcph,
-		   struct nf_conn *ct,
-		   enum ip_conntrack_info ctinfo)
-{
-	unsigned int dir, optoff, optend;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-
-	optoff = protoff + sizeof(struct tcphdr);
-	optend = protoff + tcph->doff * 4;
-
-	if (!skb_make_writable(skb, optend))
-		return 0;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	while (optoff < optend) {
-		/* Usually: option, length. */
-		unsigned char *op = skb->data + optoff;
-
-		switch (op[0]) {
-		case TCPOPT_EOL:
-			return 1;
-		case TCPOPT_NOP:
-			optoff++;
-			continue;
-		default:
-			/* no partial options */
-			if (optoff + 1 == optend ||
-			    optoff + op[1] > optend ||
-			    op[1] < 2)
-				return 0;
-			if (op[0] == TCPOPT_SACK &&
-			    op[1] >= 2+TCPOLEN_SACK_PERBLOCK &&
-			    ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
-				sack_adjust(skb, tcph, optoff+2,
-					    optoff+op[1], &nat->seq[!dir]);
-			optoff += op[1];
-		}
-	}
-	return 1;
-}
-
-/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
-int
-nf_nat_seq_adjust(struct sk_buff *skb,
-		  struct nf_conn *ct,
-		  enum ip_conntrack_info ctinfo,
-		  unsigned int protoff)
-{
-	struct tcphdr *tcph;
-	int dir;
-	__be32 newseq, newack;
-	s32 seqoff, ackoff;
-	struct nf_conn_nat *nat = nfct_nat(ct);
-	struct nf_nat_seq *this_way, *other_way;
-	int res;
-
-	dir = CTINFO2DIR(ctinfo);
-
-	this_way = &nat->seq[dir];
-	other_way = &nat->seq[!dir];
-
-	if (!skb_make_writable(skb, protoff + sizeof(*tcph)))
-		return 0;
-
-	tcph = (void *)skb->data + protoff;
-	spin_lock_bh(&ct->lock);
-	if (after(ntohl(tcph->seq), this_way->correction_pos))
-		seqoff = this_way->offset_after;
-	else
-		seqoff = this_way->offset_before;
-
-	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
-		  other_way->correction_pos))
-		ackoff = other_way->offset_after;
-	else
-		ackoff = other_way->offset_before;
-
-	newseq = htonl(ntohl(tcph->seq) + seqoff);
-	newack = htonl(ntohl(tcph->ack_seq) - ackoff);
-
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0);
-	inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0);
-
-	pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n",
-		 ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
-		 ntohl(newack));
-
-	tcph->seq = newseq;
-	tcph->ack_seq = newack;
-
-	res = nf_nat_sack_adjust(skb, protoff, tcph, ct, ctinfo);
-	spin_unlock_bh(&ct->lock);
-
-	return res;
-}
-
 /* Setup NAT on this expected conntrack so it follows master. */
 /* If we fail to get a free NAT slot, we'll get dropped on confirm */
 void nf_nat_follow_master(struct nf_conn *ct,
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index dac11f73868..f9790405b7f 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -20,6 +20,7 @@
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <linux/netfilter/nf_conntrack_sip.h>
 
 MODULE_LICENSE("GPL");
@@ -308,7 +309,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff,
 		return;
 
 	th = (struct tcphdr *)(skb->data + protoff);
-	nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+	nf_ct_seqadj_set(ct, ctinfo, th->seq, off);
 }
 
 /* Handles expected signalling connections and media streams */
diff --git a/net/netfilter/nfnetlink_queue_ct.c b/net/netfilter/nfnetlink_queue_ct.c
index be893039966..96cac50e0d1 100644
--- a/net/netfilter/nfnetlink_queue_ct.c
+++ b/net/netfilter/nfnetlink_queue_ct.c
@@ -87,14 +87,14 @@ nla_put_failure:
 void nfqnl_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct,
 			 enum ip_conntrack_info ctinfo, int diff)
 {
-	struct nfq_ct_nat_hook *nfq_nat_ct;
+	struct nfq_ct_hook *nfq_ct;
 
-	nfq_nat_ct = rcu_dereference(nfq_ct_nat_hook);
-	if (nfq_nat_ct == NULL)
+	nfq_ct = rcu_dereference(nfq_ct_hook);
+	if (nfq_ct == NULL)
 		return;
 
 	if ((ct->status & IPS_NAT_MASK) && diff)
-		nfq_nat_ct->seq_adjust(skb, ct, ctinfo, diff);
+		nfq_ct->seq_adjust(skb, ct, ctinfo, diff);
 }
 
 int nfqnl_attach_expect(struct nf_conn *ct, const struct nlattr *attr,
-- 
cgit v1.2.3-70-g09d2


From 48b1de4c110a7afa4b85862f6c75af817db26fad Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 27 Aug 2013 08:50:14 +0200
Subject: netfilter: add SYNPROXY core/target

Add a SYNPROXY for netfilter. The code is split into two parts, the synproxy
core with common functions and an address family specific target.

The SYNPROXY receives the connection request from the client, responds with
a SYN/ACK containing a SYN cookie and announcing a zero window and checks
whether the final ACK from the client contains a valid cookie.

It then establishes a connection to the original destination and, if
successful, sends a window update to the client with the window size
announced by the server.

Support for timestamps, SACK, window scaling and MSS options can be
statically configured as target parameters if the features of the server
are known. If timestamps are used, the timestamp value sent back to
the client in the SYN/ACK will be different from the real timestamp of
the server. In order to now break PAWS, the timestamps are translated in
the direction server->client.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Martin Topholm <mph@one.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h   |   6 +-
 include/net/netfilter/nf_conntrack_seqadj.h   |   2 +
 include/net/netfilter/nf_conntrack_synproxy.h |  77 +++++
 include/uapi/linux/netfilter/xt_SYNPROXY.h    |  16 +
 net/ipv4/netfilter/Kconfig                    |  13 +
 net/ipv4/netfilter/Makefile                   |   1 +
 net/ipv4/netfilter/ipt_SYNPROXY.c             | 472 ++++++++++++++++++++++++++
 net/netfilter/Kconfig                         |   3 +
 net/netfilter/Makefile                        |   3 +
 net/netfilter/nf_conntrack_core.c             |   6 +
 net/netfilter/nf_conntrack_proto_tcp.c        |  16 +
 net/netfilter/nf_conntrack_seqadj.c           |  20 ++
 net/netfilter/nf_synproxy_core.c              | 432 +++++++++++++++++++++++
 13 files changed, 1066 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netfilter/nf_conntrack_synproxy.h
 create mode 100644 include/uapi/linux/netfilter/xt_SYNPROXY.h
 create mode 100644 net/ipv4/netfilter/ipt_SYNPROXY.c
 create mode 100644 net/netfilter/nf_synproxy_core.c

(limited to 'include/uapi')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 2a22bcbfe6e..ff95434e50c 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -9,8 +9,8 @@ enum nf_ct_ext_id {
 	NF_CT_EXT_HELPER,
 #if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE)
 	NF_CT_EXT_NAT,
-	NF_CT_EXT_SEQADJ,
 #endif
+	NF_CT_EXT_SEQADJ,
 	NF_CT_EXT_ACCT,
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
 	NF_CT_EXT_ECACHE,
@@ -26,6 +26,9 @@ enum nf_ct_ext_id {
 #endif
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 	NF_CT_EXT_LABELS,
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	NF_CT_EXT_SYNPROXY,
 #endif
 	NF_CT_EXT_NUM,
 };
@@ -39,6 +42,7 @@ enum nf_ct_ext_id {
 #define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
 #define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
 #define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
+#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h
index 30bfbbed9f4..f6177a5fe0c 100644
--- a/include/net/netfilter/nf_conntrack_seqadj.h
+++ b/include/net/netfilter/nf_conntrack_seqadj.h
@@ -30,6 +30,8 @@ static inline struct nf_conn_seqadj *nfct_seqadj_ext_add(struct nf_conn *ct)
 	return nf_ct_ext_add(ct, NF_CT_EXT_SEQADJ, GFP_ATOMIC);
 }
 
+extern int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+			     s32 off);
 extern int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 			    __be32 seq, s32 off);
 extern void nf_ct_tcp_seqadj_set(struct sk_buff *skb,
diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
new file mode 100644
index 00000000000..806f54a290d
--- /dev/null
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -0,0 +1,77 @@
+#ifndef _NF_CONNTRACK_SYNPROXY_H
+#define _NF_CONNTRACK_SYNPROXY_H
+
+#include <net/netns/generic.h>
+
+struct nf_conn_synproxy {
+	u32	isn;
+	u32	its;
+	u32	tsoff;
+};
+
+static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY);
+#else
+	return NULL;
+#endif
+}
+
+static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct)
+{
+#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
+	return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC);
+#else
+	return NULL;
+#endif
+}
+
+struct synproxy_stats {
+	unsigned int			syn_received;
+	unsigned int			cookie_invalid;
+	unsigned int			cookie_valid;
+	unsigned int			cookie_retrans;
+	unsigned int			conn_reopened;
+};
+
+struct synproxy_net {
+	struct nf_conn			*tmpl;
+	struct synproxy_stats __percpu	*stats;
+};
+
+extern int synproxy_net_id;
+static inline struct synproxy_net *synproxy_pernet(struct net *net)
+{
+	return net_generic(net, synproxy_net_id);
+}
+
+struct synproxy_options {
+	u8				options;
+	u8				wscale;
+	u16				mss;
+	u32				tsval;
+	u32				tsecr;
+};
+
+struct tcphdr;
+struct xt_synproxy_info;
+extern void synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+				   const struct tcphdr *th,
+				   struct synproxy_options *opts);
+extern unsigned int synproxy_options_size(const struct synproxy_options *opts);
+extern void synproxy_build_options(struct tcphdr *th,
+				   const struct synproxy_options *opts);
+
+extern void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+					   struct synproxy_options *opts);
+extern void synproxy_check_timestamp_cookie(struct synproxy_options *opts);
+
+extern unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+					   unsigned int protoff,
+					   struct tcphdr *th,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo,
+					   const struct nf_conn_synproxy *synproxy);
+
+#endif /* _NF_CONNTRACK_SYNPROXY_H */
diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h
new file mode 100644
index 00000000000..2d59fbaa93c
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h
@@ -0,0 +1,16 @@
+#ifndef _XT_SYNPROXY_H
+#define _XT_SYNPROXY_H
+
+#define XT_SYNPROXY_OPT_MSS		0x01
+#define XT_SYNPROXY_OPT_WSCALE		0x02
+#define XT_SYNPROXY_OPT_SACK_PERM	0x04
+#define XT_SYNPROXY_OPT_TIMESTAMP	0x08
+#define XT_SYNPROXY_OPT_ECN		0x10
+
+struct xt_synproxy_info {
+	__u8	options;
+	__u8	wscale;
+	__u16	mss;
+};
+
+#endif /* _XT_SYNPROXY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4e902801742..1657e39b291 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -110,6 +110,19 @@ config IP_NF_TARGET_REJECT
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_SYNPROXY
+	tristate "SYNPROXY target support"
+	depends on NF_CONNTRACK && NETFILTER_ADVANCED
+	select NETFILTER_SYNPROXY
+	select SYN_COOKIES
+	help
+	  The SYNPROXY target allows you to intercept TCP connections and
+	  establish them using syncookies before they are passed on to the
+	  server. This allows to avoid conntrack and server resource usage
+	  during SYN-flood attacks.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
 config IP_NF_TARGET_ULOG
 	tristate "ULOG target support (obsolete)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 007b128eecc..3622b248b6d 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -46,6 +46,7 @@ obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
 obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 
 # generic ARP tables
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
new file mode 100644
index 00000000000..94371db6aec
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+static struct iphdr *
+synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+	struct iphdr *iph;
+
+	skb_reset_network_header(skb);
+	iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+	iph->version	= 4;
+	iph->ihl	= sizeof(*iph) / 4;
+	iph->tos	= 0;
+	iph->id		= 0;
+	iph->frag_off	= htons(IP_DF);
+	iph->ttl	= sysctl_ip_default_ttl;
+	iph->protocol	= IPPROTO_TCP;
+	iph->check	= 0;
+	iph->saddr	= saddr;
+	iph->daddr	= daddr;
+
+	return iph;
+}
+
+static void
+synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
+		  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
+		  struct iphdr *niph, struct tcphdr *nth,
+		  unsigned int tcp_hdr_size)
+{
+	nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
+	nskb->ip_summed   = CHECKSUM_PARTIAL;
+	nskb->csum_start  = (unsigned char *)nth - nskb->head;
+	nskb->csum_offset = offsetof(struct tcphdr, check);
+
+	skb_dst_set_noref(nskb, skb_dst(skb));
+	nskb->protocol = htons(ETH_P_IP);
+	if (ip_route_me_harder(nskb, RTN_UNSPEC))
+		goto free_nskb;
+
+	if (nfct) {
+		nskb->nfct = nfct;
+		nskb->nfctinfo = ctinfo;
+		nf_conntrack_get(nfct);
+	}
+
+	ip_local_out(nskb);
+	return;
+
+free_nskb:
+	kfree_skb(nskb);
+}
+
+static void
+synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
+			    const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+	u16 mss = opts->mss;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(__cookie_v4_init_sequence(iph, th, &mss));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= 0;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_syn(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts, u32 recv_seq)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(recv_seq - 1);
+	/* ack_seq is used to relay our ISN to the synproxy hook to initialize
+	 * sequence number translation once a connection tracking entry exists.
+	 */
+	nth->ack_seq	= htonl(ntohl(th->ack_seq) - 1);
+	tcp_flag_word(nth) = TCP_FLAG_SYN;
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= th->window;
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+			  niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_server_ack(const struct synproxy_net *snet,
+			 const struct ip_ct_tcp *state,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->dest;
+	nth->dest	= th->source;
+	nth->seq	= htonl(ntohl(th->ack_seq));
+	nth->ack_seq	= htonl(ntohl(th->seq) + 1);
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static void
+synproxy_send_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 const struct synproxy_options *opts)
+{
+	struct sk_buff *nskb;
+	struct iphdr *iph, *niph;
+	struct tcphdr *nth;
+	unsigned int tcp_hdr_size;
+
+	iph = ip_hdr(skb);
+
+	tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
+	nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
+			 GFP_ATOMIC);
+	if (nskb == NULL)
+		return;
+	skb_reserve(nskb, MAX_TCP_HEADER);
+
+	niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+
+	skb_reset_transport_header(nskb);
+	nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+	nth->source	= th->source;
+	nth->dest	= th->dest;
+	nth->seq	= htonl(ntohl(th->seq) + 1);
+	nth->ack_seq	= th->ack_seq;
+	tcp_flag_word(nth) = TCP_FLAG_ACK;
+	nth->doff	= tcp_hdr_size / 4;
+	nth->window	= ntohs(htons(th->window) >> opts->wscale);
+	nth->check	= 0;
+	nth->urg_ptr	= 0;
+
+	synproxy_build_options(nth, opts);
+
+	synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+}
+
+static bool
+synproxy_recv_client_ack(const struct synproxy_net *snet,
+			 const struct sk_buff *skb, const struct tcphdr *th,
+			 struct synproxy_options *opts, u32 recv_seq)
+{
+	int mss;
+
+	mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
+	if (mss == 0) {
+		this_cpu_inc(snet->stats->cookie_invalid);
+		return false;
+	}
+
+	this_cpu_inc(snet->stats->cookie_valid);
+	opts->mss = mss;
+
+	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+		synproxy_check_timestamp_cookie(opts);
+
+	synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+	return true;
+}
+
+static unsigned int
+synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_synproxy_info *info = par->targinfo;
+	struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
+	struct synproxy_options opts = {};
+	struct tcphdr *th, _th;
+
+	if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+		return NF_DROP;
+
+	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	synproxy_parse_options(skb, par->thoff, th, &opts);
+
+	if (th->syn && !th->ack) {
+		/* Initial SYN from client */
+		this_cpu_inc(snet->stats->syn_received);
+
+		if (th->ece && th->cwr)
+			opts.options |= XT_SYNPROXY_OPT_ECN;
+
+		opts.options &= info->options;
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy_init_timestamp_cookie(info, &opts);
+		else
+			opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
+					  XT_SYNPROXY_OPT_SACK_PERM |
+					  XT_SYNPROXY_OPT_ECN);
+
+		synproxy_send_client_synack(skb, th, &opts);
+	} else if (th->ack && !(th->fin || th->rst))
+		/* ACK from client */
+		synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+
+	return NF_DROP;
+}
+
+static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct nf_conn_synproxy *synproxy;
+	struct synproxy_options opts = {};
+	const struct ip_ct_tcp *state;
+	struct tcphdr *th, _th;
+	unsigned int thoff;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct == NULL)
+		return NF_ACCEPT;
+
+	synproxy = nfct_synproxy(ct);
+	if (synproxy == NULL)
+		return NF_ACCEPT;
+
+	if (nf_is_loopback_packet(skb))
+		return NF_ACCEPT;
+
+	thoff = ip_hdrlen(skb);
+	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
+	if (th == NULL)
+		return NF_DROP;
+
+	state = &ct->proto.tcp;
+	switch (state->state) {
+	case TCP_CONNTRACK_CLOSE:
+		if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
+						      ntohl(th->seq) + 1);
+			break;
+		}
+
+		if (!th->syn || th->ack ||
+		    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+			break;
+
+		/* Reopened connection - reset the sequence number and timestamp
+		 * adjustments, they will get initialized once the connection is
+		 * reestablished.
+		 */
+		nf_ct_seqadj_init(ct, ctinfo, 0);
+		synproxy->tsoff = 0;
+		this_cpu_inc(snet->stats->conn_reopened);
+
+		/* fall through */
+	case TCP_CONNTRACK_SYN_SENT:
+		synproxy_parse_options(skb, thoff, th, &opts);
+
+		if (!th->syn && th->ack &&
+		    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
+			/* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
+			 * therefore we need to add 1 to make the SYN sequence
+			 * number match the one of first SYN.
+			 */
+			if (synproxy_recv_client_ack(snet, skb, th, &opts,
+						     ntohl(th->seq) + 1))
+				this_cpu_inc(snet->stats->cookie_retrans);
+
+			return NF_DROP;
+		}
+
+		synproxy->isn = ntohl(th->ack_seq);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->its = opts.tsecr;
+		break;
+	case TCP_CONNTRACK_SYN_RECV:
+		if (!th->syn || !th->ack)
+			break;
+
+		synproxy_parse_options(skb, thoff, th, &opts);
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+			synproxy->tsoff = opts.tsval - synproxy->its;
+
+		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
+				  XT_SYNPROXY_OPT_WSCALE |
+				  XT_SYNPROXY_OPT_SACK_PERM);
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_server_ack(snet, state, skb, th, &opts);
+
+		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+
+		swap(opts.tsval, opts.tsecr);
+		synproxy_send_client_ack(snet, skb, th, &opts);
+
+		consume_skb(skb);
+		return NF_STOLEN;
+	default:
+		break;
+	}
+
+	synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
+	return NF_ACCEPT;
+}
+
+static int synproxy_tg4_check(const struct xt_tgchk_param *par)
+{
+	const struct ipt_entry *e = par->entryinfo;
+
+	if (e->ip.proto != IPPROTO_TCP ||
+	    e->ip.invflags & XT_INV_PROTO)
+		return -EINVAL;
+
+	return nf_ct_l3proto_try_module_get(par->family);
+}
+
+static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_ct_l3proto_module_put(par->family);
+}
+
+static struct xt_target synproxy_tg4_reg __read_mostly = {
+	.name		= "SYNPROXY",
+	.family		= NFPROTO_IPV4,
+	.target		= synproxy_tg4,
+	.targetsize	= sizeof(struct xt_synproxy_info),
+	.checkentry	= synproxy_tg4_check,
+	.destroy	= synproxy_tg4_destroy,
+	.me		= THIS_MODULE,
+};
+
+static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
+	{
+		.hook		= ipv4_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+	{
+		.hook		= ipv4_synproxy_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM - 1,
+	},
+};
+
+static int __init synproxy_tg4_init(void)
+{
+	int err;
+
+	err = nf_register_hooks(ipv4_synproxy_ops,
+				ARRAY_SIZE(ipv4_synproxy_ops));
+	if (err < 0)
+		goto err1;
+
+	err = xt_register_target(&synproxy_tg4_reg);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+err1:
+	return err;
+}
+
+static void __exit synproxy_tg4_exit(void)
+{
+	xt_unregister_target(&synproxy_tg4_reg);
+	nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
+}
+
+module_init(synproxy_tg4_init);
+module_exit(synproxy_tg4_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index c45fc1a60e0..62a171ab204 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -408,6 +408,9 @@ config NF_NAT_TFTP
 	depends on NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_TFTP
 
+config NETFILTER_SYNPROXY
+	tristate
+
 endif # NF_CONNTRACK
 
 config NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 89a9c1658f5..c3a0a12907f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -61,6 +61,9 @@ obj-$(CONFIG_NF_NAT_IRC) += nf_nat_irc.o
 obj-$(CONFIG_NF_NAT_SIP) += nf_nat_sip.o
 obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
 
+# SYNPROXY
+obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+
 # generic X tables 
 obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 00a7a94d413..5d892febd64 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -48,6 +48,7 @@
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
@@ -799,6 +800,11 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	if (IS_ERR(ct))
 		return (struct nf_conntrack_tuple_hash *)ct;
 
+	if (tmpl && nfct_synproxy(tmpl)) {
+		nfct_seqadj_ext_add(ct);
+		nfct_synproxy_ext_add(ct);
+	}
+
 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
 	if (timeout_ext)
 		timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 984a8d1a335..44d1ea32570 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -28,6 +28,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -946,6 +947,21 @@ static int tcp_packet(struct nf_conn *ct,
 				  "state %s ", tcp_conntrack_names[old_state]);
 		return NF_ACCEPT;
 	case TCP_CONNTRACK_MAX:
+		/* Special case for SYN proxy: when the SYN to the server or
+		 * the SYN/ACK from the server is lost, the client may transmit
+		 * a keep-alive packet while in SYN_SENT state. This needs to
+		 * be associated with the original conntrack entry in order to
+		 * generate a new SYN with the correct sequence number.
+		 */
+		if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
+		    index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
+		    ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
+		    ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
+			pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
+			spin_unlock_bh(&ct->lock);
+			return NF_ACCEPT;
+		}
+
 		/* Invalid packet */
 		pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 			 dir, get_conntrack_index(th), old_state);
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index 483eb9ce321..5f9bfd060de 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -6,6 +6,26 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 
+int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		      s32 off)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	struct nf_conn_seqadj *seqadj;
+	struct nf_ct_seqadj *this_way;
+
+	if (off == 0)
+		return 0;
+
+	set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+
+	seqadj = nfct_seqadj(ct);
+	this_way = &seqadj->seq[dir];
+	this_way->offset_before	 = off;
+	this_way->offset_after	 = off;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_seqadj_init);
+
 int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 		     __be32 seq, s32 off)
 {
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
new file mode 100644
index 00000000000..d23dc791aca
--- /dev/null
+++ b/net/netfilter/nf_synproxy_core.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_tcpudp.h>
+#include <linux/netfilter/xt_SYNPROXY.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
+
+int synproxy_net_id;
+EXPORT_SYMBOL_GPL(synproxy_net_id);
+
+void
+synproxy_parse_options(const struct sk_buff *skb, unsigned int doff,
+		       const struct tcphdr *th, struct synproxy_options *opts)
+{
+	int length = (th->doff * 4) - sizeof(*th);
+	u8 buf[40], *ptr;
+
+	ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf);
+	BUG_ON(ptr == NULL);
+
+	opts->options = 0;
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2)
+				return;
+			if (opsize > length)
+				return;
+
+			switch (opcode) {
+			case TCPOPT_MSS:
+				if (opsize == TCPOLEN_MSS) {
+					opts->mss = get_unaligned_be16(ptr);
+					opts->options |= XT_SYNPROXY_OPT_MSS;
+				}
+				break;
+			case TCPOPT_WINDOW:
+				if (opsize == TCPOLEN_WINDOW) {
+					opts->wscale = *ptr;
+					if (opts->wscale > 14)
+						opts->wscale = 14;
+					opts->options |= XT_SYNPROXY_OPT_WSCALE;
+				}
+				break;
+			case TCPOPT_TIMESTAMP:
+				if (opsize == TCPOLEN_TIMESTAMP) {
+					opts->tsval = get_unaligned_be32(ptr);
+					opts->tsecr = get_unaligned_be32(ptr + 4);
+					opts->options |= XT_SYNPROXY_OPT_TIMESTAMP;
+				}
+				break;
+			case TCPOPT_SACK_PERM:
+				if (opsize == TCPOLEN_SACK_PERM)
+					opts->options |= XT_SYNPROXY_OPT_SACK_PERM;
+				break;
+			}
+
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(synproxy_parse_options);
+
+unsigned int synproxy_options_size(const struct synproxy_options *opts)
+{
+	unsigned int size = 0;
+
+	if (opts->options & XT_SYNPROXY_OPT_MSS)
+		size += TCPOLEN_MSS_ALIGNED;
+	if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
+		size += TCPOLEN_TSTAMP_ALIGNED;
+	else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+		size += TCPOLEN_SACKPERM_ALIGNED;
+	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+		size += TCPOLEN_WSCALE_ALIGNED;
+
+	return size;
+}
+EXPORT_SYMBOL_GPL(synproxy_options_size);
+
+void
+synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts)
+{
+	__be32 *ptr = (__be32 *)(th + 1);
+	u8 options = opts->options;
+
+	if (options & XT_SYNPROXY_OPT_MSS)
+		*ptr++ = htonl((TCPOPT_MSS << 24) |
+			       (TCPOLEN_MSS << 16) |
+			       opts->mss);
+
+	if (options & XT_SYNPROXY_OPT_TIMESTAMP) {
+		if (options & XT_SYNPROXY_OPT_SACK_PERM)
+			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+				       (TCPOLEN_SACK_PERM << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+		else
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_TIMESTAMP << 8) |
+				       TCPOLEN_TIMESTAMP);
+
+		*ptr++ = htonl(opts->tsval);
+		*ptr++ = htonl(opts->tsecr);
+	} else if (options & XT_SYNPROXY_OPT_SACK_PERM)
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_NOP << 16) |
+			       (TCPOPT_SACK_PERM << 8) |
+			       TCPOLEN_SACK_PERM);
+
+	if (options & XT_SYNPROXY_OPT_WSCALE)
+		*ptr++ = htonl((TCPOPT_NOP << 24) |
+			       (TCPOPT_WINDOW << 16) |
+			       (TCPOLEN_WINDOW << 8) |
+			       opts->wscale);
+}
+EXPORT_SYMBOL_GPL(synproxy_build_options);
+
+void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info,
+				    struct synproxy_options *opts)
+{
+	opts->tsecr = opts->tsval;
+	opts->tsval = tcp_time_stamp & ~0x3f;
+
+	if (opts->options & XT_SYNPROXY_OPT_WSCALE)
+		opts->tsval |= info->wscale;
+	else
+		opts->tsval |= 0xf;
+
+	if (opts->options & XT_SYNPROXY_OPT_SACK_PERM)
+		opts->tsval |= 1 << 4;
+
+	if (opts->options & XT_SYNPROXY_OPT_ECN)
+		opts->tsval |= 1 << 5;
+}
+EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie);
+
+void synproxy_check_timestamp_cookie(struct synproxy_options *opts)
+{
+	opts->wscale = opts->tsecr & 0xf;
+	if (opts->wscale != 0xf)
+		opts->options |= XT_SYNPROXY_OPT_WSCALE;
+
+	opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0;
+
+	opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0;
+}
+EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie);
+
+unsigned int synproxy_tstamp_adjust(struct sk_buff *skb,
+				    unsigned int protoff,
+				    struct tcphdr *th,
+				    struct nf_conn *ct,
+				    enum ip_conntrack_info ctinfo,
+				    const struct nf_conn_synproxy *synproxy)
+{
+	unsigned int optoff, optend;
+	u32 *ptr, old;
+
+	if (synproxy->tsoff == 0)
+		return 1;
+
+	optoff = protoff + sizeof(struct tcphdr);
+	optend = protoff + th->doff * 4;
+
+	if (!skb_make_writable(skb, optend))
+		return 0;
+
+	while (optoff < optend) {
+		unsigned char *op = skb->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			if (optoff + 1 == optend ||
+			    optoff + op[1] > optend ||
+			    op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_TIMESTAMP &&
+			    op[1] == TCPOLEN_TIMESTAMP) {
+				if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+					ptr = (u32 *)&op[2];
+					old = *ptr;
+					*ptr = htonl(ntohl(*ptr) -
+						     synproxy->tsoff);
+				} else {
+					ptr = (u32 *)&op[6];
+					old = *ptr;
+					*ptr = htonl(ntohl(*ptr) +
+						     synproxy->tsoff);
+				}
+				inet_proto_csum_replace4(&th->check, skb,
+							 old, *ptr, 0);
+				return 1;
+			}
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust);
+
+static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = {
+	.len		= sizeof(struct nf_conn_synproxy),
+	.align		= __alignof__(struct nf_conn_synproxy),
+	.id		= NF_CT_EXT_SYNPROXY,
+};
+
+#ifdef CONFIG_PROC_FS
+static void *synproxy_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos - 1; cpu < nr_cpu_ids; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(snet->stats, cpu);
+	}
+
+	return NULL;
+}
+
+static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct synproxy_net *snet = synproxy_pernet(seq_file_net(seq));
+	int cpu;
+
+	for (cpu = *pos; cpu < nr_cpu_ids; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu + 1;
+		return per_cpu_ptr(snet->stats, cpu);
+	}
+
+	return NULL;
+}
+
+static void synproxy_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+	return;
+}
+
+static int synproxy_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct synproxy_stats *stats = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries\t\tsyn_received\t"
+				"cookie_invalid\tcookie_valid\t"
+				"cookie_retrans\tconn_reopened\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x\t%08x\t%08x\t%08x\t%08x\t%08x\n", 0,
+		   stats->syn_received,
+		   stats->cookie_invalid,
+		   stats->cookie_valid,
+		   stats->cookie_retrans,
+		   stats->conn_reopened);
+
+	return 0;
+}
+
+static const struct seq_operations synproxy_cpu_seq_ops = {
+	.start		= synproxy_cpu_seq_start,
+	.next		= synproxy_cpu_seq_next,
+	.stop		= synproxy_cpu_seq_stop,
+	.show		= synproxy_cpu_seq_show,
+};
+
+static int synproxy_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &synproxy_cpu_seq_ops,
+			    sizeof(struct seq_net_private));
+}
+
+static const struct file_operations synproxy_cpu_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= synproxy_cpu_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release_net,
+};
+
+static int __net_init synproxy_proc_init(struct net *net)
+{
+	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+			 &synproxy_cpu_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+	remove_proc_entry("synproxy", net->proc_net_stat);
+}
+#else
+static int __net_init synproxy_proc_init(struct net *net)
+{
+	return 0;
+}
+
+static void __net_exit synproxy_proc_exit(struct net *net)
+{
+	return;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int __net_init synproxy_net_init(struct net *net)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+	struct nf_conntrack_tuple t;
+	struct nf_conn *ct;
+	int err = -ENOMEM;
+
+	memset(&t, 0, sizeof(t));
+	ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+	if (IS_ERR(ct)) {
+		err = PTR_ERR(ct);
+		goto err1;
+	}
+
+	__set_bit(IPS_TEMPLATE_BIT, &ct->status);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	if (!nfct_seqadj_ext_add(ct))
+		goto err2;
+	if (!nfct_synproxy_ext_add(ct))
+		goto err2;
+
+	snet->tmpl = ct;
+
+	snet->stats = alloc_percpu(struct synproxy_stats);
+	if (snet->stats == NULL)
+		goto err2;
+
+	err = synproxy_proc_init(net);
+	if (err < 0)
+		goto err3;
+
+	return 0;
+
+err3:
+	free_percpu(snet->stats);
+err2:
+	nf_conntrack_free(ct);
+err1:
+	return err;
+}
+
+static void __net_exit synproxy_net_exit(struct net *net)
+{
+	struct synproxy_net *snet = synproxy_pernet(net);
+
+	nf_conntrack_free(snet->tmpl);
+	synproxy_proc_exit(net);
+	free_percpu(snet->stats);
+}
+
+static struct pernet_operations synproxy_net_ops = {
+	.init		= synproxy_net_init,
+	.exit		= synproxy_net_exit,
+	.id		= &synproxy_net_id,
+	.size		= sizeof(struct synproxy_net),
+};
+
+static int __init synproxy_core_init(void)
+{
+	int err;
+
+	err = nf_ct_extend_register(&nf_ct_synproxy_extend);
+	if (err < 0)
+		goto err1;
+
+	err = register_pernet_subsys(&synproxy_net_ops);
+	if (err < 0)
+		goto err2;
+
+	return 0;
+
+err2:
+	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+err1:
+	return err;
+}
+
+static void __exit synproxy_core_exit(void)
+{
+	unregister_pernet_subsys(&synproxy_net_ops);
+	nf_ct_extend_unregister(&nf_ct_synproxy_extend);
+}
+
+module_init(synproxy_core_init);
+module_exit(synproxy_core_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-- 
cgit v1.2.3-70-g09d2


From b800c3b966bcf004bd8592293a49ed5cb7ea67a9 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Tue, 27 Aug 2013 01:36:51 +0200
Subject: ipv6: drop fragmented ndisc packets by default (RFC 6980)

This patch implements RFC6980: Drop fragmented ndisc packets by
default. If a fragmented ndisc packet is received the user is informed
that it is possible to disable the check.

Cc: Fernando Gont <fernando@gont.com.ar>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 10 ++++++++++
 net/ipv6/ndisc.c                       | 17 +++++++++++++++++
 5 files changed, 35 insertions(+)

(limited to 'include/uapi')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index debfe857d8f..a2be556032c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1349,6 +1349,12 @@ mldv2_unsolicited_report_interval - INTEGER
 	MLDv2 report retransmit will take place.
 	Default: 1000 (1 second)
 
+suppress_frag_ndisc - INTEGER
+	Control RFC 6980 (Security Implications of IPv6 Fragmentation
+	with IPv6 Neighbor Discovery) behavior:
+	1 - (default) discard fragmented neighbor discovery packets
+	0 - allow fragmented neighbor discovery packets
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 9ac5047062c..28ea3843931 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
+	__s32		suppress_frag_ndisc;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index d07ac6903e5..593b0e32d95 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -162,6 +162,7 @@ enum {
 	DEVCONF_NDISC_NOTIFY,
 	DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
+	DEVCONF_SUPPRESS_FRAG_NDISC,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2d6d1793bbf..a7183fc9bbc 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -204,6 +204,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.suppress_frag_ndisc	= 1,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -241,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_source_route	= 0,	/* we do not accept RH0 by default. */
 	.disable_ipv6		= 0,
 	.accept_dad		= 1,
+	.suppress_frag_ndisc	= 1,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -4188,6 +4190,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
 	array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
 	array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
+	array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5001,6 +5004,13 @@ static struct addrconf_sysctl_table
 			.mode           = 0644,
 			.proc_handler   = proc_dointvec
 		},
+		{
+			.procname	= "suppress_frag_ndisc",
+			.data		= &ipv6_devconf.suppress_frag_ndisc,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef..41720feeaa6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1519,10 +1519,27 @@ static void pndisc_redo(struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
+{
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+	if (!idev)
+		return true;
+	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
+	    idev->cnf.suppress_frag_ndisc) {
+		net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
+		return true;
+	}
+	return false;
+}
+
 int ndisc_rcv(struct sk_buff *skb)
 {
 	struct nd_msg *msg;
 
+	if (ndisc_suppress_frag_ndisc(skb))
+		return 0;
+
 	if (skb_linearize(skb))
 		return 0;
 
-- 
cgit v1.2.3-70-g09d2


From 5df0ddfbc9209ffafc82236509ba0e975120e3c3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 28 Aug 2013 22:13:09 +0200
Subject: net: packet: add randomized fanout scheduler

We currently allow for different fanout scheduling policies in pf_packet
such as scheduling by skb's rxhash, round-robin, by cpu, and rollover.
Also allow for a random, equidistributed selection of the socket from the
fanout process group.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h |  1 +
 net/packet/af_packet.c         | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index b950c02030c..dbf06667394 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -56,6 +56,7 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_LB		1
 #define PACKET_FANOUT_CPU		2
 #define PACKET_FANOUT_ROLLOVER		3
+#define PACKET_FANOUT_RND		4
 #define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fdf9ab91c3..bee9bfdc8d0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -88,7 +88,7 @@
 #include <linux/virtio_net.h>
 #include <linux/errqueue.h>
 #include <linux/net_tstamp.h>
-
+#include <linux/reciprocal_div.h>
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
@@ -1158,6 +1158,13 @@ static unsigned int fanout_demux_cpu(struct packet_fanout *f,
 	return smp_processor_id() % num;
 }
 
+static unsigned int fanout_demux_rnd(struct packet_fanout *f,
+				     struct sk_buff *skb,
+				     unsigned int num)
+{
+	return reciprocal_divide(prandom_u32(), num);
+}
+
 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 					  struct sk_buff *skb,
 					  unsigned int idx, unsigned int skip,
@@ -1215,6 +1222,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	case PACKET_FANOUT_CPU:
 		idx = fanout_demux_cpu(f, skb, num);
 		break;
+	case PACKET_FANOUT_RND:
+		idx = fanout_demux_rnd(f, skb, num);
+		break;
 	case PACKET_FANOUT_ROLLOVER:
 		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
 		break;
@@ -1284,6 +1294,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	case PACKET_FANOUT_HASH:
 	case PACKET_FANOUT_LB:
 	case PACKET_FANOUT_CPU:
+	case PACKET_FANOUT_RND:
 		break;
 	default:
 		return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From 391ac1282dd7ff1cb8245cccc5262e8e4173edc4 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Mon, 26 Aug 2013 15:05:36 +0200
Subject: can: gw: add a per rule limitation of frame hops

Usually the received CAN frames can be processed/routed as much as 'max_hops'
times (which is given at module load time of the can-gw module).
Introduce a new configuration option to reduce the number of possible hops
for a specific gateway rule to a value smaller then max_hops.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/uapi/linux/can/gw.h |  9 ++++++++-
 net/can/gw.c                | 35 +++++++++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h
index ae07bec74f4..4e27c82b564 100644
--- a/include/uapi/linux/can/gw.h
+++ b/include/uapi/linux/can/gw.h
@@ -45,6 +45,7 @@ enum {
 	CGW_DST_IF,	/* ifindex of destination network interface */
 	CGW_FILTER,	/* specify struct can_filter on source CAN device */
 	CGW_DELETED,	/* number of deleted CAN frames (see max_hops param) */
+	CGW_LIM_HOPS,	/* limit the number of hops of this specific rule */
 	__CGW_MAX
 };
 
@@ -116,13 +117,19 @@ enum {
  * Sets a CAN receive filter for the gateway job specified by the
  * struct can_filter described in include/linux/can.h
  *
- * CGW_MOD_XXX (length 17 bytes):
+ * CGW_MOD_(AND|OR|XOR|SET) (length 17 bytes):
  * Specifies a modification that's done to a received CAN frame before it is
  * send out to the destination interface.
  *
  * <struct can_frame> data used as operator
  * <u8> affected CAN frame elements
  *
+ * CGW_LIM_HOPS (length 1 byte):
+ * Limit the number of hops of this specific rule. Usually the received CAN
+ * frame can be processed as much as 'max_hops' times (which is given at module
+ * load time of the can-gw module). This value is used to reduce the number of
+ * possible hops for this gateway rule to a value smaller then max_hops.
+ *
  * CGW_CS_XOR (length 4 bytes):
  * Set a simple XOR checksum starting with an initial value into
  * data[result-idx] using data[start-idx] .. data[end-idx]
diff --git a/net/can/gw.c b/net/can/gw.c
index 2f291f961a1..3f9b0f3a281 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -146,6 +146,7 @@ struct cgw_job {
 		/* tbc */
 	};
 	u8 gwtype;
+	u8 limit_hops;
 	u16 flags;
 };
 
@@ -402,6 +403,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
 
 	/* put the incremented hop counter in the cloned skb */
 	cgw_hops(nskb) = cgw_hops(skb) + 1;
+
+	/* first processing of this CAN frame -> adjust to private hop limit */
+	if (gwj->limit_hops && cgw_hops(nskb) == 1)
+		cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;
+
 	nskb->dev = gwj->dst.dev;
 
 	/* pointer to modifiable CAN frame */
@@ -509,6 +515,11 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
 
 	/* check non default settings of attributes */
 
+	if (gwj->limit_hops) {
+		if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0)
+			goto cancel;
+	}
+
 	if (gwj->mod.modtype.and) {
 		memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
 		mb.modtype = gwj->mod.modtype.and;
@@ -606,11 +617,12 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = {
 	[CGW_SRC_IF]	= { .type = NLA_U32 },
 	[CGW_DST_IF]	= { .type = NLA_U32 },
 	[CGW_FILTER]	= { .len = sizeof(struct can_filter) },
+	[CGW_LIM_HOPS]	= { .type = NLA_U8 },
 };
 
 /* check for common and gwtype specific attributes */
 static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
-			  u8 gwtype, void *gwtypeattr)
+			  u8 gwtype, void *gwtypeattr, u8 *limhops)
 {
 	struct nlattr *tb[CGW_MAX+1];
 	struct cgw_frame_mod mb;
@@ -625,6 +637,13 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
 	if (err < 0)
 		return err;
 
+	if (tb[CGW_LIM_HOPS]) {
+		*limhops = nla_get_u8(tb[CGW_LIM_HOPS]);
+
+		if (*limhops < 1 || *limhops > max_hops)
+			return -EINVAL;
+	}
+
 	/* check for AND/OR/XOR/SET modifications */
 
 	if (tb[CGW_MOD_AND]) {
@@ -782,6 +801,7 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 {
 	struct rtcanmsg *r;
 	struct cgw_job *gwj;
+	u8 limhops = 0;
 	int err = 0;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -808,7 +828,8 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	gwj->flags = r->flags;
 	gwj->gwtype = r->gwtype;
 
-	err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw);
+	err = cgw_parse_attr(nlh, &gwj->mod, CGW_TYPE_CAN_CAN, &gwj->ccgw,
+			     &limhops);
 	if (err < 0)
 		goto out;
 
@@ -836,6 +857,8 @@ static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	if (gwj->dst.dev->type != ARPHRD_CAN || gwj->dst.dev->header_ops)
 		goto put_src_dst_out;
 
+	gwj->limit_hops = limhops;
+
 	ASSERT_RTNL();
 
 	err = cgw_register_filter(gwj);
@@ -867,13 +890,14 @@ static void cgw_remove_all_jobs(void)
 	}
 }
 
-static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
+static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct cgw_job *gwj = NULL;
 	struct hlist_node *nx;
 	struct rtcanmsg *r;
 	struct cf_mod mod;
 	struct can_can_gw ccgw;
+	u8 limhops = 0;
 	int err = 0;
 
 	if (!capable(CAP_NET_ADMIN))
@@ -890,7 +914,7 @@ static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 	if (r->gwtype != CGW_TYPE_CAN_CAN)
 		return -EINVAL;
 
-	err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw);
+	err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
 	if (err < 0)
 		return err;
 
@@ -910,6 +934,9 @@ static int cgw_remove_job(struct sk_buff *skb,  struct nlmsghdr *nlh)
 		if (gwj->flags != r->flags)
 			continue;
 
+		if (gwj->limit_hops != limhops)
+			continue;
+
 		if (memcmp(&gwj->mod, &mod, sizeof(mod)))
 			continue;
 
-- 
cgit v1.2.3-70-g09d2


From afe4fd062416b158a8a8538b23adc1930a9b88dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 29 Aug 2013 15:49:55 -0700
Subject: pkt_sched: fq: Fair Queue packet scheduler

- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add per socket limitation.

Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.

TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.

This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.

Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit

cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)

15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>

TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.

In slow start or steady state, very few packets are throttled [1]

FQ gets a bunch of tunables as :

  limit : max number of packets on whole Qdisc (default 10000)

  flow_limit : max number of packets per flow (default 100)

  quantum : the credit per RR round (default is 2 MTU)

  initial_quantum : initial credit for new flows (default is 10 MTU)

  maxrate : max per flow rate (default : unlimited)

  buckets : number of RB trees (default : 1024) in hash table.
               (consumes 8 bytes per bucket)

  [no]pacing : disable/enable pacing (default is enable)

All of them can be changed on a live qdisc.

$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE  ] [ buckets NUMBER ]
              [ [no]pacing ]

$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit

[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  41 +++
 net/sched/Kconfig              |  14 +
 net/sched/Makefile             |   1 +
 net/sched/sch_fq.c             | 792 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 848 insertions(+)
 create mode 100644 net/sched/sch_fq.c

(limited to 'include/uapi')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 09d62b9228f..9b829134d42 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -744,4 +744,45 @@ struct tc_fq_codel_xstats {
 	};
 };
 
+/* FQ */
+
+enum {
+	TCA_FQ_UNSPEC,
+
+	TCA_FQ_PLIMIT,		/* limit of total number of packets in queue */
+
+	TCA_FQ_FLOW_PLIMIT,	/* limit of packets per flow */
+
+	TCA_FQ_QUANTUM,		/* RR quantum */
+
+	TCA_FQ_INITIAL_QUANTUM,		/* RR quantum for new flow */
+
+	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */
+
+	TCA_FQ_FLOW_DEFAULT_RATE,/* for sockets with unspecified sk_rate,
+				  * use the following rate
+				  */
+
+	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */
+
+	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */
+	__TCA_FQ_MAX
+};
+
+#define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)
+
+struct tc_fq_qd_stats {
+	__u64	gc_flows;
+	__u64	highprio_packets;
+	__u64	tcp_retrans;
+	__u64	throttled;
+	__u64	flows_plimit;
+	__u64	pkts_too_long;
+	__u64	allocation_errors;
+	__s64	time_next_delayed_flow;
+	__u32	flows;
+	__u32	inactive_flows;
+	__u32	throttled_flows;
+	__u32	pad;
+};
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 235e01acac5..c03a32a0418 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -272,6 +272,20 @@ config NET_SCH_FQ_CODEL
 
 	  If unsure, say N.
 
+config NET_SCH_FQ
+	tristate "Fair Queue"
+	help
+	  Say Y here if you want to use the FQ packet scheduling algorithm.
+
+	  FQ does flow separation, and is able to respect pacing requirements
+	  set by TCP stack into sk->sk_pacing_rate (for localy generated
+	  traffic)
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 978cbf004e8..e5f9abe9a5d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
 obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
+obj-$(CONFIG_NET_SCH_FQ)	+= sch_fq.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
new file mode 100644
index 00000000000..91ceca7bcb5
--- /dev/null
+++ b/net/sched/sch_fq.c
@@ -0,0 +1,792 @@
+/*
+ * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
+ *
+ *  Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Meant to be mostly used for localy generated traffic :
+ *  Fast classification depends on skb->sk being set before reaching us.
+ *  If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
+ *  All packets belonging to a socket are considered as a 'flow'.
+ *
+ *  Flows are dynamically allocated and stored in a hash table of RB trees
+ *  They are also part of one Round Robin 'queues' (new or old flows)
+ *
+ *  Burst avoidance (aka pacing) capability :
+ *
+ *  Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
+ *  bunch of packets, and this packet scheduler adds delay between
+ *  packets to respect rate limitation.
+ *
+ *  enqueue() :
+ *   - lookup one RB tree (out of 1024 or more) to find the flow.
+ *     If non existent flow, create it, add it to the tree.
+ *     Add skb to the per flow list of skb (fifo).
+ *   - Use a special fifo for high prio packets
+ *
+ *  dequeue() : serves flows in Round Robin
+ *  Note : When a flow becomes empty, we do not immediately remove it from
+ *  rb trees, for performance reasons (its expected to send additional packets,
+ *  or SLAB cache will reuse socket for another flow)
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/hash.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+
+/*
+ * Per flow structure, dynamically allocated
+ */
+struct fq_flow {
+	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
+	union {
+		struct sk_buff *tail;	/* last skb in the list */
+		unsigned long  age;	/* jiffies when flow was emptied, for gc */
+	};
+	struct rb_node	fq_node; 	/* anchor in fq_root[] trees */
+	struct sock	*sk;
+	int		qlen;		/* number of packets in flow queue */
+	int		credit;
+	u32		socket_hash;	/* sk_hash */
+	struct fq_flow *next;		/* next pointer in RR lists, or &detached */
+
+	struct rb_node  rate_node;	/* anchor in q->delayed tree */
+	u64		time_next_packet;
+};
+
+struct fq_flow_head {
+	struct fq_flow *first;
+	struct fq_flow *last;
+};
+
+struct fq_sched_data {
+	struct fq_flow_head new_flows;
+
+	struct fq_flow_head old_flows;
+
+	struct rb_root	delayed;	/* for rate limited flows */
+	u64		time_next_delayed_flow;
+
+	struct fq_flow	internal;	/* for non classified or high prio packets */
+	u32		quantum;
+	u32		initial_quantum;
+	u32		flow_default_rate;/* rate per flow : bytes per second */
+	u32		flow_max_rate;	/* optional max rate per flow */
+	u32		flow_plimit;	/* max packets per flow */
+	struct rb_root	*fq_root;
+	u8		rate_enable;
+	u8		fq_trees_log;
+
+	u32		flows;
+	u32		inactive_flows;
+	u32		throttled_flows;
+
+	u64		stat_gc_flows;
+	u64		stat_internal_packets;
+	u64		stat_tcp_retrans;
+	u64		stat_throttled;
+	u64		stat_flows_plimit;
+	u64		stat_pkts_too_long;
+	u64		stat_allocation_errors;
+	struct qdisc_watchdog watchdog;
+};
+
+/* special value to mark a detached flow (not on old/new list) */
+static struct fq_flow detached, throttled;
+
+static void fq_flow_set_detached(struct fq_flow *f)
+{
+	f->next = &detached;
+}
+
+static bool fq_flow_is_detached(const struct fq_flow *f)
+{
+	return f->next == &detached;
+}
+
+static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
+{
+	struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
+
+	while (*p) {
+		struct fq_flow *aux;
+
+		parent = *p;
+		aux = container_of(parent, struct fq_flow, rate_node);
+		if (f->time_next_packet >= aux->time_next_packet)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&f->rate_node, parent, p);
+	rb_insert_color(&f->rate_node, &q->delayed);
+	q->throttled_flows++;
+	q->stat_throttled++;
+
+	f->next = &throttled;
+	if (q->time_next_delayed_flow > f->time_next_packet)
+		q->time_next_delayed_flow = f->time_next_packet;
+}
+
+
+static struct kmem_cache *fq_flow_cachep __read_mostly;
+
+static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+{
+	if (head->first)
+		head->last->next = flow;
+	else
+		head->first = flow;
+	head->last = flow;
+	flow->next = NULL;
+}
+
+/* limit number of collected flows per round */
+#define FQ_GC_MAX 8
+#define FQ_GC_AGE (3*HZ)
+
+static bool fq_gc_candidate(const struct fq_flow *f)
+{
+	return fq_flow_is_detached(f) &&
+	       time_after(jiffies, f->age + FQ_GC_AGE);
+}
+
+static void fq_gc(struct fq_sched_data *q,
+		  struct rb_root *root,
+		  struct sock *sk)
+{
+	struct fq_flow *f, *tofree[FQ_GC_MAX];
+	struct rb_node **p, *parent;
+	int fcnt = 0;
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk)
+			break;
+
+		if (fq_gc_candidate(f)) {
+			tofree[fcnt++] = f;
+			if (fcnt == FQ_GC_MAX)
+				break;
+		}
+
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	q->flows -= fcnt;
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+	while (fcnt) {
+		struct fq_flow *f = tofree[--fcnt];
+
+		rb_erase(&f->fq_node, root);
+		kmem_cache_free(fq_flow_cachep, f);
+	}
+}
+
+static const u8 prio2band[TC_PRIO_MAX + 1] = {
+	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+};
+
+static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+{
+	struct rb_node **p, *parent;
+	struct sock *sk = skb->sk;
+	struct rb_root *root;
+	struct fq_flow *f;
+	int band;
+
+	/* warning: no starvation prevention... */
+	band = prio2band[skb->priority & TC_PRIO_MAX];
+	if (unlikely(band == 0))
+		return &q->internal;
+
+	if (unlikely(!sk)) {
+		/* By forcing low order bit to 1, we make sure to not
+		 * collide with a local flow (socket pointers are word aligned)
+		 */
+		sk = (struct sock *)(skb_get_rxhash(skb) | 1L);
+	}
+
+	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
+
+	if (q->flows >= (2U << q->fq_trees_log) &&
+	    q->inactive_flows > q->flows/2)
+		fq_gc(q, root, sk);
+
+	p = &root->rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		f = container_of(parent, struct fq_flow, fq_node);
+		if (f->sk == sk) {
+			/* socket might have been reallocated, so check
+			 * if its sk_hash is the same.
+			 * It not, we need to refill credit with
+			 * initial quantum
+			 */
+			if (unlikely(skb->sk &&
+				     f->socket_hash != sk->sk_hash)) {
+				f->credit = q->initial_quantum;
+				f->socket_hash = sk->sk_hash;
+			}
+			return f;
+		}
+		if (f->sk > sk)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!f)) {
+		q->stat_allocation_errors++;
+		return &q->internal;
+	}
+	fq_flow_set_detached(f);
+	f->sk = sk;
+	if (skb->sk)
+		f->socket_hash = sk->sk_hash;
+	f->credit = q->initial_quantum;
+
+	rb_link_node(&f->fq_node, parent, p);
+	rb_insert_color(&f->fq_node, root);
+
+	q->flows++;
+	q->inactive_flows++;
+	return f;
+}
+
+
+/* remove one skb from head of flow queue */
+static struct sk_buff *fq_dequeue_head(struct fq_flow *flow)
+{
+	struct sk_buff *skb = flow->head;
+
+	if (skb) {
+		flow->head = skb->next;
+		skb->next = NULL;
+		flow->qlen--;
+	}
+	return skb;
+}
+
+/* We might add in the future detection of retransmits
+ * For the time being, just return false
+ */
+static bool skb_is_retransmit(struct sk_buff *skb)
+{
+	return false;
+}
+
+/* add skb to flow queue
+ * flow queue is a linked list, kind of FIFO, except for TCP retransmits
+ * We special case tcp retransmits to be transmitted before other packets.
+ * We rely on fact that TCP retransmits are unlikely, so we do not waste
+ * a separate queue or a pointer.
+ * head->  [retrans pkt 1]
+ *         [retrans pkt 2]
+ *         [ normal pkt 1]
+ *         [ normal pkt 2]
+ *         [ normal pkt 3]
+ * tail->  [ normal pkt 4]
+ */
+static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
+{
+	struct sk_buff *prev, *head = flow->head;
+
+	skb->next = NULL;
+	if (!head) {
+		flow->head = skb;
+		flow->tail = skb;
+		return;
+	}
+	if (likely(!skb_is_retransmit(skb))) {
+		flow->tail->next = skb;
+		flow->tail = skb;
+		return;
+	}
+
+	/* This skb is a tcp retransmit,
+	 * find the last retrans packet in the queue
+	 */
+	prev = NULL;
+	while (skb_is_retransmit(head)) {
+		prev = head;
+		head = head->next;
+		if (!head)
+			break;
+	}
+	if (!prev) { /* no rtx packet in queue, become the new head */
+		skb->next = flow->head;
+		flow->head = skb;
+	} else {
+		if (prev == flow->tail)
+			flow->tail = skb;
+		else
+			skb->next = prev->next;
+		prev->next = skb;
+	}
+}
+
+static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct fq_flow *f;
+
+	if (unlikely(sch->q.qlen >= sch->limit))
+		return qdisc_drop(skb, sch);
+
+	f = fq_classify(skb, q);
+	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
+		q->stat_flows_plimit++;
+		return qdisc_drop(skb, sch);
+	}
+
+	f->qlen++;
+	flow_queue_add(f, skb);
+	if (skb_is_retransmit(skb))
+		q->stat_tcp_retrans++;
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+	if (fq_flow_is_detached(f)) {
+		fq_flow_add_tail(&q->new_flows, f);
+		if (q->quantum > f->credit)
+			f->credit = q->quantum;
+		q->inactive_flows--;
+		qdisc_unthrottled(sch);
+	}
+	if (unlikely(f == &q->internal)) {
+		q->stat_internal_packets++;
+		qdisc_unthrottled(sch);
+	}
+	sch->q.qlen++;
+
+	return NET_XMIT_SUCCESS;
+}
+
+static void fq_check_throttled(struct fq_sched_data *q, u64 now)
+{
+	struct rb_node *p;
+
+	if (q->time_next_delayed_flow > now)
+		return;
+
+	q->time_next_delayed_flow = ~0ULL;
+	while ((p = rb_first(&q->delayed)) != NULL) {
+		struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
+
+		if (f->time_next_packet > now) {
+			q->time_next_delayed_flow = f->time_next_packet;
+			break;
+		}
+		rb_erase(p, &q->delayed);
+		q->throttled_flows--;
+		fq_flow_add_tail(&q->old_flows, f);
+	}
+}
+
+static struct sk_buff *fq_dequeue(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct fq_flow_head *head;
+	struct sk_buff *skb;
+	struct fq_flow *f;
+
+	skb = fq_dequeue_head(&q->internal);
+	if (skb)
+		goto out;
+	fq_check_throttled(q, now);
+begin:
+	head = &q->new_flows;
+	if (!head->first) {
+		head = &q->old_flows;
+		if (!head->first) {
+			if (q->time_next_delayed_flow != ~0ULL)
+				qdisc_watchdog_schedule_ns(&q->watchdog,
+							   q->time_next_delayed_flow);
+			return NULL;
+		}
+	}
+	f = head->first;
+
+	if (f->credit <= 0) {
+		f->credit += q->quantum;
+		head->first = f->next;
+		fq_flow_add_tail(&q->old_flows, f);
+		goto begin;
+	}
+
+	if (unlikely(f->head && now < f->time_next_packet)) {
+		head->first = f->next;
+		fq_flow_set_throttled(q, f);
+		goto begin;
+	}
+
+	skb = fq_dequeue_head(f);
+	if (!skb) {
+		head->first = f->next;
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &q->new_flows) && q->old_flows.first) {
+			fq_flow_add_tail(&q->old_flows, f);
+		} else {
+			fq_flow_set_detached(f);
+			f->age = jiffies;
+			q->inactive_flows++;
+		}
+		goto begin;
+	}
+	f->time_next_packet = now;
+	f->credit -= qdisc_pkt_len(skb);
+
+	if (f->credit <= 0 &&
+	    q->rate_enable &&
+	    skb->sk && skb->sk->sk_state != TCP_TIME_WAIT) {
+		u32 rate = skb->sk->sk_pacing_rate ?: q->flow_default_rate;
+
+		rate = min(rate, q->flow_max_rate);
+		if (rate) {
+			u64 len = (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC;
+
+			do_div(len, rate);
+			/* Since socket rate can change later,
+			 * clamp the delay to 125 ms.
+			 * TODO: maybe segment the too big skb, as in commit
+			 * e43ac79a4bc ("sch_tbf: segment too big GSO packets")
+			 */
+			if (unlikely(len > 125 * NSEC_PER_MSEC)) {
+				len = 125 * NSEC_PER_MSEC;
+				q->stat_pkts_too_long++;
+			}
+
+			f->time_next_packet = now + len;
+		}
+	}
+out:
+	prefetch(&skb->end);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
+	sch->q.qlen--;
+	qdisc_unthrottled(sch);
+	return skb;
+}
+
+static void fq_reset(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+
+	while ((skb = fq_dequeue(sch)) != NULL)
+		kfree_skb(skb);
+}
+
+static void fq_rehash(struct fq_sched_data *q,
+		      struct rb_root *old_array, u32 old_log,
+		      struct rb_root *new_array, u32 new_log)
+{
+	struct rb_node *op, **np, *parent;
+	struct rb_root *oroot, *nroot;
+	struct fq_flow *of, *nf;
+	int fcnt = 0;
+	u32 idx;
+
+	for (idx = 0; idx < (1U << old_log); idx++) {
+		oroot = &old_array[idx];
+		while ((op = rb_first(oroot)) != NULL) {
+			rb_erase(op, oroot);
+			of = container_of(op, struct fq_flow, fq_node);
+			if (fq_gc_candidate(of)) {
+				fcnt++;
+				kmem_cache_free(fq_flow_cachep, of);
+				continue;
+			}
+			nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
+
+			np = &nroot->rb_node;
+			parent = NULL;
+			while (*np) {
+				parent = *np;
+
+				nf = container_of(parent, struct fq_flow, fq_node);
+				BUG_ON(nf->sk == of->sk);
+
+				if (nf->sk > of->sk)
+					np = &parent->rb_right;
+				else
+					np = &parent->rb_left;
+			}
+
+			rb_link_node(&of->fq_node, parent, np);
+			rb_insert_color(&of->fq_node, nroot);
+		}
+	}
+	q->flows -= fcnt;
+	q->inactive_flows -= fcnt;
+	q->stat_gc_flows += fcnt;
+}
+
+static int fq_resize(struct fq_sched_data *q, u32 log)
+{
+	struct rb_root *array;
+	u32 idx;
+
+	if (q->fq_root && log == q->fq_trees_log)
+		return 0;
+
+	array = kmalloc(sizeof(struct rb_root) << log, GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	for (idx = 0; idx < (1U << log); idx++)
+		array[idx] = RB_ROOT;
+
+	if (q->fq_root) {
+		fq_rehash(q, q->fq_root, q->fq_trees_log, array, log);
+		kfree(q->fq_root);
+	}
+	q->fq_root = array;
+	q->fq_trees_log = log;
+
+	return 0;
+}
+
+static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+	[TCA_FQ_PLIMIT]			= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_PLIMIT]		= { .type = NLA_U32 },
+	[TCA_FQ_QUANTUM]		= { .type = NLA_U32 },
+	[TCA_FQ_INITIAL_QUANTUM]	= { .type = NLA_U32 },
+	[TCA_FQ_RATE_ENABLE]		= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_DEFAULT_RATE]	= { .type = NLA_U32 },
+	[TCA_FQ_FLOW_MAX_RATE]		= { .type = NLA_U32 },
+	[TCA_FQ_BUCKETS_LOG]		= { .type = NLA_U32 },
+};
+
+static int fq_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_MAX + 1];
+	int err, drop_count = 0;
+	u32 fq_log;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
+	if (err < 0)
+		return err;
+
+	sch_tree_lock(sch);
+
+	fq_log = q->fq_trees_log;
+
+	if (tb[TCA_FQ_BUCKETS_LOG]) {
+		u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
+
+		if (nval >= 1 && nval <= ilog2(256*1024))
+			fq_log = nval;
+		else
+			err = -EINVAL;
+	}
+	if (tb[TCA_FQ_PLIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
+
+	if (tb[TCA_FQ_FLOW_PLIMIT])
+		q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
+
+	if (tb[TCA_FQ_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
+
+	if (tb[TCA_FQ_INITIAL_QUANTUM])
+		q->quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
+
+	if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
+		q->flow_default_rate = nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]);
+
+	if (tb[TCA_FQ_FLOW_MAX_RATE])
+		q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
+
+	if (tb[TCA_FQ_RATE_ENABLE]) {
+		u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
+
+		if (enable <= 1)
+			q->rate_enable = enable;
+		else
+			err = -EINVAL;
+	}
+
+	if (!err)
+		err = fq_resize(q, fq_log);
+
+	while (sch->q.qlen > sch->limit) {
+		struct sk_buff *skb = fq_dequeue(sch);
+
+		kfree_skb(skb);
+		drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, drop_count);
+
+	sch_tree_unlock(sch);
+	return err;
+}
+
+static void fq_destroy(struct Qdisc *sch)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct rb_root *root;
+	struct rb_node *p;
+	unsigned int idx;
+
+	if (q->fq_root) {
+		for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
+			root = &q->fq_root[idx];
+			while ((p = rb_first(root)) != NULL) {
+				rb_erase(p, root);
+				kmem_cache_free(fq_flow_cachep,
+						container_of(p, struct fq_flow, fq_node));
+			}
+		}
+		kfree(q->fq_root);
+	}
+	qdisc_watchdog_cancel(&q->watchdog);
+}
+
+static int fq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	sch->limit		= 10000;
+	q->flow_plimit		= 100;
+	q->quantum		= 2 * psched_mtu(qdisc_dev(sch));
+	q->initial_quantum	= 10 * psched_mtu(qdisc_dev(sch));
+	q->flow_default_rate	= 0;
+	q->flow_max_rate	= ~0U;
+	q->rate_enable		= 1;
+	q->new_flows.first	= NULL;
+	q->old_flows.first	= NULL;
+	q->delayed		= RB_ROOT;
+	q->fq_root		= NULL;
+	q->fq_trees_log		= ilog2(1024);
+	qdisc_watchdog_init(&q->watchdog, sch);
+
+	if (opt)
+		err = fq_change(sch, opt);
+	else
+		err = fq_resize(q, q->fq_trees_log);
+
+	return err;
+}
+
+static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
+	    nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
+	    nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
+	    nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_DEFAULT_RATE, q->flow_default_rate) ||
+	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
+	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, opts);
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_to_ns(ktime_get());
+	struct tc_fq_qd_stats st = {
+		.gc_flows		= q->stat_gc_flows,
+		.highprio_packets	= q->stat_internal_packets,
+		.tcp_retrans		= q->stat_tcp_retrans,
+		.throttled		= q->stat_throttled,
+		.flows_plimit		= q->stat_flows_plimit,
+		.pkts_too_long		= q->stat_pkts_too_long,
+		.allocation_errors	= q->stat_allocation_errors,
+		.flows			= q->flows,
+		.inactive_flows		= q->inactive_flows,
+		.throttled_flows	= q->throttled_flows,
+		.time_next_delayed_flow	= q->time_next_delayed_flow - now,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
+	.id		=	"fq",
+	.priv_size	=	sizeof(struct fq_sched_data),
+
+	.enqueue	=	fq_enqueue,
+	.dequeue	=	fq_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.init		=	fq_init,
+	.reset		=	fq_reset,
+	.destroy	=	fq_destroy,
+	.change		=	fq_change,
+	.dump		=	fq_dump,
+	.dump_stats	=	fq_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init fq_module_init(void)
+{
+	int ret;
+
+	fq_flow_cachep = kmem_cache_create("fq_flow_cache",
+					   sizeof(struct fq_flow),
+					   0, 0, NULL);
+	if (!fq_flow_cachep)
+		return -ENOMEM;
+
+	ret = register_qdisc(&fq_qdisc_ops);
+	if (ret)
+		kmem_cache_destroy(fq_flow_cachep);
+	return ret;
+}
+
+static void __exit fq_module_exit(void)
+{
+	unregister_qdisc(&fq_qdisc_ops);
+	kmem_cache_destroy(fq_flow_cachep);
+}
+
+module_init(fq_module_init)
+module_exit(fq_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From e4c7ed415387cf718ffbec305396c30cee092987 Mon Sep 17 00:00:00 2001
From: Cong Wang <amwang@redhat.com>
Date: Sat, 31 Aug 2013 13:44:33 +0800
Subject: vxlan: add ipv6 support

This patch adds IPv6 support to vxlan device, as the new version
RFC already mentions it:

   http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-03

Cc: David Stevens <dlstevens@us.ibm.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c           | 764 ++++++++++++++++++++++++++++++++++--------
 include/net/vxlan.h           |   2 +-
 include/uapi/linux/if_link.h  |   2 +
 net/openvswitch/vport-vxlan.c |   2 +-
 4 files changed, 622 insertions(+), 148 deletions(-)

(limited to 'include/uapi')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 3b21aca0c0c..faf131e02a7 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -6,9 +6,6 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
- *
- * TODO
- *  - IPv6 (not in RFC)
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -43,6 +40,11 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/vxlan.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#endif
 
 #define VXLAN_VERSION	"0.1"
 
@@ -59,6 +61,8 @@
 #define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
 /* IP header + UDP + VXLAN + Ethernet header */
 #define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+/* IPv6 header + UDP + VXLAN + Ethernet header */
+#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
 #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
 
 #define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
@@ -92,8 +96,14 @@ struct vxlan_net {
 	spinlock_t	  sock_lock;
 };
 
+union vxlan_addr {
+	struct sockaddr_in sin;
+	struct sockaddr_in6 sin6;
+	struct sockaddr sa;
+};
+
 struct vxlan_rdst {
-	__be32			 remote_ip;
+	union vxlan_addr	 remote_ip;
 	__be16			 remote_port;
 	u32			 remote_vni;
 	u32			 remote_ifindex;
@@ -120,7 +130,7 @@ struct vxlan_dev {
 	struct vxlan_sock *vn_sock;	/* listening socket */
 	struct net_device *dev;
 	struct vxlan_rdst default_dst;	/* default destination */
-	__be32		  saddr;	/* source address */
+	union vxlan_addr  saddr;	/* source address */
 	__be16		  dst_port;
 	__u16		  port_min;	/* source port range */
 	__u16		  port_max;
@@ -146,6 +156,7 @@ struct vxlan_dev {
 #define VXLAN_F_RSC	0x04
 #define VXLAN_F_L2MISS	0x08
 #define VXLAN_F_L3MISS	0x10
+#define VXLAN_F_IPV6	0x20 /* internal flag */
 
 /* salt for hash table */
 static u32 vxlan_salt __read_mostly;
@@ -153,6 +164,96 @@ static struct workqueue_struct *vxlan_wq;
 
 static void vxlan_sock_work(struct work_struct *work);
 
+#if IS_ENABLED(CONFIG_IPV6)
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+       if (a->sa.sa_family != b->sa.sa_family)
+               return false;
+       if (a->sa.sa_family == AF_INET6)
+               return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
+       else
+               return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+       if (ipa->sa.sa_family == AF_INET6)
+               return ipv6_addr_any(&ipa->sin6.sin6_addr);
+       else
+               return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+       if (ipa->sa.sa_family == AF_INET6)
+               return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
+       else
+               return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
+{
+       if (nla_len(nla) >= sizeof(struct in6_addr)) {
+               nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
+               ip->sa.sa_family = AF_INET6;
+               return 0;
+       } else if (nla_len(nla) >= sizeof(__be32)) {
+               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
+               ip->sa.sa_family = AF_INET;
+               return 0;
+       } else {
+               return -EAFNOSUPPORT;
+       }
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
+                             const union vxlan_addr *ip)
+{
+       if (ip->sa.sa_family == AF_INET6)
+               return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
+       else
+               return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+}
+
+#else /* !CONFIG_IPV6 */
+
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+       return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+       return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+       return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
+{
+       if (nla_len(nla) >= sizeof(struct in6_addr)) {
+               return -EAFNOSUPPORT;
+       } else if (nla_len(nla) >= sizeof(__be32)) {
+               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
+               ip->sa.sa_family = AF_INET;
+               return 0;
+       } else {
+               return -EAFNOSUPPORT;
+       }
+}
+
+static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
+                             const union vxlan_addr *ip)
+{
+       return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+}
+#endif
+
 /* Virtual Network hash table head */
 static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
 {
@@ -239,7 +340,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 
 	if (type == RTM_GETNEIGH) {
 		ndm->ndm_family	= AF_INET;
-		send_ip = rdst->remote_ip != htonl(INADDR_ANY);
+		send_ip = !vxlan_addr_any(&rdst->remote_ip);
 		send_eth = !is_zero_ether_addr(fdb->eth_addr);
 	} else
 		ndm->ndm_family	= AF_BRIDGE;
@@ -251,7 +352,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 		goto nla_put_failure;
 
-	if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
+	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
 		goto nla_put_failure;
 
 	if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
@@ -283,7 +384,7 @@ static inline size_t vxlan_nlmsg_size(void)
 {
 	return NLMSG_ALIGN(sizeof(struct ndmsg))
 		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
-		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
 		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
 		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
 		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
@@ -317,14 +418,14 @@ errout:
 		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
 }
 
-static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
+static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb f = {
 		.state = NUD_STALE,
 	};
 	struct vxlan_rdst remote = {
-		.remote_ip = ipa, /* goes to NDA_DST */
+		.remote_ip = *ipa, /* goes to NDA_DST */
 		.remote_vni = VXLAN_N_VID,
 	};
 
@@ -397,13 +498,13 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
 
 /* caller should hold vxlan->hash_lock */
 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
-					      __be32 ip, __be16 port,
+					      union vxlan_addr *ip, __be16 port,
 					      __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
 	list_for_each_entry(rd, &f->remotes, list) {
-		if (rd->remote_ip == ip &&
+		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
 		    rd->remote_port == port &&
 		    rd->remote_vni == vni &&
 		    rd->remote_ifindex == ifindex)
@@ -415,7 +516,7 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
 
 /* Replace destination of unicast mac */
 static int vxlan_fdb_replace(struct vxlan_fdb *f,
-			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
+			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -426,7 +527,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
 	if (!rd)
 		return 0;
-	rd->remote_ip = ip;
+	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
@@ -435,7 +536,7 @@ static int vxlan_fdb_replace(struct vxlan_fdb *f,
 
 /* Add/update destinations for multicast */
 static int vxlan_fdb_append(struct vxlan_fdb *f,
-			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
+			    union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_rdst *rd;
 
@@ -446,7 +547,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
 	if (rd == NULL)
 		return -ENOBUFS;
-	rd->remote_ip = ip;
+	rd->remote_ip = *ip;
 	rd->remote_port = port;
 	rd->remote_vni = vni;
 	rd->remote_ifindex = ifindex;
@@ -458,7 +559,7 @@ static int vxlan_fdb_append(struct vxlan_fdb *f,
 
 /* Add new entry to forwarding table -- assumes lock held */
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
-			    const u8 *mac, __be32 ip,
+			    const u8 *mac, union vxlan_addr *ip,
 			    __u16 state, __u16 flags,
 			    __be16 port, __u32 vni, __u32 ifindex,
 			    __u8 ndm_flags)
@@ -517,7 +618,7 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
 			return -EOPNOTSUPP;
 
-		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
+		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
 		f = kmalloc(sizeof(*f), GFP_ATOMIC);
 		if (!f)
 			return -ENOMEM;
@@ -565,17 +666,26 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 }
 
 static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
-			   __be32 *ip, __be16 *port, u32 *vni, u32 *ifindex)
+			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
 {
 	struct net *net = dev_net(vxlan->dev);
+	int err;
 
 	if (tb[NDA_DST]) {
-		if (nla_len(tb[NDA_DST]) != sizeof(__be32))
-			return -EAFNOSUPPORT;
-
-		*ip = nla_get_be32(tb[NDA_DST]);
+		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
+		if (err)
+			return err;
 	} else {
-		*ip = htonl(INADDR_ANY);
+		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
+		if (remote->sa.sa_family == AF_INET) {
+			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
+			ip->sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			ip->sin6.sin6_addr = in6addr_any;
+			ip->sa.sa_family = AF_INET6;
+#endif
+		}
 	}
 
 	if (tb[NDA_PORT]) {
@@ -618,7 +728,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	/* struct net *net = dev_net(vxlan->dev); */
-	__be32 ip;
+	union vxlan_addr ip;
 	__be16 port;
 	u32 vni, ifindex;
 	int err;
@@ -637,7 +747,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 		return err;
 
 	spin_lock_bh(&vxlan->hash_lock);
-	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags,
+	err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
 			       port, vni, ifindex, ndm->ndm_flags);
 	spin_unlock_bh(&vxlan->hash_lock);
 
@@ -652,7 +762,7 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
 	struct vxlan_rdst *rd = NULL;
-	__be32 ip;
+	union vxlan_addr ip;
 	__be16 port;
 	u32 vni, ifindex;
 	int err;
@@ -668,8 +778,8 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 	if (!f)
 		goto out;
 
-	if (ip != htonl(INADDR_ANY)) {
-		rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
+	if (!vxlan_addr_any(&ip)) {
+		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
 		if (!rd)
 			goto out;
 	}
@@ -732,7 +842,7 @@ out:
  * Return true if packet is bogus and should be droppped.
  */
 static bool vxlan_snoop(struct net_device *dev,
-			__be32 src_ip, const u8 *src_mac)
+			union vxlan_addr *src_ip, const u8 *src_mac)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_fdb *f;
@@ -741,7 +851,7 @@ static bool vxlan_snoop(struct net_device *dev,
 	if (likely(f)) {
 		struct vxlan_rdst *rdst = first_remote_rcu(f);
 
-		if (likely(rdst->remote_ip == src_ip))
+		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
 			return false;
 
 		/* Don't migrate static entries, drop packets */
@@ -750,10 +860,10 @@ static bool vxlan_snoop(struct net_device *dev,
 
 		if (net_ratelimit())
 			netdev_info(dev,
-				    "%pM migrated from %pI4 to %pI4\n",
+				    "%pM migrated from %pIS to %pIS\n",
 				    src_mac, &rdst->remote_ip, &src_ip);
 
-		rdst->remote_ip = src_ip;
+		rdst->remote_ip = *src_ip;
 		f->updated = jiffies;
 		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);
 	} else {
@@ -775,7 +885,7 @@ static bool vxlan_snoop(struct net_device *dev,
 }
 
 /* See if multicast group is already in use by other ID */
-static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
+static bool vxlan_group_used(struct vxlan_net *vn, union vxlan_addr *remote_ip)
 {
 	struct vxlan_dev *vxlan;
 
@@ -783,7 +893,8 @@ static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
 		if (!netif_running(vxlan->dev))
 			continue;
 
-		if (vxlan->default_dst.remote_ip == remote_ip)
+		if (vxlan_addr_equal(&vxlan->default_dst.remote_ip,
+				     remote_ip))
 			return true;
 	}
 
@@ -819,13 +930,23 @@ static void vxlan_igmp_join(struct work_struct *work)
 	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 	struct sock *sk = vs->sock->sk;
-	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
-		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
-	};
+	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
+	int ifindex = vxlan->default_dst.remote_ifindex;
 
 	lock_sock(sk);
-	ip_mc_join_group(sk, &mreq);
+	if (ip->sa.sa_family == AF_INET) {
+		struct ip_mreqn mreq = {
+			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
+			.imr_ifindex		= ifindex,
+		};
+
+		ip_mc_join_group(sk, &mreq);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
+					     &ip->sin6.sin6_addr);
+#endif
+	}
 	release_sock(sk);
 
 	vxlan_sock_release(vs);
@@ -838,13 +959,24 @@ static void vxlan_igmp_leave(struct work_struct *work)
 	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 	struct sock *sk = vs->sock->sk;
-	struct ip_mreqn mreq = {
-		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
-		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
-	};
+	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
+	int ifindex = vxlan->default_dst.remote_ifindex;
 
 	lock_sock(sk);
-	ip_mc_leave_group(sk, &mreq);
+	if (ip->sa.sa_family == AF_INET) {
+		struct ip_mreqn mreq = {
+			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
+			.imr_ifindex		= ifindex,
+		};
+
+		ip_mc_leave_group(sk, &mreq);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
+					     &ip->sin6.sin6_addr);
+#endif
+	}
+
 	release_sock(sk);
 
 	vxlan_sock_release(vs);
@@ -896,11 +1028,14 @@ error:
 static void vxlan_rcv(struct vxlan_sock *vs,
 		      struct sk_buff *skb, __be32 vx_vni)
 {
-	struct iphdr *oip;
+	struct iphdr *oip = NULL;
+	struct ipv6hdr *oip6 = NULL;
 	struct vxlan_dev *vxlan;
 	struct pcpu_tstats *stats;
+	union vxlan_addr saddr;
 	__u32 vni;
-	int err;
+	int err = 0;
+	union vxlan_addr *remote_ip;
 
 	vni = ntohl(vx_vni) >> 8;
 	/* Is this VNI defined? */
@@ -908,6 +1043,7 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 	if (!vxlan)
 		goto drop;
 
+	remote_ip = &vxlan->default_dst.remote_ip;
 	skb_reset_mac_header(skb);
 	skb->protocol = eth_type_trans(skb, vxlan->dev);
 
@@ -917,9 +1053,20 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 		goto drop;
 
 	/* Re-examine inner Ethernet packet */
-	oip = ip_hdr(skb);
+	if (remote_ip->sa.sa_family == AF_INET) {
+		oip = ip_hdr(skb);
+		saddr.sin.sin_addr.s_addr = oip->saddr;
+		saddr.sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		oip6 = ipv6_hdr(skb);
+		saddr.sin6.sin6_addr = oip6->saddr;
+		saddr.sa.sa_family = AF_INET6;
+#endif
+	}
+
 	if ((vxlan->flags & VXLAN_F_LEARN) &&
-	    vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source))
+	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
 		goto drop;
 
 	skb_reset_network_header(skb);
@@ -935,11 +1082,20 @@ static void vxlan_rcv(struct vxlan_sock *vs,
 
 	skb->encapsulation = 0;
 
-	err = IP_ECN_decapsulate(oip, skb);
+	if (oip6)
+		err = IP6_ECN_decapsulate(oip6, skb);
+	if (oip)
+		err = IP_ECN_decapsulate(oip, skb);
+
 	if (unlikely(err)) {
-		if (log_ecn_error)
-			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
-					     &oip->saddr, oip->tos);
+		if (log_ecn_error) {
+			if (oip6)
+				net_info_ratelimited("non-ECT from %pI6\n",
+						     &oip6->saddr);
+			if (oip)
+				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
+						     &oip->saddr, oip->tos);
+		}
 		if (err > 1) {
 			++vxlan->dev->stats.rx_frame_errors;
 			++vxlan->dev->stats.rx_errors;
@@ -1009,7 +1165,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 		}
 
 		f = vxlan_find_mac(vxlan, n->ha);
-		if (f && first_remote_rcu(f)->remote_ip == htonl(INADDR_ANY)) {
+		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
 			/* bridge-local neighbor */
 			neigh_release(n);
 			goto out;
@@ -1027,8 +1183,14 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
 
 		if (netif_rx_ni(reply) == NET_RX_DROP)
 			dev->stats.rx_dropped++;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, tip);
+	} else if (vxlan->flags & VXLAN_F_L3MISS) {
+		union vxlan_addr ipa = {
+			.sin.sin_addr.s_addr = tip,
+			.sa.sa_family = AF_INET,
+		};
+
+		vxlan_ip_miss(dev, &ipa);
+	}
 out:
 	consume_skb(skb);
 	return NETDEV_TX_OK;
@@ -1050,6 +1212,16 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 			return false;
 		pip = ip_hdr(skb);
 		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
+		if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
+			union vxlan_addr ipa = {
+				.sin.sin_addr.s_addr = pip->daddr,
+				.sa.sa_family = AF_INET,
+			};
+
+			vxlan_ip_miss(dev, &ipa);
+			return false;
+		}
+
 		break;
 	default:
 		return false;
@@ -1066,8 +1238,8 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 		}
 		neigh_release(n);
 		return diff;
-	} else if (vxlan->flags & VXLAN_F_L3MISS)
-		vxlan_ip_miss(dev, pip->daddr);
+	}
+
 	return false;
 }
 
@@ -1118,6 +1290,102 @@ static int handle_offloads(struct sk_buff *skb)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static int vxlan6_xmit_skb(struct net *net, struct vxlan_sock *vs,
+			   struct dst_entry *dst, struct sk_buff *skb,
+			   struct net_device *dev, struct in6_addr *saddr,
+			   struct in6_addr *daddr, __u8 prio, __u8 ttl,
+			   __be16 src_port, __be16 dst_port, __be32 vni)
+{
+	struct ipv6hdr *ip6h;
+	struct vxlanhdr *vxh;
+	struct udphdr *uh;
+	int min_headroom;
+	int err;
+
+	if (!skb->encapsulation) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+			+ VXLAN_HLEN + sizeof(struct ipv6hdr)
+			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
+
+	/* Need space for new headers (invalidates iph ptr) */
+	err = skb_cow_head(skb, min_headroom);
+	if (unlikely(err))
+		return err;
+
+	if (vlan_tx_tag_present(skb)) {
+		if (WARN_ON(!__vlan_put_tag(skb,
+					    skb->vlan_proto,
+					    vlan_tx_tag_get(skb))))
+			return -ENOMEM;
+
+		skb->vlan_tci = 0;
+	}
+
+	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = vni;
+
+	__skb_push(skb, sizeof(*uh));
+	skb_reset_transport_header(skb);
+	uh = udp_hdr(skb);
+
+	uh->dest = dst_port;
+	uh->source = src_port;
+
+	uh->len = htons(skb->len);
+	uh->check = 0;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
+			      IPSKB_REROUTED);
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	if (!skb_is_gso(skb) && !(dst->dev->features & NETIF_F_IPV6_CSUM)) {
+		__wsum csum = skb_checksum(skb, 0, skb->len, 0);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		uh->check = csum_ipv6_magic(saddr, daddr, skb->len,
+					    IPPROTO_UDP, csum);
+		if (uh->check == 0)
+			uh->check = CSUM_MANGLED_0;
+	} else {
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_transport_header(skb) - skb->head;
+		skb->csum_offset = offsetof(struct udphdr, check);
+		uh->check = ~csum_ipv6_magic(saddr, daddr,
+					     skb->len, IPPROTO_UDP, 0);
+	}
+
+	__skb_push(skb, sizeof(*ip6h));
+	skb_reset_network_header(skb);
+	ip6h		  = ipv6_hdr(skb);
+	ip6h->version	  = 6;
+	ip6h->priority	  = prio;
+	ip6h->flow_lbl[0] = 0;
+	ip6h->flow_lbl[1] = 0;
+	ip6h->flow_lbl[2] = 0;
+	ip6h->payload_len = htons(skb->len);
+	ip6h->nexthdr     = IPPROTO_UDP;
+	ip6h->hop_limit   = ttl;
+	ip6h->daddr	  = *daddr;
+	ip6h->saddr	  = *saddr;
+
+	vxlan_set_owner(vs->sock->sk, skb);
+
+	err = handle_offloads(skb);
+	if (err)
+		return err;
+
+	ip6tunnel_xmit(skb, dev);
+	return 0;
+}
+#endif
+
 int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
@@ -1182,15 +1450,26 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 {
 	struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
 	struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
+	union vxlan_addr loopback;
+	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
 
 	skb->pkt_type = PACKET_HOST;
 	skb->encapsulation = 0;
 	skb->dev = dst_vxlan->dev;
 	__skb_pull(skb, skb_network_offset(skb));
 
+	if (remote_ip->sa.sa_family == AF_INET) {
+		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		loopback.sa.sa_family =  AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		loopback.sin6.sin6_addr = in6addr_loopback;
+		loopback.sa.sa_family =  AF_INET6;
+#endif
+	}
+
 	if (dst_vxlan->flags & VXLAN_F_LEARN)
-		vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK),
-			    eth_hdr(skb)->h_source);
+		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source);
 
 	u64_stats_update_begin(&tx_stats->syncp);
 	tx_stats->tx_packets++;
@@ -1211,11 +1490,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct rtable *rt;
+	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
-	__be32 dst;
-	__be16 src_port, dst_port;
+	union vxlan_addr *dst;
+	__be16 src_port = 0, dst_port;
 	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
@@ -1223,9 +1502,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
 	vni = rdst->remote_vni;
-	dst = rdst->remote_ip;
+	dst = &rdst->remote_ip;
 
-	if (!dst) {
+	if (vxlan_addr_any(dst)) {
 		if (did_rsc) {
 			/* short-circuited back to local bridge */
 			vxlan_encap_bypass(skb, vxlan, vxlan);
@@ -1237,7 +1516,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	old_iph = ip_hdr(skb);
 
 	ttl = vxlan->ttl;
-	if (!ttl && IN_MULTICAST(ntohl(dst)))
+	if (!ttl && vxlan_addr_multicast(dst))
 		ttl = 1;
 
 	tos = vxlan->tos;
@@ -1246,48 +1525,101 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 	src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb);
 
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_oif = rdst->remote_ifindex;
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.daddr = dst;
-	fl4.saddr = vxlan->saddr;
-
-	rt = ip_route_output_key(dev_net(dev), &fl4);
-	if (IS_ERR(rt)) {
-		netdev_dbg(dev, "no route to %pI4\n", &dst);
-		dev->stats.tx_carrier_errors++;
-		goto tx_error;
-	}
+	if (dst->sa.sa_family == AF_INET) {
+		memset(&fl4, 0, sizeof(fl4));
+		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_tos = RT_TOS(tos);
+		fl4.daddr = dst->sin.sin_addr.s_addr;
+		fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr;
+
+		rt = ip_route_output_key(dev_net(dev), &fl4);
+		if (IS_ERR(rt)) {
+			netdev_dbg(dev, "no route to %pI4\n",
+				   &dst->sin.sin_addr.s_addr);
+			dev->stats.tx_carrier_errors++;
+			goto tx_error;
+		}
 
-	if (rt->dst.dev == dev) {
-		netdev_dbg(dev, "circular route to %pI4\n", &dst);
-		dev->stats.collisions++;
-		goto rt_tx_error;
-	}
+		if (rt->dst.dev == dev) {
+			netdev_dbg(dev, "circular route to %pI4\n",
+				   &dst->sin.sin_addr.s_addr);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
+
+		/* Bypass encapsulation if the destination is local */
+		if (rt->rt_flags & RTCF_LOCAL &&
+		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+			struct vxlan_dev *dst_vxlan;
+
+			ip_rt_put(rt);
+			dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
+			if (!dst_vxlan)
+				goto tx_error;
+			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+			return;
+		}
 
-	/* Bypass encapsulation if the destination is local */
-	if (rt->rt_flags & RTCF_LOCAL &&
-	    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
-		struct vxlan_dev *dst_vxlan;
+		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 
-		ip_rt_put(rt);
-		dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
-		if (!dst_vxlan)
+		err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
+				     fl4.saddr, dst->sin.sin_addr.s_addr,
+				     tos, ttl, df, src_port, dst_port,
+				     htonl(vni << 8));
+
+		if (err < 0)
+			goto rt_tx_error;
+		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct sock *sk = vxlan->vn_sock->sock->sk;
+		struct dst_entry *ndst;
+		struct flowi6 fl6;
+		u32 flags;
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.flowi6_oif = rdst->remote_ifindex;
+		fl6.daddr = dst->sin6.sin6_addr;
+		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
+		fl6.flowi6_proto = skb->protocol;
+
+		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
+			netdev_dbg(dev, "no route to %pI6\n",
+				   &dst->sin6.sin6_addr);
+			dev->stats.tx_carrier_errors++;
 			goto tx_error;
-		vxlan_encap_bypass(skb, vxlan, dst_vxlan);
-		return;
-	}
+		}
 
-	tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
-	ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+		if (ndst->dev == dev) {
+			netdev_dbg(dev, "circular route to %pI6\n",
+				   &dst->sin6.sin6_addr);
+			dst_release(ndst);
+			dev->stats.collisions++;
+			goto tx_error;
+		}
 
-	err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
-			     fl4.saddr, dst, tos, ttl, df,
-			     src_port, dst_port, htonl(vni << 8));
+		/* Bypass encapsulation if the destination is local */
+		flags = ((struct rt6_info *)ndst)->rt6i_flags;
+		if (flags & RTF_LOCAL &&
+		    !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+			struct vxlan_dev *dst_vxlan;
+
+			dst_release(ndst);
+			dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
+			if (!dst_vxlan)
+				goto tx_error;
+			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+			return;
+		}
 
-	if (err < 0)
-		goto rt_tx_error;
-	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+		ttl = ttl ? : ip6_dst_hoplimit(ndst);
+
+		err = vxlan6_xmit_skb(dev_net(dev), vxlan->vn_sock, ndst, skb,
+				      dev, &fl6.saddr, &fl6.daddr, 0, ttl,
+				      src_port, dst_port, htonl(vni << 8));
+#endif
+	}
 
 	return;
 
@@ -1464,8 +1796,8 @@ static int vxlan_open(struct net_device *dev)
 	if (!vs)
 		return -ENOTCONN;
 
-	if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
-	    vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
+	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+	    vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) {
 		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_join);
@@ -1503,8 +1835,8 @@ static int vxlan_stop(struct net_device *dev)
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_sock *vs = vxlan->vn_sock;
 
-	if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
-	    ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
+	if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+	    ! vxlan_group_used(vn, &vxlan->default_dst.remote_ip)) {
 		vxlan_sock_hold(vs);
 		dev_hold(dev);
 		queue_work(vxlan_wq, &vxlan->igmp_leave);
@@ -1552,7 +1884,10 @@ static void vxlan_setup(struct net_device *dev)
 
 	eth_hw_addr_random(dev);
 	ether_setup(dev);
-	dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
+	if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6)
+		dev->hard_header_len = ETH_HLEN + VXLAN6_HEADROOM;
+	else
+		dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;
 
 	dev->netdev_ops = &vxlan_netdev_ops;
 	dev->destructor = free_netdev;
@@ -1597,8 +1932,10 @@ static void vxlan_setup(struct net_device *dev)
 static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
 	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
@@ -1669,58 +2006,132 @@ static void vxlan_del_work(struct work_struct *work)
 	kfree_rcu(vs, rcu);
 }
 
-static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
-					      vxlan_rcv_t *rcv, void *data)
+#if IS_ENABLED(CONFIG_IPV6)
+/* Create UDP socket for encapsulation receive. AF_INET6 socket
+ * could be used for both IPv4 and IPv6 communications, but
+ * users may set bindv6only=1.
+ */
+static int create_v6_sock(struct net *net, __be16 port, struct socket **psock)
+{
+	struct sock *sk;
+	struct socket *sock;
+	struct sockaddr_in6 vxlan_addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = port,
+	};
+	int rc, val = 1;
+
+	rc = sock_create_kern(AF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (rc < 0) {
+		pr_debug("UDPv6 socket create failed\n");
+		return rc;
+	}
+
+	/* Put in proper namespace */
+	sk = sock->sk;
+	sk_change_net(sk, net);
+
+	kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+			  (char *)&val, sizeof(val));
+	rc = kernel_bind(sock, (struct sockaddr *)&vxlan_addr,
+			 sizeof(struct sockaddr_in6));
+	if (rc < 0) {
+		pr_debug("bind for UDPv6 socket %pI6:%u (%d)\n",
+			 &vxlan_addr.sin6_addr, ntohs(vxlan_addr.sin6_port), rc);
+		sk_release_kernel(sk);
+		return rc;
+	}
+	/* At this point, IPv6 module should have been loaded in
+	 * sock_create_kern().
+	 */
+	BUG_ON(!ipv6_stub);
+
+	*psock = sock;
+	/* Disable multicast loopback */
+	inet_sk(sk)->mc_loop = 0;
+	return 0;
+}
+
+#else
+
+static int create_v6_sock(struct net *net, __be16 port, struct socket **psock)
+{
+		return -EPFNOSUPPORT;
+}
+#endif
+
+static int create_v4_sock(struct net *net, __be16 port, struct socket **psock)
 {
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
-	struct vxlan_sock *vs;
 	struct sock *sk;
+	struct socket *sock;
 	struct sockaddr_in vxlan_addr = {
 		.sin_family = AF_INET,
 		.sin_addr.s_addr = htonl(INADDR_ANY),
 		.sin_port = port,
 	};
 	int rc;
-	unsigned int h;
-
-	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
-	if (!vs) {
-		pr_debug("memory alocation failure\n");
-		return ERR_PTR(-ENOMEM);
-	}
-
-	for (h = 0; h < VNI_HASH_SIZE; ++h)
-		INIT_HLIST_HEAD(&vs->vni_list[h]);
-
-	INIT_WORK(&vs->del_work, vxlan_del_work);
 
 	/* Create UDP socket for encapsulation receive. */
-	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock);
+	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (rc < 0) {
 		pr_debug("UDP socket create failed\n");
-		kfree(vs);
-		return ERR_PTR(rc);
+		return rc;
 	}
 
 	/* Put in proper namespace */
-	sk = vs->sock->sk;
+	sk = sock->sk;
 	sk_change_net(sk, net);
 
-	rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr,
+	rc = kernel_bind(sock, (struct sockaddr *) &vxlan_addr,
 			 sizeof(vxlan_addr));
 	if (rc < 0) {
 		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
 			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
 		sk_release_kernel(sk);
+		return rc;
+	}
+
+	*psock = sock;
+	/* Disable multicast loopback */
+	inet_sk(sk)->mc_loop = 0;
+	return 0;
+}
+
+/* Create new listen socket if needed */
+static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
+					      vxlan_rcv_t *rcv, void *data, bool ipv6)
+{
+	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct vxlan_sock *vs;
+	struct socket *sock;
+	struct sock *sk;
+	int rc = 0;
+	unsigned int h;
+
+	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
+	if (!vs)
+		return ERR_PTR(-ENOMEM);
+
+	for (h = 0; h < VNI_HASH_SIZE; ++h)
+		INIT_HLIST_HEAD(&vs->vni_list[h]);
+
+	INIT_WORK(&vs->del_work, vxlan_del_work);
+
+	if (ipv6)
+		rc = create_v6_sock(net, port, &sock);
+	else
+		rc = create_v4_sock(net, port, &sock);
+	if (rc < 0) {
 		kfree(vs);
 		return ERR_PTR(rc);
 	}
+
+	vs->sock = sock;
+	sk = sock->sk;
 	atomic_set(&vs->refcnt, 1);
 	vs->rcv = rcv;
 	vs->data = data;
 
-	/* Disable multicast loopback */
-	inet_sk(sk)->mc_loop = 0;
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
 	spin_unlock(&vn->sock_lock);
@@ -1728,18 +2139,24 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 	/* Mark socket as an encapsulation socket. */
 	udp_sk(sk)->encap_type = 1;
 	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
-	udp_encap_enable();
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ipv6)
+		ipv6_stub->udpv6_encap_enable();
+	else
+#endif
+		udp_encap_enable();
+
 	return vs;
 }
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share)
+				  bool no_share, bool ipv6)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_sock *vs;
 
-	vs = vxlan_socket_create(net, port, rcv, data);
+	vs = vxlan_socket_create(net, port, rcv, data, ipv6);
 	if (!IS_ERR(vs))
 		return vs;
 
@@ -1772,7 +2189,7 @@ static void vxlan_sock_work(struct work_struct *work)
 	__be16 port = vxlan->dst_port;
 	struct vxlan_sock *nvs;
 
-	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false);
+	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags & VXLAN_F_IPV6);
 	spin_lock(&vn->sock_lock);
 	if (!IS_ERR(nvs))
 		vxlan_vs_add_dev(nvs, vxlan);
@@ -1789,6 +2206,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	struct vxlan_rdst *dst = &vxlan->default_dst;
 	__u32 vni;
 	int err;
+	bool use_ipv6 = false;
 
 	if (!data[IFLA_VXLAN_ID])
 		return -EINVAL;
@@ -1796,11 +2214,32 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
 	dst->remote_vni = vni;
 
-	if (data[IFLA_VXLAN_GROUP])
-		dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+	if (data[IFLA_VXLAN_GROUP]) {
+		dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+		dst->remote_ip.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_GROUP6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
+			   sizeof(struct in6_addr));
+		dst->remote_ip.sa.sa_family = AF_INET6;
+		use_ipv6 = true;
+	}
 
-	if (data[IFLA_VXLAN_LOCAL])
-		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+	if (data[IFLA_VXLAN_LOCAL]) {
+		vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+		vxlan->saddr.sa.sa_family = AF_INET;
+	} else if (data[IFLA_VXLAN_LOCAL6]) {
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EPFNOSUPPORT;
+
+		/* TODO: respect scope id */
+		nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
+			   sizeof(struct in6_addr));
+		vxlan->saddr.sa.sa_family = AF_INET6;
+		use_ipv6 = true;
+	}
 
 	if (data[IFLA_VXLAN_LINK] &&
 	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
@@ -1812,12 +2251,23 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 			return -ENODEV;
 		}
 
+#if IS_ENABLED(CONFIG_IPV6)
+		if (use_ipv6) {
+			struct inet6_dev *idev = __in6_dev_get(lowerdev);
+			if (idev && idev->cnf.disable_ipv6) {
+				pr_info("IPv6 is disabled via sysctl\n");
+				return -EPERM;
+			}
+			vxlan->flags |= VXLAN_F_IPV6;
+		}
+#endif
+
 		if (!tb[IFLA_MTU])
-			dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;
+			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 
 		/* update header length based on lower device */
 		dev->hard_header_len = lowerdev->hard_header_len +
-				       VXLAN_HEADROOM;
+				       (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 	}
 
 	if (data[IFLA_VXLAN_TOS])
@@ -1868,7 +2318,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev,
 
 	/* create an fdb entry for default destination */
 	err = vxlan_fdb_create(vxlan, all_zeros_mac,
-			       vxlan->default_dst.remote_ip,
+			       &vxlan->default_dst.remote_ip,
 			       NUD_REACHABLE|NUD_PERMANENT,
 			       NLM_F_EXCL|NLM_F_CREATE,
 			       vxlan->dst_port, vxlan->default_dst.remote_vni,
@@ -1905,9 +2355,9 @@ static size_t vxlan_get_size(const struct net_device *dev)
 {
 
 	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
-		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
-		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
+		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
@@ -1934,14 +2384,36 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
 		goto nla_put_failure;
 
-	if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip))
-		goto nla_put_failure;
+	if (!vxlan_addr_any(&dst->remote_ip)) {
+		if (dst->remote_ip.sa.sa_family == AF_INET) {
+			if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
+					 dst->remote_ip.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
+				    &dst->remote_ip.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
 
 	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
 		goto nla_put_failure;
 
-	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
-		goto nla_put_failure;
+	if (!vxlan_addr_any(&vxlan->saddr)) {
+		if (vxlan->saddr.sa.sa_family == AF_INET) {
+			if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
+					 vxlan->saddr.sin.sin_addr.s_addr))
+				goto nla_put_failure;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else {
+			if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
+				    &vxlan->saddr.sin6.sin6_addr))
+				goto nla_put_failure;
+#endif
+		}
+	}
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index ad342e3688a..d2b88cafa7a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -25,7 +25,7 @@ struct vxlan_sock {
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
-				  bool no_share);
+				  bool no_share, bool ipv6);
 
 void vxlan_sock_release(struct vxlan_sock *vs);
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 04c0e7a5d48..80394e8dc3a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -314,6 +314,8 @@ enum {
 	IFLA_VXLAN_L2MISS,
 	IFLA_VXLAN_L3MISS,
 	IFLA_VXLAN_PORT,	/* destination port */
+	IFLA_VXLAN_GROUP6,
+	IFLA_VXLAN_LOCAL6,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 36848bd54a7..a0060245b4e 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -123,7 +123,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
 	vxlan_port = vxlan_vport(vport);
 	strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
 
-	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true);
+	vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, false);
 	if (IS_ERR(vs)) {
 		ovs_vport_free(vport);
 		return (void *)vs;
-- 
cgit v1.2.3-70-g09d2


From 61e76b178dbe7145e8d6afa84bb4ccea71918994 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Fri, 30 Aug 2013 11:18:45 +0200
Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO

RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination
unreachable) messages:
        5 - Source address failed ingress/egress policy
	6 - Reject route to destination

Now they are treated as protocol error and icmpv6_err_convert() converts them
to EPROTO.

RFC 4443 says:
	"Codes 5 and 6 are more informative subsets of code 1."

Treat codes 5 and 6 as code 1 (EACCES)

Btw, connect() returning -EPROTO confuses firefox, so that fallback to
other/IPv4 addresses does not work:
https://bugzilla.mozilla.org/show_bug.cgi?id=910773

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/icmpv6.h |  2 ++
 net/ipv6/icmp.c             | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/uapi')

diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index e0133c73c30..590beda78ea 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -115,6 +115,8 @@ struct icmp6hdr {
 #define ICMPV6_NOT_NEIGHBOUR		2
 #define ICMPV6_ADDR_UNREACH		3
 #define ICMPV6_PORT_UNREACH		4
+#define ICMPV6_POLICY_FAIL		5
+#define ICMPV6_REJECT_ROUTE		6
 
 /*
  *	Codes for Time Exceeded
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7cfc8d28487..67ae4e0d40b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -940,6 +940,14 @@ static const struct icmp6_err {
 		.err	= ECONNREFUSED,
 		.fatal	= 1,
 	},
+	{	/* POLICY_FAIL */
+		.err	= EACCES,
+		.fatal	= 1,
+	},
+	{	/* REJECT_ROUTE	*/
+		.err	= EACCES,
+		.fatal	= 1,
+	},
 };
 
 int icmpv6_err_convert(u8 type, u8 code, int *err)
@@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
 	switch (type) {
 	case ICMPV6_DEST_UNREACH:
 		fatal = 1;
-		if (code <= ICMPV6_PORT_UNREACH) {
+		if (code < ARRAY_SIZE(tab_unreach)) {
 			*err  = tab_unreach[code].err;
 			fatal = tab_unreach[code].fatal;
 		}
-- 
cgit v1.2.3-70-g09d2


From cfd280c91253cc28e4919e349fa7a813b63e71e8 Mon Sep 17 00:00:00 2001
From: Carlos O'Donell <carlos@redhat.com>
Date: Thu, 15 Aug 2013 17:28:10 +0800
Subject: net: sync some IP headers with glibc

Solution:
=========

- Synchronize linux's `include/uapi/linux/in6.h'
  with glibc's `inet/netinet/in.h'.
- Synchronize glibc's `inet/netinet/in.h with linux's
  `include/uapi/linux/in6.h'.
- Allow including the headers in either other.
- First header included defines the structures and macros.

Details:
========

The kernel promises not to break the UAPI ABI so I don't
see why we can't just have the two userspace headers
coordinate?

If you include the kernel headers first you get those,
and if you include the glibc headers first you get those,
and the following patch arranges a coordination and
synchronization between the two.

Let's handle `include/uapi/linux/in6.h' from linux,
and `inet/netinet/in.h' from glibc and ensure they compile
in any order and preserve the required ABI.

These two patches pass the following compile tests:

cat >> test1.c <<EOF
int main (void) {
  return 0;
}
EOF
gcc -c test1.c

cat >> test2.c <<EOF
int main (void) {
  return 0;
}
EOF
gcc -c test2.c

One wrinkle is that the kernel has a different name for one of
the members in ipv6_mreq. In the kernel patch we create a macro
to cover the uses of the old name, and while that's not entirely
clean it's one of the best solutions (aside from an anonymous
union which has other issues).

I've reviewed the code and it looks to me like the ABI is
assured and everything matches on both sides.

Notes:
- You want netinet/in.h to include bits/in.h as early as possible,
  but it needs in_addr so define in_addr early.
- You want bits/in.h included as early as possible so you can use
  the linux specific code to define __USE_KERNEL_DEFS based on
  the _UAPI_* macro definition and use those to cull in.h.
- glibc was missing IPPROTO_MH, added here.

Compile tested and inspected.

Reported-by: Thomas Backlund <tmb@mageia.org>
Cc: Thomas Backlund <tmb@mageia.org>
Cc: libc-alpha@sourceware.org
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: David S. Miller <davem@davemloft.net>
Tested-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Carlos O'Donell <carlos@redhat.com>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/Kbuild        |   1 +
 include/uapi/linux/in.h          |  49 ++++++++++++++-----
 include/uapi/linux/in6.h         |  36 +++++++++++---
 include/uapi/linux/libc-compat.h | 103 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 169 insertions(+), 20 deletions(-)
 create mode 100644 include/uapi/linux/libc-compat.h

(limited to 'include/uapi')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 997f9f2f096..e7c94eeb947 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -227,6 +227,7 @@ header-y += kvm_para.h
 endif
 
 header-y += l2tp.h
+header-y += libc-compat.h
 header-y += limits.h
 header-y += llc.h
 header-y += loop.h
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index 9edb441df82..f9e8e496ae5 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -24,30 +24,53 @@
 /* Standard well-defined IP protocols.  */
 enum {
   IPPROTO_IP = 0,		/* Dummy protocol for TCP		*/
+#define IPPROTO_IP		IPPROTO_IP
   IPPROTO_ICMP = 1,		/* Internet Control Message Protocol	*/
+#define IPPROTO_ICMP		IPPROTO_ICMP
   IPPROTO_IGMP = 2,		/* Internet Group Management Protocol	*/
+#define IPPROTO_IGMP		IPPROTO_IGMP
   IPPROTO_IPIP = 4,		/* IPIP tunnels (older KA9Q tunnels use 94) */
+#define IPPROTO_IPIP		IPPROTO_IPIP
   IPPROTO_TCP = 6,		/* Transmission Control Protocol	*/
+#define IPPROTO_TCP		IPPROTO_TCP
   IPPROTO_EGP = 8,		/* Exterior Gateway Protocol		*/
+#define IPPROTO_EGP		IPPROTO_EGP
   IPPROTO_PUP = 12,		/* PUP protocol				*/
+#define IPPROTO_PUP		IPPROTO_PUP
   IPPROTO_UDP = 17,		/* User Datagram Protocol		*/
+#define IPPROTO_UDP		IPPROTO_UDP
   IPPROTO_IDP = 22,		/* XNS IDP protocol			*/
+#define IPPROTO_IDP		IPPROTO_IDP
+  IPPROTO_TP = 29,		/* SO Transport Protocol Class 4	*/
+#define IPPROTO_TP		IPPROTO_TP
   IPPROTO_DCCP = 33,		/* Datagram Congestion Control Protocol */
-  IPPROTO_RSVP = 46,		/* RSVP protocol			*/
+#define IPPROTO_DCCP		IPPROTO_DCCP
+  IPPROTO_IPV6 = 41,		/* IPv6-in-IPv4 tunnelling		*/
+#define IPPROTO_IPV6		IPPROTO_IPV6
+  IPPROTO_RSVP = 46,		/* RSVP Protocol			*/
+#define IPPROTO_RSVP		IPPROTO_RSVP
   IPPROTO_GRE = 47,		/* Cisco GRE tunnels (rfc 1701,1702)	*/
-
-  IPPROTO_IPV6	 = 41,		/* IPv6-in-IPv4 tunnelling		*/
-
-  IPPROTO_ESP = 50,            /* Encapsulation Security Payload protocol */
-  IPPROTO_AH = 51,             /* Authentication Header protocol       */
-  IPPROTO_BEETPH = 94,	       /* IP option pseudo header for BEET */
-  IPPROTO_PIM    = 103,		/* Protocol Independent Multicast	*/
-
-  IPPROTO_COMP   = 108,                /* Compression Header protocol */
-  IPPROTO_SCTP   = 132,		/* Stream Control Transport Protocol	*/
+#define IPPROTO_GRE		IPPROTO_GRE
+  IPPROTO_ESP = 50,		/* Encapsulation Security Payload protocol */
+#define IPPROTO_ESP		IPPROTO_ESP
+  IPPROTO_AH = 51,		/* Authentication Header protocol	*/
+#define IPPROTO_AH		IPPROTO_AH
+  IPPROTO_MTP = 92,		/* Multicast Transport Protocol		*/
+#define IPPROTO_MTP		IPPROTO_MTP
+  IPPROTO_BEETPH = 94,		/* IP option pseudo header for BEET	*/
+#define IPPROTO_BEETPH		IPPROTO_BEETPH
+  IPPROTO_ENCAP = 98,		/* Encapsulation Header			*/
+#define IPPROTO_ENCAP		IPPROTO_ENCAP
+  IPPROTO_PIM = 103,		/* Protocol Independent Multicast	*/
+#define IPPROTO_PIM		IPPROTO_PIM
+  IPPROTO_COMP = 108,		/* Compression Header Protocol		*/
+#define IPPROTO_COMP		IPPROTO_COMP
+  IPPROTO_SCTP = 132,		/* Stream Control Transport Protocol	*/
+#define IPPROTO_SCTP		IPPROTO_SCTP
   IPPROTO_UDPLITE = 136,	/* UDP-Lite (RFC 3828)			*/
-
-  IPPROTO_RAW	 = 255,		/* Raw IP packets			*/
+#define IPPROTO_UDPLITE		IPPROTO_UDPLITE
+  IPPROTO_RAW = 255,		/* Raw IP packets			*/
+#define IPPROTO_RAW		IPPROTO_RAW
   IPPROTO_MAX
 };
 
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index 53b1d56a6e7..440d5c47914 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -22,22 +22,30 @@
 #define _UAPI_LINUX_IN6_H
 
 #include <linux/types.h>
+#include <linux/libc-compat.h>
 
 /*
  *	IPv6 address structure
  */
 
+#if __UAPI_DEF_IN6_ADDR
 struct in6_addr {
 	union {
 		__u8		u6_addr8[16];
+#if __UAPI_DEF_IN6_ADDR_ALT
 		__be16		u6_addr16[8];
 		__be32		u6_addr32[4];
+#endif
 	} in6_u;
 #define s6_addr			in6_u.u6_addr8
+#if __UAPI_DEF_IN6_ADDR_ALT
 #define s6_addr16		in6_u.u6_addr16
 #define s6_addr32		in6_u.u6_addr32
+#endif
 };
+#endif /* __UAPI_DEF_IN6_ADDR */
 
+#if __UAPI_DEF_SOCKADDR_IN6
 struct sockaddr_in6 {
 	unsigned short int	sin6_family;    /* AF_INET6 */
 	__be16			sin6_port;      /* Transport layer port # */
@@ -45,7 +53,9 @@ struct sockaddr_in6 {
 	struct in6_addr		sin6_addr;      /* IPv6 address */
 	__u32			sin6_scope_id;  /* scope id (new in RFC2553) */
 };
+#endif /* __UAPI_DEF_SOCKADDR_IN6 */
 
+#if __UAPI_DEF_IPV6_MREQ
 struct ipv6_mreq {
 	/* IPv6 multicast address of group */
 	struct in6_addr ipv6mr_multiaddr;
@@ -53,6 +63,7 @@ struct ipv6_mreq {
 	/* local IPv6 address of interface */
 	int		ipv6mr_ifindex;
 };
+#endif /* __UAPI_DEF_IVP6_MREQ */
 
 #define ipv6mr_acaddr	ipv6mr_multiaddr
 
@@ -114,13 +125,24 @@ struct in6_flowlabel_req {
 /*
  *	IPV6 extension headers
  */
-#define IPPROTO_HOPOPTS		0	/* IPv6 hop-by-hop options	*/
-#define IPPROTO_ROUTING		43	/* IPv6 routing header		*/
-#define IPPROTO_FRAGMENT	44	/* IPv6 fragmentation header	*/
-#define IPPROTO_ICMPV6		58	/* ICMPv6			*/
-#define IPPROTO_NONE		59	/* IPv6 no next header		*/
-#define IPPROTO_DSTOPTS		60	/* IPv6 destination options	*/
-#define IPPROTO_MH		135	/* IPv6 mobility header		*/
+#if __UAPI_DEF_IPPROTO_V6
+enum {
+  IPPROTO_HOPOPTS = 0,		/* IPv6 hop-by-hop options      */
+#define IPPROTO_HOPOPTS		IPPROTO_HOPOPTS
+  IPPROTO_ROUTING = 43,		/* IPv6 routing header          */
+#define IPPROTO_ROUTING		IPPROTO_ROUTING
+  IPPROTO_FRAGMENT = 44,	/* IPv6 fragmentation header    */
+#define IPPROTO_FRAGMENT	IPPROTO_FRAGMENT
+  IPPROTO_ICMPV6 = 58,		/* ICMPv6                       */
+#define IPPROTO_ICMPV6		IPPROTO_ICMPV6
+  IPPROTO_NONE = 59,		/* IPv6 no next header          */
+#define IPPROTO_NONE		IPPROTO_NONE
+  IPPROTO_DSTOPTS = 60,		/* IPv6 destination options     */
+#define IPPROTO_DSTOPTS		IPPROTO_DSTOPTS
+  IPPROTO_MH = 135,		/* IPv6 mobility header         */
+#define IPPROTO_MH		IPPROTO_MH
+};
+#endif /* __UAPI_DEF_IPPROTO_V6 */
 
 /*
  *	IPv6 TLV options.
diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h
new file mode 100644
index 00000000000..335e8a7cad3
--- /dev/null
+++ b/include/uapi/linux/libc-compat.h
@@ -0,0 +1,103 @@
+/*
+ * Compatibility interface for userspace libc header coordination:
+ *
+ * Define compatibility macros that are used to control the inclusion or
+ * exclusion of UAPI structures and definitions in coordination with another
+ * userspace C library.
+ *
+ * This header is intended to solve the problem of UAPI definitions that
+ * conflict with userspace definitions. If a UAPI header has such conflicting
+ * definitions then the solution is as follows:
+ *
+ * * Synchronize the UAPI header and the libc headers so either one can be
+ *   used and such that the ABI is preserved. If this is not possible then
+ *   no simple compatibility interface exists (you need to write translating
+ *   wrappers and rename things) and you can't use this interface.
+ *
+ * Then follow this process:
+ *
+ * (a) Include libc-compat.h in the UAPI header.
+ *      e.g. #include <linux/libc-compat.h>
+ *     This include must be as early as possible.
+ *
+ * (b) In libc-compat.h add enough code to detect that the comflicting
+ *     userspace libc header has been included first.
+ *
+ * (c) If the userspace libc header has been included first define a set of
+ *     guard macros of the form __UAPI_DEF_FOO and set their values to 1, else
+ *     set their values to 0.
+ *
+ * (d) Back in the UAPI header with the conflicting definitions, guard the
+ *     definitions with:
+ *     #if __UAPI_DEF_FOO
+ *       ...
+ *     #endif
+ *
+ * This fixes the situation where the linux headers are included *after* the
+ * libc headers. To fix the problem with the inclusion in the other order the
+ * userspace libc headers must be fixed like this:
+ *
+ * * For all definitions that conflict with kernel definitions wrap those
+ *   defines in the following:
+ *   #if !__UAPI_DEF_FOO
+ *     ...
+ *   #endif
+ *
+ * This prevents the redefinition of a construct already defined by the kernel.
+ */
+#ifndef _UAPI_LIBC_COMPAT_H
+#define _UAPI_LIBC_COMPAT_H
+
+/* We have included glibc headers... */
+#if defined(__GLIBC__)
+
+/* Coordinate with glibc netinet/in.h header. */
+#if defined(_NETINET_IN_H)
+
+/* GLIBC headers included first so don't define anything
+ * that would already be defined. */
+#define __UAPI_DEF_IN6_ADDR		0
+/* The exception is the in6_addr macros which must be defined
+ * if the glibc code didn't define them. This guard matches
+ * the guard in glibc/inet/netinet/in.h which defines the
+ * additional in6_addr macros e.g. s6_addr16, and s6_addr32. */
+#if defined(__USE_MISC) || defined (__USE_GNU)
+#define __UAPI_DEF_IN6_ADDR_ALT		0
+#else
+#define __UAPI_DEF_IN6_ADDR_ALT		1
+#endif
+#define __UAPI_DEF_SOCKADDR_IN6		0
+#define __UAPI_DEF_IPV6_MREQ		0
+#define __UAPI_DEF_IPPROTO_V6		0
+
+#else
+
+/* Linux headers included first, and we must define everything
+ * we need. The expectation is that glibc will check the
+ * __UAPI_DEF_* defines and adjust appropriately. */
+#define __UAPI_DEF_IN6_ADDR		1
+/* We unconditionally define the in6_addr macros and glibc must
+ * coordinate. */
+#define __UAPI_DEF_IN6_ADDR_ALT		1
+#define __UAPI_DEF_SOCKADDR_IN6		1
+#define __UAPI_DEF_IPV6_MREQ		1
+#define __UAPI_DEF_IPPROTO_V6		1
+
+#endif /* _NETINET_IN_H */
+
+
+/* If we did not see any headers from any supported C libraries,
+ * or we are being included in the kernel, then define everything
+ * that we need. */
+#else /* !defined(__GLIBC__) */
+
+/* Definitions for in6.h */
+#define __UAPI_DEF_IN6_ADDR		1
+#define __UAPI_DEF_IN6_ADDR_ALT		1
+#define __UAPI_DEF_SOCKADDR_IN6		1
+#define __UAPI_DEF_IPV6_MREQ		1
+#define __UAPI_DEF_IPPROTO_V6		1
+
+#endif /* __GLIBC__ */
+
+#endif /* _UAPI_LIBC_COMPAT_H */
-- 
cgit v1.2.3-70-g09d2