From 7085130bab2f9c5b8d61bff73b01dc8195d0f974 Mon Sep 17 00:00:00 2001
From: Daniele Di Proietto
Date: Thu, 23 Jan 2014 10:56:49 -0800
Subject: openvswitch: use const in some local vars and casts

In a few functions, const formal parameters are assigned or cast to
non-const. These changes suppress the warnings produced when compiling
with -Wcast-qual.

Signed-off-by: Daniele Di Proietto
Signed-off-by: Jesse Gross
---
 net/openvswitch/flow_table.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net/openvswitch/flow_table.c')

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 3c268b3d71c..1ba1e0b8ade 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -57,8 +57,10 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range)
 void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 		       const struct sw_flow_mask *mask)
 {
-	const long *m = (long *)((u8 *)&mask->key + mask->range.start);
-	const long *s = (long *)((u8 *)src + mask->range.start);
+	const long *m = (const long *)((const u8 *)&mask->key +
+				       mask->range.start);
+	const long *s = (const long *)((const u8 *)src +
+				       mask->range.start);
 	long *d = (long *)((u8 *)dst + mask->range.start);
 	int i;
 
@@ -375,7 +377,7 @@ int ovs_flow_tbl_flush(struct flow_table *flow_table)
 static u32 flow_hash(const struct sw_flow_key *key, int key_start,
 		     int key_end)
 {
-	u32 *hash_key = (u32 *)((u8 *)key + key_start);
+	const u32 *hash_key = (const u32 *)((const u8 *)key + key_start);
 	int hash_u32s = (key_end - key_start) >> 2;
 
 	/* Make sure number of hash bytes are multiple of u32. */
@@ -397,8 +399,8 @@ static bool cmp_key(const struct sw_flow_key *key1,
 		    const struct sw_flow_key *key2,
 		    int key_start, int key_end)
 {
-	const long *cp1 = (long *)((u8 *)key1 + key_start);
-	const long *cp2 = (long *)((u8 *)key2 + key_start);
+	const long *cp1 = (const long *)((const u8 *)key1 + key_start);
+	const long *cp2 = (const long *)((const u8 *)key2 + key_start);
 	long diffs = 0;
 	int i;
 
@@ -513,8 +515,8 @@ static struct sw_flow_mask *mask_alloc(void)
 static bool mask_equal(const struct sw_flow_mask *a,
 		       const struct sw_flow_mask *b)
 {
-	u8 *a_ = (u8 *)&a->key + a->range.start;
-	u8 *b_ = (u8 *)&b->key + b->range.start;
+	const u8 *a_ = (const u8 *)&a->key + a->range.start;
+	const u8 *b_ = (const u8 *)&b->key + b->range.start;
 
 	return  (a->range.end == b->range.end)
 		&& (a->range.start == b->range.start)
-- cgit v1.2.3-70-g09d2


From 23dabf88abb48a866fdb19ee08ebcf1ddd9b1840 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 27 Mar 2014 12:35:23 -0700
Subject: openvswitch: Remove 5-tuple optimization.

The 5-tuple optimization becomes unnecessary with a later per-NUMA-node
stats patch. Remove it first to make the changes easier to grasp.
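
As an aside on the first patch above: -Wcast-qual fires whenever a cast
silently drops a const qualifier, which is exactly what the old casts in
ovs_flow_mask_key() and friends did. A minimal standalone sketch of the
pattern, using a hypothetical struct rather than the kernel types:

/* example.c: compile with "gcc -Wcast-qual -c example.c".
 * 'struct key' is a made-up stand-in for sw_flow_key; only the
 * cast pattern is the point here.
 */
struct key {
	unsigned long bits[4];
};

static long first_word(const struct key *k, int start)
{
	/* This form discards the qualifier and -Wcast-qual warns on it:
	 *
	 *	const long *p = (long *)((unsigned char *)k + start);
	 *
	 * Carrying const through every intermediate cast is silent:
	 */
	const long *p = (const long *)((const unsigned char *)k + start);

	return *p;
}
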
Signed-off-by: Jarno Rajahalme Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 11 ++++---- net/openvswitch/flow.c | 32 +++++++++-------------- net/openvswitch/flow.h | 10 +------- net/openvswitch/flow_netlink.c | 58 +++--------------------------------------- net/openvswitch/flow_netlink.h | 1 - net/openvswitch/flow_table.c | 31 +++++++--------------- net/openvswitch/flow_table.h | 2 +- 7 files changed, 32 insertions(+), 113 deletions(-) (limited to 'net/openvswitch/flow_table.c') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a3276e3c4fe..8867d7e2d65 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -524,7 +524,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) packet->protocol = htons(ETH_P_802_2); /* Build an sw_flow for sending this packet. */ - flow = ovs_flow_alloc(false); + flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; @@ -782,7 +782,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_actions *acts = NULL; struct sw_flow_match match; - bool exact_5tuple; int error; /* Extract key. */ @@ -791,7 +790,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto error; ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, &exact_5tuple, + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); if (error) goto error; @@ -830,7 +829,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) goto err_unlock_ovs; /* Allocate flow. */ - flow = ovs_flow_alloc(!exact_5tuple); + flow = ovs_flow_alloc(); if (IS_ERR(flow)) { error = PTR_ERR(flow); goto err_unlock_ovs; @@ -914,7 +913,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) } ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) return err; @@ -968,7 +967,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) } ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, NULL, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); if (err) goto unlock; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 332aa01492f..aad7a8da70b 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -66,10 +66,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) struct flow_stats *stats; __be16 tcp_flags = 0; - if (!flow->stats.is_percpu) - stats = flow->stats.stat; - else - stats = this_cpu_ptr(flow->stats.cpu_stats); + stats = this_cpu_ptr(flow->stats); if ((flow->key.eth.type == htons(ETH_P_IP) || flow->key.eth.type == htons(ETH_P_IPV6)) && @@ -110,16 +107,14 @@ void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, memset(ovs_stats, 0, sizeof(*ovs_stats)); local_bh_disable(); - if (!flow->stats.is_percpu) { - stats_read(flow->stats.stat, ovs_stats, used, tcp_flags); - } else { - for_each_possible_cpu(cpu) { - struct flow_stats *stats; - - stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); - stats_read(stats, ovs_stats, used, tcp_flags); - } + + for_each_possible_cpu(cpu) { + struct flow_stats *stats; + + stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); + stats_read(stats, ovs_stats, used, tcp_flags); } + local_bh_enable(); } @@ -138,13 +133,10 @@ void 
ovs_flow_stats_clear(struct sw_flow *flow) int cpu; local_bh_disable(); - if (!flow->stats.is_percpu) { - stats_reset(flow->stats.stat); - } else { - for_each_possible_cpu(cpu) { - stats_reset(per_cpu_ptr(flow->stats.cpu_stats, cpu)); - } - } + + for_each_possible_cpu(cpu) + stats_reset(per_cpu_ptr(flow->stats, cpu)); + local_bh_enable(); } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 2d770e28a3a..9c0dd8aa311 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -155,14 +155,6 @@ struct flow_stats { __be16 tcp_flags; /* Union of seen TCP flags. */ }; -struct sw_flow_stats { - bool is_percpu; - union { - struct flow_stats *stat; - struct flow_stats __percpu *cpu_stats; - }; -}; - struct sw_flow { struct rcu_head rcu; struct hlist_node hash_node[2]; @@ -172,7 +164,7 @@ struct sw_flow { struct sw_flow_key unmasked_key; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - struct sw_flow_stats stats; + struct flow_stats __percpu *stats; }; struct arp_eth_header { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5511ad18be5..84caa99b376 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -268,20 +268,6 @@ static bool is_all_zero(const u8 *fp, size_t size) return true; } -static bool is_all_set(const u8 *fp, size_t size) -{ - int i; - - if (!fp) - return false; - - for (i = 0; i < size; i++) - if (fp[i] != 0xff) - return false; - - return true; -} - static int __parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], u64 *attrsp, bool nz) @@ -503,9 +489,8 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple, - u64 attrs, const struct nlattr **a, - bool is_mask) +static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, + const struct nlattr **a, bool is_mask) { int err; u64 orig_attrs = attrs; @@ -562,11 +547,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); } - if (is_mask && exact_5tuple) { - if (match->mask->key.eth.type != htons(0xffff)) - *exact_5tuple = false; - } - if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { const struct ovs_key_ipv4 *ipv4_key; @@ -589,13 +569,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple SW_FLOW_KEY_PUT(match, ipv4.addr.dst, ipv4_key->ipv4_dst, is_mask); attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - - if (is_mask && exact_5tuple && *exact_5tuple) { - if (ipv4_key->ipv4_proto != 0xff || - ipv4_key->ipv4_src != htonl(0xffffffff) || - ipv4_key->ipv4_dst != htonl(0xffffffff)) - *exact_5tuple = false; - } } if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { @@ -627,15 +600,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple is_mask); attrs &= ~(1 << OVS_KEY_ATTR_IPV6); - - if (is_mask && exact_5tuple && *exact_5tuple) { - if (ipv6_key->ipv6_proto != 0xff || - !is_all_set((const u8 *)ipv6_key->ipv6_src, - sizeof(match->key->ipv6.addr.src)) || - !is_all_set((const u8 *)ipv6_key->ipv6_dst, - sizeof(match->key->ipv6.addr.dst))) - *exact_5tuple = false; - } } if (attrs & (1 << OVS_KEY_ATTR_ARP)) { @@ -678,11 +642,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple tcp_key->tcp_dst, is_mask); } attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - if (is_mask && exact_5tuple && *exact_5tuple && - (tcp_key->tcp_src != htons(0xffff) || - tcp_key->tcp_dst != 
htons(0xffff))) - *exact_5tuple = false; } if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { @@ -714,11 +673,6 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, bool *exact_5tuple udp_key->udp_dst, is_mask); } attrs &= ~(1 << OVS_KEY_ATTR_UDP); - - if (is_mask && exact_5tuple && *exact_5tuple && - (udp_key->udp_src != htons(0xffff) || - udp_key->udp_dst != htons(0xffff))) - *exact_5tuple = false; } if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { @@ -804,7 +758,6 @@ static void sw_flow_mask_set(struct sw_flow_mask *mask, * attribute specifies the mask field of the wildcarded flow. */ int ovs_nla_get_match(struct sw_flow_match *match, - bool *exact_5tuple, const struct nlattr *key, const struct nlattr *mask) { @@ -852,13 +805,10 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, NULL, key_attrs, a, false); + err = ovs_key_from_nlattrs(match, key_attrs, a, false); if (err) return err; - if (exact_5tuple) - *exact_5tuple = true; - if (mask) { err = parse_flow_mask_nlattrs(mask, a, &mask_attrs); if (err) @@ -896,7 +846,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, exact_5tuple, mask_attrs, a, true); + err = ovs_key_from_nlattrs(match, mask_attrs, a, true); if (err) return err; } else { diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index b31fbe28bc7..440151045d3 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -45,7 +45,6 @@ int ovs_nla_put_flow(const struct sw_flow_key *, int ovs_nla_get_flow_metadata(struct sw_flow *flow, const struct nlattr *attr); int ovs_nla_get_match(struct sw_flow_match *match, - bool *exact_5tuple, const struct nlattr *, const struct nlattr *); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 1ba1e0b8ade..aa92da23053 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -72,7 +72,7 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, *d++ = *s++ & *m++; } -struct sw_flow *ovs_flow_alloc(bool percpu_stats) +struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; int cpu; @@ -84,25 +84,15 @@ struct sw_flow *ovs_flow_alloc(bool percpu_stats) flow->sf_acts = NULL; flow->mask = NULL; - flow->stats.is_percpu = percpu_stats; + flow->stats = alloc_percpu(struct flow_stats); + if (!flow->stats) + goto err; - if (!percpu_stats) { - flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL); - if (!flow->stats.stat) - goto err; + for_each_possible_cpu(cpu) { + struct flow_stats *cpu_stats; - spin_lock_init(&flow->stats.stat->lock); - } else { - flow->stats.cpu_stats = alloc_percpu(struct flow_stats); - if (!flow->stats.cpu_stats) - goto err; - - for_each_possible_cpu(cpu) { - struct flow_stats *cpu_stats; - - cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); - spin_lock_init(&cpu_stats->lock); - } + cpu_stats = per_cpu_ptr(flow->stats, cpu); + spin_lock_init(&cpu_stats->lock); } return flow; err: @@ -141,10 +131,7 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) static void flow_free(struct sw_flow *flow) { kfree((struct sf_flow_acts __force *)flow->sf_acts); - if (flow->stats.is_percpu) - free_percpu(flow->stats.cpu_stats); - else - kfree(flow->stats.stat); + free_percpu(flow->stats); kmem_cache_free(flow_cache, flow); } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index baaeb101924..c26c59a7ab5 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -55,7 
+55,7 @@ struct flow_table {
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
 
-struct sw_flow *ovs_flow_alloc(bool percpu_stats);
+struct sw_flow *ovs_flow_alloc(void);
 void ovs_flow_free(struct sw_flow *, bool deferred);
 
 int ovs_flow_tbl_init(struct flow_table *);
-- cgit v1.2.3-70-g09d2


From 63e7959c4b9bd6f791061c460a22d9ee32ae2240 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Thu, 27 Mar 2014 12:42:54 -0700
Subject: openvswitch: Per NUMA node flow stats.

Keep kernel flow stats for each NUMA node rather than each (logical)
CPU. This avoids using the per-CPU allocator, removes most of the
kernel-side OVS locking overhead that otherwise sits at the top of perf
reports, and allows OVS to scale better with a higher number of
threads.

With 9 handlers and 4 revalidators, the netperf TCP_CRR flow setup rate
doubles on a server with two hyper-threaded physical CPUs (16 logical
cores each) compared to the current OVS master. Tested with a
non-trivial flow table with a TCP port match rule forcing all new
connections with unique port numbers to OVS userspace. The IP addresses
are still wildcarded, so the kernel flows are not considered exact-match
5-tuple flows. Flows of this type can be expected to appear in large
numbers as the result of the more effective wildcarding made possible
by improvements in the OVS userspace flow classifier.

Perf results for this test (master):

Events: 305K cycles
+   8.43%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   5.64%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   4.75%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   3.32%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   2.61%  ovs-vswitchd  [kernel.kallsyms]   [k] pcpu_alloc_area
+   2.19%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.03%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.84%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   1.64%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.58%  ovs-vswitchd  libc-2.15.so        [.] 0x7f4e6
+   1.07%  ovs-vswitchd  [kernel.kallsyms]   [k] memset
+   1.03%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
+   0.92%  swapper       [kernel.kallsyms]   [k] __ticket_spin_lock
...

And after this patch:

Events: 356K cycles
+   6.85%  ovs-vswitchd  ovs-vswitchd        [.] find_match_wc
+   4.63%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_lock
+   3.06%  ovs-vswitchd  [kernel.kallsyms]   [k] __ticket_spin_lock
+   2.81%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask_range
+   2.51%  ovs-vswitchd  libpthread-2.15.so  [.] pthread_mutex_unlock
+   2.27%  ovs-vswitchd  ovs-vswitchd        [.] classifier_lookup
+   1.84%  ovs-vswitchd  libc-2.15.so        [.] 0x15d30f
+   1.74%  ovs-vswitchd  [kernel.kallsyms]   [k] mutex_spin_on_owner
+   1.47%  swapper       [kernel.kallsyms]   [k] intel_idle
+   1.34%  ovs-vswitchd  ovs-vswitchd        [.] flow_hash_in_minimask
+   1.33%  ovs-vswitchd  ovs-vswitchd        [.] rule_actions_unref
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] hindex_node_with_hash
+   1.16%  ovs-vswitchd  ovs-vswitchd        [.] do_xlate_actions
+   1.09%  ovs-vswitchd  ovs-vswitchd        [.] ofproto_rule_ref
+   1.01%  netperf       [kernel.kallsyms]   [k] __ticket_spin_lock
...

There is a small increase in kernel spinlock overhead due to the same
spinlock being shared between multiple cores of the same physical CPU,
but it is barely visible in the netperf TCP_CRR test performance (maybe
a ~1% drop, hard to tell exactly due to variance in the test results)
when testing kernel module throughput (with no userspace activity and
only a handful of kernel flows).

On flow setup, a single stats instance is allocated (for NUMA node 0). 
As CPUs from multiple NUMA nodes start updating stats, new NUMA-node specific stats instances are allocated. This allocation on the packet processing code path is made to never block or look for emergency memory pools, minimizing the allocation latency. If the allocation fails, the existing preallocated stats instance is used. Also, if only CPUs from one NUMA-node are updating the preallocated stats instance, no additional stats instances are allocated. This eliminates the need to pre-allocate stats instances that will not be used, also relieving the stats reader from the burden of reading stats that are never used. Signed-off-by: Jarno Rajahalme Acked-by: Pravin B Shelar Signed-off-by: Jesse Gross --- net/openvswitch/flow.c | 119 ++++++++++++++++++++++++++++--------------- net/openvswitch/flow.h | 10 +++- net/openvswitch/flow_table.c | 46 +++++++++++++---- net/openvswitch/flow_table.h | 2 + 4 files changed, 122 insertions(+), 55 deletions(-) (limited to 'net/openvswitch/flow_table.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index aad7a8da70b..432f04d5c89 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -65,8 +65,9 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) { struct flow_stats *stats; __be16 tcp_flags = 0; + int node = numa_node_id(); - stats = this_cpu_ptr(flow->stats); + stats = rcu_dereference(flow->stats[node]); if ((flow->key.eth.type == htons(ETH_P_IP) || flow->key.eth.type == htons(ETH_P_IPV6)) && @@ -76,68 +77,102 @@ void ovs_flow_stats_update(struct sw_flow *flow, struct sk_buff *skb) tcp_flags = TCP_FLAGS_BE16(tcp_hdr(skb)); } - spin_lock(&stats->lock); + /* Check if already have node-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (node == 0 && unlikely(flow->stats_last_writer != node)) + flow->stats_last_writer = node; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); + + /* If the current NUMA-node is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != node)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. If node-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != NUMA_NO_NODE) + && likely(!rcu_dereference(flow->stats[node]))) { + /* Try to allocate node-specific stats. 
*/ + struct flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_THISNODE | + __GFP_NOMEMALLOC, + node); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = skb->len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[node], + new_stats); + goto unlock; + } + } + flow->stats_last_writer = node; + } + } + stats->used = jiffies; stats->packet_count++; stats->byte_count += skb->len; stats->tcp_flags |= tcp_flags; - spin_unlock(&stats->lock); -} - -static void stats_read(struct flow_stats *stats, - struct ovs_flow_stats *ovs_stats, - unsigned long *used, __be16 *tcp_flags) -{ - spin_lock(&stats->lock); - if (!*used || time_after(stats->used, *used)) - *used = stats->used; - *tcp_flags |= stats->tcp_flags; - ovs_stats->n_packets += stats->packet_count; - ovs_stats->n_bytes += stats->byte_count; +unlock: spin_unlock(&stats->lock); } void ovs_flow_stats_get(struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int cpu; + int node; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - local_bh_disable(); - - for_each_possible_cpu(cpu) { - struct flow_stats *stats; + for_each_node(node) { + struct flow_stats *stats = rcu_dereference(flow->stats[node]); - stats = per_cpu_ptr(flow->stats.cpu_stats, cpu); - stats_read(stats, ovs_stats, used, tcp_flags); + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); + } } - - local_bh_enable(); -} - -static void stats_reset(struct flow_stats *stats) -{ - spin_lock(&stats->lock); - stats->used = 0; - stats->packet_count = 0; - stats->byte_count = 0; - stats->tcp_flags = 0; - spin_unlock(&stats->lock); } void ovs_flow_stats_clear(struct sw_flow *flow) { - int cpu; - - local_bh_disable(); - - for_each_possible_cpu(cpu) - stats_reset(per_cpu_ptr(flow->stats, cpu)); - - local_bh_enable(); + int node; + + for_each_node(node) { + struct flow_stats *stats = rcu_dereference(flow->stats[node]); + + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } } static int check_header(struct sk_buff *skb, int len) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9c0dd8aa311..ddcebc53224 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -159,12 +159,18 @@ struct sw_flow { struct rcu_head rcu; struct hlist_node hash_node[2]; u32 hash; - + int stats_last_writer; /* NUMA-node id of the last writer on + * 'stats[0]'. + */ struct sw_flow_key key; struct sw_flow_key unmasked_key; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - struct flow_stats __percpu *stats; + struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. 
+					 */
 };
 
 struct arp_eth_header {
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index aa92da23053..d8ef37b937b 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -48,6 +48,7 @@
 #define REHASH_INTERVAL		(10 * 60 * HZ)
 
 static struct kmem_cache *flow_cache;
+struct kmem_cache *flow_stats_cache __read_mostly;
 
 static u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
@@ -75,7 +76,8 @@ void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
 struct sw_flow *ovs_flow_alloc(void)
 {
 	struct sw_flow *flow;
-	int cpu;
+	struct flow_stats *stats;
+	int node;
 
 	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
 	if (!flow)
@@ -83,17 +85,22 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	flow->sf_acts = NULL;
 	flow->mask = NULL;
+	flow->stats_last_writer = NUMA_NO_NODE;
 
-	flow->stats = alloc_percpu(struct flow_stats);
-	if (!flow->stats)
+	/* Initialize the default stat node. */
+	stats = kmem_cache_alloc_node(flow_stats_cache,
+				      GFP_KERNEL | __GFP_ZERO, 0);
+	if (!stats)
 		goto err;
 
-	for_each_possible_cpu(cpu) {
-		struct flow_stats *cpu_stats;
+	spin_lock_init(&stats->lock);
+
+	RCU_INIT_POINTER(flow->stats[0], stats);
+
+	for_each_node(node)
+		if (node != 0)
+			RCU_INIT_POINTER(flow->stats[node], NULL);
 
-		cpu_stats = per_cpu_ptr(flow->stats, cpu);
-		spin_lock_init(&cpu_stats->lock);
-	}
 	return flow;
 err:
 	kmem_cache_free(flow_cache, flow);
@@ -130,8 +137,13 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets)
 
 static void flow_free(struct sw_flow *flow)
 {
+	int node;
+
 	kfree((struct sf_flow_acts __force *)flow->sf_acts);
-	free_percpu(flow->stats);
+	for_each_node(node)
+		if (flow->stats[node])
+			kmem_cache_free(flow_stats_cache,
+					(struct flow_stats __force *)flow->stats[node]);
 	kmem_cache_free(flow_cache, flow);
 }
 
@@ -586,16 +598,28 @@ int ovs_flow_init(void)
 	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
-	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
-					0, NULL);
+	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+				       + (num_possible_nodes()
+					  * sizeof(struct flow_stats *)),
+				       0, 0, NULL);
 	if (flow_cache == NULL)
 		return -ENOMEM;
 
+	flow_stats_cache
+		= kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats),
+				    0, SLAB_HWCACHE_ALIGN, NULL);
+	if (flow_stats_cache == NULL) {
+		kmem_cache_destroy(flow_cache);
+		flow_cache = NULL;
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 /* Uninitializes the flow module. */
 void ovs_flow_exit(void)
 {
+	kmem_cache_destroy(flow_stats_cache);
 	kmem_cache_destroy(flow_cache);
 }
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index c26c59a7ab5..ca8a5820f61 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -52,6 +52,8 @@ struct flow_table {
 	unsigned int count;
 };
 
+extern struct kmem_cache *flow_stats_cache;
+
 int ovs_flow_init(void);
 void ovs_flow_exit(void);
-- cgit v1.2.3-70-g09d2


From 56c19868e115fcf8d62d843e1b9616bb9837d0db Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Mon, 5 May 2014 13:24:53 -0700
Subject: openvswitch: Make flow mask removal symmetric.

Masks are inserted when flows are inserted into the table, so it is
logical to correspondingly remove masks when flows are removed from the
table, in ovs_flow_tbl_remove(). This allows ovs_flow_free() to be
called without locking, which will be used by later patches.
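
Condensed, the scheme this patch centralizes looks as follows: the
mask's refcount is manipulated only under the OVS mutex, while the
final free is deferred past an RCU grace period so that lookups still
walking the mask list never touch freed memory. A minimal sketch with
simplified types (not the kernel code; the real version is in the diff
below):

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Simplified stand-in for sw_flow_mask. */
struct mask {
	struct list_head list;	/* on the table's mask list (RCU-read) */
	struct rcu_head rcu;
	int ref_count;		/* protected by the OVS mutex */
};

/* Drop one reference; the caller must hold the OVS mutex. */
static void mask_put(struct mask *mask)
{
	if (--mask->ref_count)
		return;

	/* Unlink now; concurrent RCU readers may still traverse it. */
	list_del_rcu(&mask->list);

	/* Free only after all in-flight RCU read-side critical
	 * sections finish, so a reader holding the pointer is safe.
	 */
	kfree_rcu(mask, rcu);
}
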
Signed-off-by: Jarno Rajahalme
Signed-off-by: Pravin B Shelar
---
 net/openvswitch/flow_table.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

(limited to 'net/openvswitch/flow_table.c')

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d8ef37b937b..c80df6f42fe 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -159,25 +159,6 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred)
 	if (!flow)
 		return;
 
-	if (flow->mask) {
-		struct sw_flow_mask *mask = flow->mask;
-
-		/* ovs-lock is required to protect mask-refcount and
-		 * mask list.
-		 */
-		ASSERT_OVSL();
-		BUG_ON(!mask->ref_count);
-		mask->ref_count--;
-
-		if (!mask->ref_count) {
-			list_del_rcu(&mask->list);
-			if (deferred)
-				kfree_rcu(mask, rcu);
-			else
-				kfree(mask);
-		}
-	}
-
 	if (deferred)
 		call_rcu(&flow->rcu, rcu_free_flow_callback);
 	else
@@ -491,6 +472,25 @@ static struct table_instance *table_instance_expand(struct table_instance *ti)
 	return table_instance_rehash(ti, ti->n_buckets * 2);
 }
 
+/* Remove 'mask' from the mask list, if it is not needed any more. */
+static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
+{
+	if (mask) {
+		/* ovs-lock is required to protect mask-refcount and
+		 * mask list.
+		 */
+		ASSERT_OVSL();
+		BUG_ON(!mask->ref_count);
+		mask->ref_count--;
+
+		if (!mask->ref_count) {
+			list_del_rcu(&mask->list);
+			kfree_rcu(mask, rcu);
+		}
+	}
+}
+
+/* Must be called with OVS mutex held. */
 void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
 {
 	struct table_instance *ti = ovsl_dereference(table->ti);
@@ -498,6 +498,11 @@ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
 	BUG_ON(table->count == 0);
 	hlist_del_rcu(&flow->hash_node[ti->node_ver]);
 	table->count--;
+
+	/* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
+	 * accessible as long as the RCU read lock is held.
+	 */
+	flow_mask_remove(table, flow->mask);
 }
 
 static struct sw_flow_mask *mask_alloc(void)
-- cgit v1.2.3-70-g09d2


From eb07265904d6ee95497aba0f3cbd2ae6d9c39a97 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme
Date: Mon, 5 May 2014 14:15:18 -0700
Subject: openvswitch: Fix typo.

The incorrect struct name was confusing, even though otherwise
inconsequential.

Signed-off-by: Jarno Rajahalme
Signed-off-by: Pravin B Shelar
---
 net/openvswitch/flow_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/openvswitch/flow_table.c')

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index c80df6f42fe..574c3abc9b3 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -139,7 +139,7 @@ static void flow_free(struct sw_flow *flow)
 {
 	int node;
 
-	kfree((struct sf_flow_acts __force *)flow->sf_acts);
+	kfree((struct sw_flow_actions __force *)flow->sf_acts);
 	for_each_node(node)
 		if (flow->stats[node])
 			kmem_cache_free(flow_stats_cache,
-- cgit v1.2.3-70-g09d2


From 4a46b24e147dfa9b858026da02cad0bdd4e149d2 Mon Sep 17 00:00:00 2001
From: Alex Wang
Date: Mon, 30 Jun 2014 20:30:29 -0700
Subject: openvswitch: Use exact lookup for flow_get and flow_del.

Due to a race condition in userspace, there is a chance that two
overlapping megaflows could be installed in the datapath. This leaves
userspace unable to delete the less inclusive megaflow even after it
times out, since the flow_del logic stops at the first masked-flow
match. This commit fixes the bug by making the kernel flow_del and
flow_get logic check all masks in that case.

Introduced by commit 03f0d916a ("openvswitch: Mega flow implementation").

Signed-off-by: Alex Wang
Acked-by: Andy Zhou
Signed-off-by: Pravin B Shelar
---
 net/openvswitch/datapath.c   | 23 +++++++++++------------
 net/openvswitch/flow_table.c | 16 ++++++++++++++++
 net/openvswitch/flow_table.h |  3 ++-
 3 files changed, 29 insertions(+), 13 deletions(-)

(limited to 'net/openvswitch/flow_table.c')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index a863678c50a..9db4bf6740d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -889,8 +889,11 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
 		}
 		/* The unmasked key has to be the same for flow updates. */
 		if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
-			error = -EEXIST;
-			goto err_unlock_ovs;
+			flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+			if (!flow) {
+				error = -ENOENT;
+				goto err_unlock_ovs;
+			}
 		}
 		/* Update actions. */
 		old_acts = ovsl_dereference(flow->sf_acts);
@@ -981,16 +984,12 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_ovs;
 	}
 	/* Check that the flow exists. */
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
 	if (unlikely(!flow)) {
 		error = -ENOENT;
 		goto err_unlock_ovs;
 	}
-	/* The unmasked key has to be the same for flow updates. */
-	if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) {
-		error = -EEXIST;
-		goto err_unlock_ovs;
-	}
+
 	/* Update actions, if present. */
 	if (likely(acts)) {
 		old_acts = ovsl_dereference(flow->sf_acts);
@@ -1063,8 +1062,8 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
-	if (!flow || !ovs_flow_cmp_unmasked_key(flow, &match)) {
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+	if (!flow) {
 		err = -ENOENT;
 		goto unlock;
 	}
@@ -1113,8 +1112,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		goto unlock;
 	}
 
-	flow = ovs_flow_tbl_lookup(&dp->table, &key);
-	if (unlikely(!flow || !ovs_flow_cmp_unmasked_key(flow, &match))) {
+	flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
+	if (unlikely(!flow)) {
 		err = -ENOENT;
 		goto unlock;
 	}
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 574c3abc9b3..cf2d853646f 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -456,6 +456,22 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 
 	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
 }
 
+struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
+					  struct sw_flow_match *match)
+{
+	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
+	struct sw_flow_mask *mask;
+	struct sw_flow *flow;
+
+	/* Always called under ovs-mutex. 
*/ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_flow_cmp_unmasked_key(flow, match)) /* Found */ + return flow; + } + return NULL; +} + int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct sw_flow_mask *mask; diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index ca8a5820f61..5918bff7f3f 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -76,7 +76,8 @@ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, u32 *n_mask_hit); struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, const struct sw_flow_key *); - +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + struct sw_flow_match *match); bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, struct sw_flow_match *match); -- cgit v1.2.3-70-g09d2
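
To make the ambiguity concrete, here is a small userspace-style
demonstration of why a masked lookup alone cannot identify the right
flow once megaflows overlap (plain C with a hypothetical two-field key;
nothing here is kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct key {
	uint16_t tcp_src;
	uint16_t tcp_dst;
};

/* A flow matches a packet key if they agree on all masked bits. */
static bool masked_match(struct key pkt, struct key flow, struct key mask)
{
	return (pkt.tcp_src & mask.tcp_src) == (flow.tcp_src & mask.tcp_src) &&
	       (pkt.tcp_dst & mask.tcp_dst) == (flow.tcp_dst & mask.tcp_dst);
}

int main(void)
{
	/* Two overlapping megaflows, as installed by the userspace race:
	 * one wildcards tcp_src, the other matches it exactly.
	 */
	struct key wide        = { .tcp_src = 0,      .tcp_dst = 80 };
	struct key wide_mask   = { .tcp_src = 0x0000, .tcp_dst = 0xffff };
	struct key narrow      = { .tcp_src = 1234,   .tcp_dst = 80 };
	struct key narrow_mask = { .tcp_src = 0xffff, .tcp_dst = 0xffff };

	/* A delete request for the narrow flow carries its unmasked key. */
	struct key del = { .tcp_src = 1234, .tcp_dst = 80 };

	/* Both lines print 1: each flow matches under its own mask, so a
	 * lookup that stops at the first masked hit may return the wide
	 * flow instead of the one actually being deleted.
	 */
	printf("wide matches:   %d\n", masked_match(del, wide, wide_mask));
	printf("narrow matches: %d\n", masked_match(del, narrow, narrow_mask));
	return 0;
}

This is exactly why ovs_flow_tbl_lookup_exact() above iterates every
mask and then compares the unmasked keys before concluding that no such
flow exists.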