From 8c6b00c816191ded80d1ccd5164b53168255ec15 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Sun, 17 Aug 2014 16:29:43 +0300 Subject: net/openvswitch/flow.c: Replace rcu_dereference() with rcu_access_pointer() The "rcu_dereference()" call is used directly in a condition. Since its return value is never dereferenced it is recommended to use "rcu_access_pointer()" instead of "rcu_dereference()". Therefore, this patch makes the replacement. The following Coccinelle semantic patch was used: @@ @@ ( if( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} | while( (<+... - rcu_dereference + rcu_access_pointer (...) ...+>)) {...} ) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d07ab538fc9..7064da92f42 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -89,7 +89,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, * allocated stats as we have already locked them. */ if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_dereference(flow->stats[node]))) { + && likely(!rcu_access_pointer(flow->stats[node]))) { /* Try to allocate node-specific stats. */ struct flow_stats *new_stats; -- cgit v1.2.3-70-g09d2 From 83c8df26a3b654871c0503fcf6eac61777e12ea1 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 15 Sep 2014 19:20:31 -0700 Subject: openvswitch: refactor ovs flow extract API. OVS flow extract is called on packet receive or packet execute code path. Following patch defines separate API for extracting flow-key in packet execute code path. Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/datapath.c | 21 ++++++++++++------ net/openvswitch/datapath.h | 5 ++++- net/openvswitch/flow.c | 49 ++++++++++++++++++++++++++++++------------ net/openvswitch/flow.h | 6 +++++- net/openvswitch/flow_netlink.c | 22 ++++++------------- net/openvswitch/flow_netlink.h | 4 ++-- net/openvswitch/vport.c | 5 +++-- 7 files changed, 71 insertions(+), 41 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 458096da138..0cce8e60d5e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -237,8 +237,9 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +void ovs_dp_process_received_packet(struct sk_buff *skb) { + const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct dp_stats_percpu *stats; @@ -250,7 +251,7 @@ void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) stats = this_cpu_ptr(dp->stats_percpu); /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key); + error = ovs_flow_key_extract(skb, &key); if (unlikely(error)) { kfree_skb(skb); return; @@ -514,6 +515,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct sw_flow *flow; struct datapath *dp; struct ethhdr *eth; + struct vport *input_vport; int len; int err; @@ -548,13 +550,11 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key); + err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, + &flow->key); if (err) goto err_flow_free; - err = ovs_nla_get_flow_metadata(flow, a[OVS_PACKET_ATTR_KEY]); - if (err) - goto err_flow_free; acts = ovs_nla_alloc_flow_actions(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); err = PTR_ERR(acts); if (IS_ERR(acts)) @@ -576,6 +576,15 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!dp) goto err_unlock; + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + OVS_CB(packet)->input_vport = input_vport; + local_bh_disable(); err = ovs_execute_actions(dp, packet, &flow->key); local_bh_enable(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index b576483aab7..2b982fae6a1 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -97,10 +97,13 @@ struct datapath { * @flow: The flow associated with this packet. May be %NULL if no flow. * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the * packet is not being tunneled. + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. */ struct ovs_skb_cb { struct sw_flow *flow; struct ovs_key_ipv4_tunnel *tun_key; + struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -181,7 +184,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_process_received_packet(struct sk_buff *); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 7064da92f42..d186eb65a39 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -16,8 +16,6 @@ * 02110-1301, USA */ -#include "flow.h" -#include "datapath.h" #include #include #include @@ -46,6 +44,10 @@ #include #include +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" + u64 ovs_flow_used_time(unsigned long flow_jiffies) { struct timespec cur_ts; @@ -420,10 +422,9 @@ invalid: } /** - * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * key_extract - extracts a flow key from an Ethernet frame. * @skb: sk_buff that contains the frame, with skb->data pointing to the * Ethernet header - * @in_port: port number on which @skb was received. * @key: output flow key * * The caller must ensure that skb->len >= ETH_HLEN. @@ -442,19 +443,11 @@ invalid: * of a correct length, otherwise the same as skb->network_header. * For other key->eth.type values it is left untouched. */ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { int error; struct ethhdr *eth; - memset(key, 0, sizeof(*key)); - - key->phy.priority = skb->priority; - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); - key->phy.in_port = in_port; - key->phy.skb_mark = skb->mark; - skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -610,6 +603,34 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key) } } } - return 0; } + +int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key) +{ + /* Extract metadata from packet. */ + memset(key, 0, sizeof(*key)); + if (OVS_CB(skb)->tun_key) + memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); + + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + + return key_extract(skb, key); +} + +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key) +{ + int err; + + memset(key, 0, sizeof(*key)); + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(attr, key); + if (err) + return err; + + return key_extract(skb, key); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 5e5aaed3a85..251789b6ec4 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -187,6 +187,10 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *); +int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index d757848da89..630b320fbf3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -836,7 +836,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /** * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. + * @key: Receives extracted in_port, priority, tun_key and skb_mark. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. * @@ -846,32 +846,24 @@ int ovs_nla_get_match(struct sw_flow_match *match, * extracted from the packet itself. */ -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr) +int ovs_nla_get_flow_metadata(const struct nlattr *attr, + struct sw_flow_key *key) { - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct sw_flow_match match; u64 attrs = 0; int err; - struct sw_flow_match match; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); err = parse_flow_nlattrs(attr, a, &attrs); if (err) return -EINVAL; memset(&match, 0, sizeof(match)); - match.key = &flow->key; + match.key = key; - err = metadata_from_nlattrs(&match, &attrs, a, false); - if (err) - return err; + key->phy.in_port = DP_MAX_PORTS; - return 0; + return metadata_from_nlattrs(&match, &attrs, a, false); } int ovs_nla_put_flow(const struct sw_flow_key *swkey, diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 440151045d3..206e45add88 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -42,8 +42,8 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_flow(const struct sw_flow_key *, const struct sw_flow_key *, struct sk_buff *); -int ovs_nla_get_flow_metadata(struct sw_flow *flow, - const struct nlattr *attr); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); + int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *, const struct nlattr *); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f7e63f9df7b..acf31aa89e0 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -443,7 +443,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->tun_key = tun_key; - ovs_dp_process_received_packet(vport, skb); + OVS_CB(skb)->input_vport = vport; + ovs_dp_process_received_packet(skb); } /** -- cgit v1.2.3-70-g09d2 From 8c8b1b83fcdd0f05e1f66ed6f8a2e831d5d374a2 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 15 Sep 2014 19:28:44 -0700 Subject: openvswitch: Use tun_key only for egress tunnel path. Currently tun_key is used for passing tunnel information on ingress and egress path, this cause confusion. Following patch removes its use on ingress path make it egress only parameter. Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/actions.c | 3 +-- net/openvswitch/datapath.c | 20 ++++++-------------- net/openvswitch/datapath.h | 8 ++++---- net/openvswitch/flow.c | 7 ++++--- net/openvswitch/flow.h | 3 ++- net/openvswitch/vport-gre.c | 23 ++++++++++++----------- net/openvswitch/vport-vxlan.c | 21 +++++++++++---------- net/openvswitch/vport.c | 12 ++++++++++-- 8 files changed, 50 insertions(+), 47 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a1eb562cde..1cdb539d05b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -510,7 +510,7 @@ static int execute_set_action(struct sk_buff *skb, break; case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->tun_key = nla_data(nested_attr); + OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -613,7 +613,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); - OVS_CB(skb)->tun_key = NULL; return do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0cce8e60d5e..7e0819919b8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -237,33 +237,25 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct sk_buff *skb) +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct dp_stats_percpu *stats; - struct sw_flow_key key; u64 *stats_counter; u32 n_mask_hit; - int error; stats = this_cpu_ptr(dp->stats_percpu); - /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(skb, &key); - if (unlikely(error)) { - kfree_skb(skb); - return; - } - /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, &key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; + int error; upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.key = &key; + upcall.key = key; upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); error = ovs_dp_upcall(dp, skb, &upcall); @@ -277,8 +269,8 @@ void ovs_dp_process_received_packet(struct sk_buff *skb) OVS_CB(skb)->flow = flow; - ovs_flow_stats_update(OVS_CB(skb)->flow, key.tp.flags, skb); - ovs_execute_actions(dp, skb, &key); + ovs_flow_stats_update(OVS_CB(skb)->flow, key->tp.flags, skb); + ovs_execute_actions(dp, skb, key); stats_counter = &stats->n_hit; out: diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 2b982fae6a1..25b0e888cb2 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -95,15 +95,15 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB * @flow: The flow associated with this packet. May be %NULL if no flow. - * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the - * packet is not being tunneled. + * @egress_tun_key: Tunnel information about this packet on egress path. + * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. */ struct ovs_skb_cb { struct sw_flow *flow; - struct ovs_key_ipv4_tunnel *tun_key; struct vport *input_vport; + struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -184,7 +184,7 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct sk_buff *); +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index d186eb65a39..bf8442071d7 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -606,12 +606,13 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } -int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key) +int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, + struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ memset(key, 0, sizeof(*key)); - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); + if (tun_key) + memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 251789b6ec4..3869a540365 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -187,7 +187,8 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); -int ovs_flow_key_extract(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, + struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f49148a07da..309cca6e816 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -63,7 +63,7 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; + const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); @@ -129,6 +129,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); + struct ovs_key_ipv4_tunnel *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -136,16 +137,17 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_key)) { err = -EINVAL; goto error; } + tun_key = OVS_CB(skb)->egress_tun_key; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -153,7 +155,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) if (IS_ERR(rt)) return PTR_ERR(rt); - tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); + tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) @@ -185,15 +187,14 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_rt; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, false); + tun_key->ipv4_dst, IPPROTO_GRE, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); err_free_rt: ip_rt_put(rt); error: diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index d8b7e247beb..f19539bb8ad 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 Nicira, Inc. + * Copyright (c) 2014 Nicira, Inc. * Copyright (c) 2013 Cisco Systems, Inc. * * This program is free software; you can redistribute it and/or @@ -140,22 +140,24 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + struct ovs_key_ipv4_tunnel *tun_key; struct rtable *rt; struct flowi4 fl; __be16 src_port; __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_key)) { err = -EINVAL; goto error; } + tun_key = OVS_CB(skb)->egress_tun_key; /* Route lookup */ memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_UDP; @@ -165,7 +167,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; skb->ignore_df = 1; @@ -173,11 +175,10 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) src_port = udp_flow_src_port(net, skb, 0, 0, true); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, - fl.saddr, OVS_CB(skb)->tun_key->ipv4_dst, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, df, src_port, dst_port, - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8), + htonl(be64_to_cpu(tun_key->tun_id) << 8), false); if (err < 0) ip_rt_put(rt); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index acf31aa89e0..5df8377fcfb 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -435,6 +435,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, struct ovs_key_ipv4_tunnel *tun_key) { struct pcpu_sw_netstats *stats; + struct sw_flow_key key; + int error; stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); @@ -442,9 +444,15 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats->rx_bytes += skb->len; u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->tun_key = tun_key; OVS_CB(skb)->input_vport = vport; - ovs_dp_process_received_packet(skb); + OVS_CB(skb)->egress_tun_key = NULL; + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_key_extract(tun_key, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + ovs_dp_process_packet(skb, &key); } /** -- cgit v1.2.3-70-g09d2 From 971427f353f3c42c8dcef62e7124440df68eb809 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 15 Sep 2014 19:37:25 -0700 Subject: openvswitch: Add recirc and hash action. Recirc action allows a packet to reenter openvswitch processing. currently openvswitch lookup flow for packet received and execute set of actions on that packet, with help of recirc action we can process/modify the packet and recirculate it back in openvswitch for another pass. OVS hash action calculates 5-tupple hash and set hash in flow-key hash. This can be used along with recirculation for distributing packets among different ports for bond devices. For example: OVS bonding can use following actions: Match on: bond flow; Action: hash, recirc(id) Match on: recirc-id == id and hash lower bits == a; Action: output port_bond_a Signed-off-by: Andy Zhou Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 26 +++++ net/openvswitch/actions.c | 203 +++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 11 ++- net/openvswitch/datapath.h | 7 +- net/openvswitch/flow.c | 7 +- net/openvswitch/flow.h | 3 + net/openvswitch/flow_netlink.c | 43 ++++++++- 7 files changed, 288 insertions(+), 12 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a794d1dd7b4..f7fc507d82a 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -289,6 +289,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ + OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash + is not computed by the datapath. */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -493,6 +496,27 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Data path hash algorithm for computing Datapath hash. + * + * The algorithm type only specifies the fields in a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_hash_alg { + OVS_HASH_ALG_L4, +}; + +/* + * struct ovs_action_hash - %OVS_ACTION_ATTR_HASH action argument. + * @hash_alg: Algorithm used to compute hash prior to recirculation. + * @hash_basis: basis used for computing hash. + */ +struct ovs_action_hash { + uint32_t hash_alg; /* One of ovs_hash_alg. */ + uint32_t hash_basis; +}; + /** * enum ovs_action_attr - Action types. * @@ -521,6 +545,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ + OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ __OVS_ACTION_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c31bb80c984..6932a42e41a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -35,12 +35,78 @@ #include #include "datapath.h" +#include "flow.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len); +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + static int make_writable(struct sk_buff *skb, int write_len) { if (!pskb_may_pull(skb, write_len)) @@ -485,8 +551,29 @@ static int sample(struct datapath *dp, struct sk_buff *skb, /* Skip the sample action when out of memory. */ return 0; - /* do_execute_actions() will consume the cloned skb. */ - return do_execute_actions(dp, skb, key, a, rem); + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, @@ -535,6 +622,44 @@ static int execute_set_action(struct sk_buff *skb, return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + + if (!last_action(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -566,6 +691,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a); break; + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -576,6 +705,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb); break; + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (last_action(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, nla_data(a)); break; @@ -601,12 +741,63 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_actions *acts; + int err; + + acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + this_cpu_inc(exec_actions_level); + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; - return do_execute_actions(dp, skb, key, - acts->actions, acts->actions_len); + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e0819919b8..16cad14fa81 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -156,7 +156,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -static const char *ovs_dp_name(const struct datapath *dp) +const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -2065,10 +2065,14 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_internal_dev_rtnl_link_register(); + err = action_fifos_init(); if (err) goto error; + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; @@ -2101,6 +2105,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); error: return err; } @@ -2114,6 +2120,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 25b0e888cb2..ac3f3df9696 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -189,13 +189,18 @@ void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); +const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *); + void ovs_dp_notify_wq(struct work_struct *work); +int action_fifos_init(void); +void action_fifos_exit(void); + #define OVS_NLERR(fmt, ...) \ do { \ if (net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bf8442071d7..4010423f283 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -606,6 +606,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 3869a540365..0f5db4ec565 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -72,6 +72,8 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -187,6 +189,7 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 630b320fbf3..f4c8daa7396 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -251,6 +251,8 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), + [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, }; @@ -454,6 +456,20 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { SW_FLOW_KEY_PUT(match, phy.priority, nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); @@ -873,6 +889,12 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, struct nlattr *nla, *encap; bool is_mask = (swkey != output); + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; @@ -1401,11 +1423,13 @@ int ovs_nla_copy_actions(const struct nlattr *attr, /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1432,6 +1456,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -1444,6 +1480,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_RECIRC: + break; + case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, &skip_copy); if (err) -- cgit v1.2.3-70-g09d2 From 0714812134d7dcadeb7ecfbfeb18788aa7e1eaac Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:29 -0700 Subject: openvswitch: Eliminate memset() from flow_extract. As new protocols are added, the size of the flow key tends to increase although few protocols care about all of the fields. In order to optimize this for hashing and matching, OVS uses a variable length portion of the key. However, when fields are extracted from the packet we must still zero out the entire key. This is no longer necessary now that OVS implements masking. Any fields (or holes in the structure) which are not part of a given protocol will be by definition not part of the mask and zeroed out during lookup. Furthermore, since masking already uses variable length keys this zeroing operation automatically benefits as well. In principle, the only thing that needs to be done at this point is remove the memset() at the beginning of flow. However, some fields assume that they are initialized to zero, which now must be done explicitly. In addition, in the event of an error we must also zero out corresponding fields to signal that there is no valid data present. These increase the total amount of code but very little of it is executed in non-error situations. Removing the memset() reduces the profile of ovs_flow_extract() from 0.64% to 0.56% when tested with large packets on a 10G link. Suggested-by: Pravin Shelar Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 54 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 4010423f283..913bdc1a83c 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -462,6 +462,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. */ + key->eth.tci = 0; if (vlan_tx_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) @@ -482,6 +483,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) error = check_iphdr(skb); if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -503,8 +506,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { @@ -513,18 +518,25 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } + } else if (key->ip.proto == IPPROTO_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_ICMP) { if (icmphdr_ok(skb)) { @@ -534,33 +546,44 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * them in 16-bit network byte order. */ key->tp.src = htons(icmp->type); key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } - } else if ((key->eth.type == htons(ETH_P_ARP) || - key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; arp = (struct arp_eth_header *)skb_network_header(skb); - if (arp->ar_hrd == htons(ARPHRD_ETHER) - && arp->ar_pro == htons(ETH_P_IP) - && arp->ar_hln == ETH_ALEN - && arp->ar_pln == 4) { + if (arphdr_ok(skb) && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { /* We only match on the lower 8 bits of the opcode. */ if (ntohs(arp->ar_op) <= 0xff) key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); if (nh_len == -EINVAL) { skb->transport_header = skb->network_header; error = 0; @@ -582,24 +605,32 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->tp.src = tcp->source; key->tp.dst = tcp->dest; key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_UDP) { if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); key->tp.src = udp->source; key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_SCTP) { if (sctphdr_ok(skb)) { struct sctphdr *sctp = sctp_hdr(skb); key->tp.src = sctp->source; key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_ICMP) { if (icmp6hdr_ok(skb)) { error = parse_icmpv6(skb, key, nh_len); if (error) return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } } @@ -615,13 +646,19 @@ int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - memset(key, 0, sizeof(*key)); if (tun_key) memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + else + memset(&key->tun_key, 0, sizeof(key->tun_key)); key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + key->ovs_flow_hash = 0; + key->recirc_id = 0; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; return key_extract(skb, key); } @@ -632,7 +669,6 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, { int err; - memset(key, 0, sizeof(*key)); /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(attr, key); if (err) -- cgit v1.2.3-70-g09d2 From f0b128c1e2cc33ad104daf0f51a51e34f7763c5f Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:31 -0700 Subject: openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure. Currently, the flow information that is matched for tunnels and the tunnel data passed around with packets is the same. However, as additional information is added this is not necessarily desirable, as in the case of pointers. This adds a new structure for tunnel metadata which currently contains only the existing struct. This change is purely internal to the kernel since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version of OVS_KEY_ATTR_TUNNEL that is translated at flow setup. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 5 +++-- net/openvswitch/datapath.h | 2 +- net/openvswitch/flow.c | 6 +++--- net/openvswitch/flow.h | 30 +++++++++++++++++------------- net/openvswitch/flow_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/openvswitch/vport-gre.c | 16 +++++++++------- net/openvswitch/vport-vxlan.c | 10 +++++----- net/openvswitch/vport.c | 6 +++--- net/openvswitch/vport.h | 2 +- 10 files changed, 74 insertions(+), 43 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7c06106f5af..6753032832e 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -294,7 +294,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ - OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 6932a42e41a..006886dbee3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ac3f3df9696..974135439c5 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -102,8 +102,8 @@ struct datapath { */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; - struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 913bdc1a83c..2924cb34086 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_key) - memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + if (tun_info) + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); else memset(&key->tun_key, 0, sizeof(key->tun_key)); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 0f5db4ec565..fe5a71b81c1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; } __packed __aligned(4); /* Minimize padding. */ -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; +}; + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be64 tun_id, __be16 tun_flags) { - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = iph->saddr; + tun_info->tunnel.ipv4_dst = iph->daddr; + tun_info->tunnel.ipv4_tos = iph->tos; + tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.tun_flags = tun_flags; /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); } struct sw_flow_key { @@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, - struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, + struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 22c855fa0bc..5d6194d9dad 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1148,13 +1148,14 @@ out: return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); } -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len) { struct nlattr *a; a = reserve_sfa_size(sfa, nla_attr_size(len)); if (IS_ERR(a)) - return PTR_ERR(a); + return a; a->nla_type = attrtype; a->nla_len = nla_attr_size(len); @@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in memcpy(nla_data(a), data, len); memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len); + if (IS_ERR(a)) + return PTR_ERR(a); + return 0; } @@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; int err, start; ovs_match_init(&match, &key, NULL); @@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info)); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + add_nested_action_end(*sfa, start); return err; @@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) int err; switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); break; + } default: if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) return -EMSGSIZE; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 309cca6e816..fe768bd600e 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; } @@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index f19539bb8ad..5fbff2c1ee4 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) /* Called with rcu_read_lock and BH disabled. */ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct vport *vport = vs->data; struct iphdr *iph; __be64 key; @@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); } static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) @@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5df8377fcfb..3e50ee8a218 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) + struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_key = NULL; + OVS_CB(skb)->egress_tun_info = NULL; /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(tun_key, skb, &key); + error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 0efd62fef1e..e28964aba02 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); + struct ovs_tunnel_info *); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ -- cgit v1.2.3-70-g09d2 From f5796684069e0c71c65bce6a6d4766114aec1396 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:33 -0700 Subject: openvswitch: Add support for Geneve tunneling. The Openvswitch implementation is completely agnostic to the options that are in use and can handle newly defined options without further work. It does this by simply matching on a byte array of options and allowing userspace to setup flows on this array. Signed-off-by: Jesse Gross Singed-off-by: Ansis Atteka Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 21 ++-- include/uapi/linux/openvswitch.h | 2 + net/openvswitch/Kconfig | 11 ++ net/openvswitch/Makefile | 4 + net/openvswitch/datapath.c | 5 +- net/openvswitch/flow.c | 20 +++- net/openvswitch/flow.h | 20 +++- net/openvswitch/flow_netlink.c | 176 ++++++++++++++++++++++++----- net/openvswitch/vport-geneve.c | 236 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 13 files changed, 461 insertions(+), 42 deletions(-) create mode 100644 net/openvswitch/vport-geneve.c (limited to 'net/openvswitch/flow.c') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a9ce1556d77..5bc6edeb714 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -86,17 +86,18 @@ struct ip_tunnel { struct gro_cells gro_cells; }; -#define TUNNEL_CSUM __cpu_to_be16(0x01) -#define TUNNEL_ROUTING __cpu_to_be16(0x02) -#define TUNNEL_KEY __cpu_to_be16(0x04) -#define TUNNEL_SEQ __cpu_to_be16(0x08) -#define TUNNEL_STRICT __cpu_to_be16(0x10) -#define TUNNEL_REC __cpu_to_be16(0x20) -#define TUNNEL_VERSION __cpu_to_be16(0x40) -#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) -#define TUNNEL_OAM __cpu_to_be16(0x0200) -#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800) struct tnl_ptk_info { __be16 flags; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6753032832e..435eabc5ffa 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -192,6 +192,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ + OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ecf491ad50..ba3bb8203b9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_GENEVE + bool "Open vSwitch Geneve tunneling support" + depends on INET + depends on OPENVSWITCH + depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 3591cb5dae9..9a33a273c37 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) +openvswitch-y += vport-geneve.o +endif + ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 010125c4824..2e31d9e7f4d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -370,6 +370,7 @@ static size_t key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); - rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->egress_tun_info = NULL; OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2924cb34086..62db02ba36b 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) int error; struct ethhdr *eth; + /* Flags are always used as part of stats */ + key->tp.flags = 0; + skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_info) + if (tun_info) { memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); - else + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(GENEVE_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); + } key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe5a71b81c1..71813318c8c 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel { struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; + struct geneve_opt *options; + u8 options_len; }; +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define GENEVE_OPTS(flow_key, opt_len) \ + ((struct geneve_opt *)((flow_key)->tun_opts + \ + FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ + opt_len)) + static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; tun_info->tunnel.ipv4_src = iph->saddr; @@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, /* clear struct padding. */ memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; } struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5d6194d9dad..368f2330791 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match, } \ } while (0) -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range__(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, [OVS_TUNNEL_KEY_ATTR_OAM] = 0, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a)) { + if (ovs_tunnel_key_lens[type] != nla_len(a) && + ovs_tunnel_key_lens[type] != -1) { OVS_NLERR("IPv4 tunnel attribute type has unexpected " " length (type=%d, length=%d, expected=%d).\n", type, nla_len(a), ovs_tunnel_key_lens[type]); @@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + tun_flags |= TUNNEL_OPTIONS_PRESENT; + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", + nla_len(a), + sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", + match->key->tun_opts_len, + nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, + true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS( + (struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, + nla_data(a), nla_len(a), + is_mask); + break; default: + OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + type); return -EINVAL; } } @@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; @@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; + if (tun_opts && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; - nla_nest_end(skb, nla); return 0; } +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { @@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask) && - ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const struct geneve_opt *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = GENEVE_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } if (swkey->phy.in_port == DP_MAX_PORTS) { if (is_mask && (output->phy.in_port == 0xffff)) @@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (err) return err; + if (key.tun_opts_len) { + struct geneve_opt *option = GENEVE_OPTS(&key, + key.tun_opts_len); + int opts_len = key.tun_opts_len; + bool crit_opt = false; + + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + }; + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info)); + sizeof(*tun_info) + key.tun_opts_len); if (IS_ERR(a)) return PTR_ERR(a); tun_info = nla_data(a); tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + tun_info->options = (struct geneve_opt *)(tun_info + 1); + } else { + tun_info->options = NULL; + } add_nested_action_end(*sfa, start); @@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) return -EMSGSIZE; err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - nla_data(ovs_key)); + tun_info->options_len ? + tun_info->options : NULL, + tun_info->options_len); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 00000000000..5572d482f28 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct geneve_port - Keeps track of open UDP ports + * @sock: The socket created for this port number. + * @name: vport name. + */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(__u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 sport; + + sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport); + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport)) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3]; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, + tun_info->options_len, (u8 *)tun_info->options, + false); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +const struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, +}; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index fe768bd600e..108b82da2fd 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb, key = key_to_tunnel_id(tpi->key, tpi->seq); ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5fbff2c1ee4..2735e01dca7 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3e50ee8a218..53001b020ca 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_VXLAN &ovs_vxlan_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_GENEVE + &ovs_geneve_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e28964aba02..8942125de3a 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; extern const struct vport_ops ovs_vxlan_vport_ops; +extern const struct vport_ops ovs_geneve_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3-70-g09d2 From 389f48947a5a37ea283de520abb742d42174edb0 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 17 Oct 2014 14:03:08 +0800 Subject: openvswitch: fix a use after free pskb_may_pull() called by arphdr_ok can change skb->data, so put the arp setting after arphdr_ok to avoid the use the freed memory Fixes: 0714812134d7d ("openvswitch: Eliminate memset() from flow_extract.") Cc: Jesse Gross Cc: Eric Dumazet Signed-off-by: Li RongQing Acked-by: Jesse Gross Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 62db02ba36b..c5cfc72a553 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -557,10 +557,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) } else if (key->eth.type == htons(ETH_P_ARP) || key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; + bool arp_available = arphdr_ok(skb); arp = (struct arp_eth_header *)skb_network_header(skb); - if (arphdr_ok(skb) && + if (arp_available && arp->ar_hrd == htons(ARPHRD_ETHER) && arp->ar_pro == htons(ETH_P_IP) && arp->ar_hln == ETH_ALEN && -- cgit v1.2.3-70-g09d2 From 25ef1328a03c72a7285883d5b337c4b602476ecd Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 17 Oct 2014 13:56:31 -0700 Subject: openvswitch: Set flow-key members. This patch adds missing memset which are required to initialize flow key member. For example for IP flow we need to initialize ip.frag for all cases. Found by inspection. This bug is introduced by commit 0714812134d7dcadeb7ecfbfeb18788aa7e1eaac ("openvswitch: Eliminate memset() from flow_extract"). Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/openvswitch/flow.c') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index c5cfc72a553..2b78789ea7c 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -274,6 +274,8 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; else key->ip.frag = OVS_FRAG_TYPE_FIRST; + } else { + key->ip.frag = OVS_FRAG_TYPE_NONE; } nh_len = payload_ofs - nh_ofs; @@ -358,6 +360,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, */ key->tp.src = htons(icmp->icmp6_type); key->tp.dst = htons(icmp->icmp6_code); + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -674,9 +677,6 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, key->ovs_flow_hash = 0; key->recirc_id = 0; - /* Flags are always used as part of stats */ - key->tp.flags = 0; - return key_extract(skb, key); } -- cgit v1.2.3-70-g09d2