From 23289a37e2b127dfc4de1313fba15bb4c9f0cd5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Oct 2009 07:06:36 +0000 Subject: net: add a list_head parameter to dellink() method Adding a list_head parameter to rtnl_link_ops->dellink() methods allow us to queue devices on a list, in order to dismantle them all at once. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 3aabfd9dd21..20b7707f38e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -555,13 +555,13 @@ static int macvlan_newlink(struct net_device *dev, return 0; } -static void macvlan_dellink(struct net_device *dev) +static void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; list_del(&vlan->list); - unregister_netdevice(dev); + unregister_netdevice_queue(dev, head); if (list_empty(&port->vlans)) macvlan_port_destroy(port->dev); @@ -601,7 +601,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_UNREGISTER: list_for_each_entry_safe(vlan, next, &port->vlans, list) - macvlan_dellink(vlan->dev); + macvlan_dellink(vlan->dev, NULL); break; } return NOTIFY_DONE; -- cgit v1.2.3-70-g09d2 From 81adee47dfb608df3ad0b91d230fb3cef75f0060 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 8 Nov 2009 00:53:51 -0800 Subject: net: Support specifying the network namespace upon device creation. There is no good reason to not support userspace specifying the network namespace during device creation, and it makes it easier to create a network device and pass it to a child network namespace with a well known name. We have to be careful to ensure that the target network namespace for the new device exists through the life of the call. To keep that logic clear I have factored out the network namespace grabbing logic into rtnl_link_get_net. In addtion we need to continue to pass the source network namespace to the rtnl_link_ops.newlink method so that we can find the base device source network namespace. Signed-off-by: Eric W. Biederman Acked-by: Eric Dumazet --- drivers/net/can/dev.c | 2 +- drivers/net/macvlan.c | 4 ++-- drivers/net/veth.c | 15 ++++++++++++--- include/net/rtnetlink.h | 8 +++++--- net/8021q/vlan_netlink.c | 4 ++-- net/core/rtnetlink.c | 38 +++++++++++++++++++++++++++----------- net/ipv4/ip_gre.c | 2 +- 7 files changed, 50 insertions(+), 23 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index c3db111d2ff..5fe34d64ca2 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -674,7 +674,7 @@ nla_put_failure: return -EMSGSIZE; } -static int can_newlink(struct net_device *dev, +static int can_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { return -EOPNOTSUPP; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 20b7707f38e..d7dba3f6f76 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -504,7 +504,7 @@ static int macvlan_get_tx_queues(struct net *net, return 0; } -static int macvlan_newlink(struct net_device *dev, +static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct macvlan_dev *vlan = netdev_priv(dev); @@ -515,7 +515,7 @@ static int macvlan_newlink(struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - lowerdev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); + lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 9bed694cd21..2d657f2314c 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -340,7 +340,7 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[]) static struct rtnl_link_ops veth_link_ops; -static int veth_newlink(struct net_device *dev, +static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { int err; @@ -348,6 +348,7 @@ static int veth_newlink(struct net_device *dev, struct veth_priv *priv; char ifname[IFNAMSIZ]; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; + struct net *net; /* * create and register peer first @@ -380,14 +381,22 @@ static int veth_newlink(struct net_device *dev, else snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); - peer = rtnl_create_link(dev_net(dev), ifname, &veth_link_ops, tbp); - if (IS_ERR(peer)) + net = rtnl_link_get_net(src_net, tbp); + if (IS_ERR(net)) + return PTR_ERR(net); + + peer = rtnl_create_link(src_net, net, ifname, &veth_link_ops, tbp); + if (IS_ERR(peer)) { + put_net(net); return PTR_ERR(peer); + } if (tbp[IFLA_ADDRESS] == NULL) random_ether_addr(peer->dev_addr); err = register_netdevice(peer); + put_net(net); + net = NULL; if (err < 0) goto err_register_peer; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index cd5af1f508f..48d3efcb088 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -55,7 +55,8 @@ struct rtnl_link_ops { int (*validate)(struct nlattr *tb[], struct nlattr *data[]); - int (*newlink)(struct net_device *dev, + int (*newlink)(struct net *src_net, + struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]); int (*changelink)(struct net_device *dev, @@ -83,8 +84,9 @@ extern void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops); extern int rtnl_link_register(struct rtnl_link_ops *ops); extern void rtnl_link_unregister(struct rtnl_link_ops *ops); -extern struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]); +extern struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); +extern struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]); extern const struct nla_policy ifla_policy[IFLA_MAX+1]; #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index a9150485019..3c9cf6a8e7f 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -119,7 +119,7 @@ static int vlan_get_tx_queues(struct net *net, return 0; } -static int vlan_newlink(struct net_device *dev, +static int vlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct vlan_dev_info *vlan = vlan_dev_info(dev); @@ -131,7 +131,7 @@ static int vlan_newlink(struct net_device *dev, if (!tb[IFLA_LINK]) return -EINVAL; - real_dev = __dev_get_by_index(dev_net(dev), nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e2f3317f290..33148a56819 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -733,6 +733,20 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_DATA] = { .type = NLA_NESTED }, }; +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -756,8 +770,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, int err; if (tb[IFLA_NET_NS_PID]) { - struct net *net; - net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + struct net *net = rtnl_link_get_net(dev_net(dev), tb); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; @@ -976,8 +989,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } -struct net_device *rtnl_create_link(struct net *net, char *ifname, - const struct rtnl_link_ops *ops, struct nlattr *tb[]) +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; struct net_device *dev; @@ -985,7 +998,7 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, unsigned int real_num_queues = 1; if (ops->get_tx_queues) { - err = ops->get_tx_queues(net, tb, &num_queues, + err = ops->get_tx_queues(src_net, tb, &num_queues, &real_num_queues); if (err) goto err; @@ -995,16 +1008,16 @@ struct net_device *rtnl_create_link(struct net *net, char *ifname, if (!dev) goto err; + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; dev->real_num_tx_queues = real_num_queues; + if (strchr(dev->name, '%')) { err = dev_alloc_name(dev, dev->name); if (err < 0) goto err_free; } - dev_net_set(dev, net); - dev->rtnl_link_ops = ops; - if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); if (tb[IFLA_ADDRESS]) @@ -1083,6 +1096,7 @@ replay: if (1) { struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; if (ops) { if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { @@ -1147,17 +1161,19 @@ replay: if (!ifname[0]) snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - dev = rtnl_create_link(net, ifname, ops, tb); + dest_net = rtnl_link_get_net(net, tb); + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); if (IS_ERR(dev)) err = PTR_ERR(dev); else if (ops->newlink) - err = ops->newlink(dev, tb, data); + err = ops->newlink(net, dev, tb, data); else err = register_netdevice(dev); - if (err < 0 && !IS_ERR(dev)) free_netdev(dev); + + put_net(dest_net); return err; } } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 71a3242fb7d..a7de9e3a8f1 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1483,7 +1483,7 @@ static void ipgre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], +static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel *nt; -- cgit v1.2.3-70-g09d2 From cbbef5e183079455763fc470ccf69008f92ab4b6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 10 Nov 2009 06:14:24 +0000 Subject: vlan/macvlan: propagate transmission state to upper layers Both vlan and macvlan devices usually don't use a qdisc and immediately queue packets to the underlying device. Propagate transmission state of the underlying device to the upper layers so they can react on congestion and/or inform the sending process. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 2 +- net/8021q/vlan_dev.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d7dba3f6f76..271aa7e1d03 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -202,7 +202,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, } else txq->tx_dropped++; - return NETDEV_TX_OK; + return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 790fd55ec31..91596594213 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -332,7 +332,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, } else txq->tx_dropped++; - return NETDEV_TX_OK; + return ret; } static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, @@ -358,7 +358,7 @@ static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, } else txq->tx_dropped++; - return NETDEV_TX_OK; + return ret; } static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) -- cgit v1.2.3-70-g09d2 From fccaf71011b171883efee5bae321eac4760584d1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Nov 2009 08:53:49 +0000 Subject: macvlan: Precise RX stats accounting With multi queue devices, its possible that several cpus call macvlan RX routines simultaneously for the same macvlan device. We update RX stats counter without any locking, so we can get slightly wrong counters. One possible fix is to use percpu counters, to get precise accounting and also get guarantee of no cache line ping pongs between cpus. Note: this adds 16 bytes (32 bytes on 64bit arches) of percpu data per macvlan device. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 76 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 10 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 271aa7e1d03..ae2b5c79c55 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -38,12 +38,27 @@ struct macvlan_port { struct list_head vlans; }; +/** + * struct macvlan_rx_stats - MACVLAN percpu rx stats + * @rx_packets: number of received packets + * @rx_bytes: number of received bytes + * @multicast: number of received multicast packets + * @rx_errors: number of errors + */ +struct macvlan_rx_stats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long multicast; + unsigned long rx_errors; +}; + struct macvlan_dev { struct net_device *dev; struct list_head list; struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; + struct macvlan_rx_stats *rx_stats; }; @@ -110,6 +125,7 @@ static void macvlan_broadcast(struct sk_buff *skb, struct net_device *dev; struct sk_buff *nskb; unsigned int i; + struct macvlan_rx_stats *rx_stats; if (skb->protocol == htons(ETH_P_PAUSE)) return; @@ -117,17 +133,17 @@ static void macvlan_broadcast(struct sk_buff *skb, for (i = 0; i < MACVLAN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[i], hlist) { dev = vlan->dev; + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + rx_stats->rx_errors++; continue; } - dev->stats.rx_bytes += skb->len + ETH_HLEN; - dev->stats.rx_packets++; - dev->stats.multicast++; + rx_stats->rx_bytes += skb->len + ETH_HLEN; + rx_stats->rx_packets++; + rx_stats->multicast++; nskb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast)) @@ -147,6 +163,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) const struct macvlan_port *port; const struct macvlan_dev *vlan; struct net_device *dev; + struct macvlan_rx_stats *rx_stats; port = rcu_dereference(skb->dev->macvlan_port); if (port == NULL) @@ -166,16 +183,15 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) kfree_skb(skb); return NULL; } - + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + rx_stats->rx_errors++; return NULL; } - dev->stats.rx_bytes += skb->len + ETH_HLEN; - dev->stats.rx_packets++; + rx_stats->rx_bytes += skb->len + ETH_HLEN; + rx_stats->rx_packets++; skb->dev = dev; skb->pkt_type = PACKET_HOST; @@ -365,9 +381,47 @@ static int macvlan_init(struct net_device *dev) macvlan_set_lockdep_class(dev); + vlan->rx_stats = alloc_percpu(struct macvlan_rx_stats); + if (!vlan->rx_stats) + return -ENOMEM; + return 0; } +static void macvlan_uninit(struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + free_percpu(vlan->rx_stats); +} + +static struct net_device_stats *macvlan_dev_get_stats(struct net_device *dev) +{ + struct net_device_stats *stats = &dev->stats; + struct macvlan_dev *vlan = netdev_priv(dev); + + dev_txq_stats_fold(dev, stats); + + if (vlan->rx_stats) { + struct macvlan_rx_stats *p, rx = {0}; + int i; + + for_each_possible_cpu(i) { + p = per_cpu_ptr(vlan->rx_stats, i); + rx.rx_packets += p->rx_packets; + rx.rx_bytes += p->rx_bytes; + rx.rx_errors += p->rx_errors; + rx.multicast += p->multicast; + } + stats->rx_packets = rx.rx_packets; + stats->rx_bytes = rx.rx_bytes; + stats->rx_errors = rx.rx_errors; + stats->rx_dropped = rx.rx_errors; + stats->multicast = rx.multicast; + } + return stats; +} + static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { @@ -404,6 +458,7 @@ static const struct ethtool_ops macvlan_ethtool_ops = { static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, + .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, @@ -411,6 +466,7 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_multicast_list = macvlan_set_multicast_list, + .ndo_get_stats = macvlan_dev_get_stats, .ndo_validate_addr = eth_validate_addr, }; -- cgit v1.2.3-70-g09d2 From 8c2acc53fd7987493f11640e266cf7130591e764 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 23 Nov 2009 14:18:53 -0800 Subject: macvlan: fix gso_max_size setting gso_max_size must be set based on the value of the underlying device to support devices not using the full 64k. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 3aabfd9dd21..2490aa39804 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -360,6 +360,7 @@ static int macvlan_init(struct net_device *dev) dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; + dev->gso_max_size = lowerdev->gso_max_size; dev->iflink = lowerdev->ifindex; dev->hard_header_len = lowerdev->hard_header_len; @@ -596,6 +597,7 @@ static int macvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { vlan->dev->features = dev->features & MACVLAN_FEATURES; + vlan->dev->gso_max_size = dev->gso_max_size; netdev_features_change(vlan->dev); } break; -- cgit v1.2.3-70-g09d2 From a1e514c5d0397b5581721aad9b303f7df83b103d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Nov 2009 06:07:09 +0000 Subject: macvlan: cleanup rx statistics We have very similar code for rx statistics in two places in the macvlan driver, with a third one being added in the next patch. Consolidate them into one function to improve overall readability of the driver. Signed-off-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 70 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 29 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index ae2b5c79c55..1e7faf9e87b 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -116,42 +116,58 @@ static int macvlan_addr_busy(const struct macvlan_port *port, return 0; } +static inline void macvlan_count_rx(const struct macvlan_dev *vlan, + unsigned int len, bool success, + bool multicast) +{ + struct macvlan_rx_stats *rx_stats; + + rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + if (likely(success)) { + rx_stats->rx_packets++;; + rx_stats->rx_bytes += len; + if (multicast) + rx_stats->multicast++; + } else { + rx_stats->rx_errors++; + } +} + +static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, + const struct ethhdr *eth) +{ + if (!skb) + return NET_RX_DROP; + + skb->dev = dev; + if (!compare_ether_addr_64bits(eth->h_dest, + dev->broadcast)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + + return netif_rx(skb); +} + static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct hlist_node *n; - struct net_device *dev; struct sk_buff *nskb; unsigned int i; - struct macvlan_rx_stats *rx_stats; + int err; if (skb->protocol == htons(ETH_P_PAUSE)) return; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[i], hlist) { - dev = vlan->dev; - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); - nskb = skb_clone(skb, GFP_ATOMIC); - if (nskb == NULL) { - rx_stats->rx_errors++; - continue; - } - - rx_stats->rx_bytes += skb->len + ETH_HLEN; - rx_stats->rx_packets++; - rx_stats->multicast++; - - nskb->dev = dev; - if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast)) - nskb->pkt_type = PACKET_BROADCAST; - else - nskb->pkt_type = PACKET_MULTICAST; - - netif_rx(nskb); + err = macvlan_broadcast_one(nskb, vlan->dev, eth); + macvlan_count_rx(vlan, skb->len + ETH_HLEN, + err == NET_RX_SUCCESS, 1); } } } @@ -163,7 +179,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) const struct macvlan_port *port; const struct macvlan_dev *vlan; struct net_device *dev; - struct macvlan_rx_stats *rx_stats; + unsigned int len; port = rcu_dereference(skb->dev->macvlan_port); if (port == NULL) @@ -183,15 +199,11 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) kfree_skb(skb); return NULL; } - rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id()); + len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); - if (skb == NULL) { - rx_stats->rx_errors++; + macvlan_count_rx(vlan, len, skb != NULL, 0); + if (!skb) return NULL; - } - - rx_stats->rx_bytes += skb->len + ETH_HLEN; - rx_stats->rx_packets++; skb->dev = dev; skb->pkt_type = PACKET_HOST; -- cgit v1.2.3-70-g09d2 From 618e1b7482f7a8a4c6c6e8ccbe140e4c331df4e9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Nov 2009 06:07:10 +0000 Subject: macvlan: implement bridge, VEPA and private mode This allows each macvlan slave device to be in one of three modes, depending on the use case: MACVLAN_PRIVATE: The device never communicates with any other device on the same upper_dev. This even includes frames coming back from a reflective relay, where supported by the adjacent bridge. MACVLAN_VEPA: The new Virtual Ethernet Port Aggregator (VEPA) mode, we assume that the adjacent bridge returns all frames where both source and destination are local to the macvlan port, i.e. the bridge is set up as a reflective relay. Broadcast frames coming in from the upper_dev get flooded to all macvlan interfaces in VEPA mode. We never deliver any frames locally. MACVLAN_BRIDGE: We provide the behavior of a simple bridge between different macvlan interfaces on the same port. Frames from one interface to another one get delivered directly and are not sent out externally. Broadcast frames get flooded to all other bridge ports and to the external interface, but when they come back from a reflective relay, we don't deliver them again. Since we know all the MAC addresses, the macvlan bridge mode does not require learning or STP like the bridge module does. Based on an earlier patch "macvlan: Reflect macvlan packets meant for other macvlan devices" by Eric Biederman. Signed-off-by: Arnd Bergmann Acked-by: Patrick McHardy Cc: Eric Biederman Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 80 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 8 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 1e7faf9e87b..d6bd84353f4 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -29,9 +29,16 @@ #include #include #include +#include #define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, + MACVLAN_MODE_VEPA = 2, + MACVLAN_MODE_BRIDGE = 4, +}; + struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; @@ -59,6 +66,7 @@ struct macvlan_dev { struct macvlan_port *port; struct net_device *lowerdev; struct macvlan_rx_stats *rx_stats; + enum macvlan_mode mode; }; @@ -134,11 +142,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan, } static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, - const struct ethhdr *eth) + const struct ethhdr *eth, bool local) { if (!skb) return NET_RX_DROP; + if (local) + return dev_forward_skb(dev, skb); + skb->dev = dev; if (!compare_ether_addr_64bits(eth->h_dest, dev->broadcast)) @@ -150,7 +161,9 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev, } static void macvlan_broadcast(struct sk_buff *skb, - const struct macvlan_port *port) + const struct macvlan_port *port, + struct net_device *src, + enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; @@ -164,8 +177,12 @@ static void macvlan_broadcast(struct sk_buff *skb, for (i = 0; i < MACVLAN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(vlan, n, &port->vlan_hash[i], hlist) { + if (vlan->dev == src || !(vlan->mode & mode)) + continue; + nskb = skb_clone(skb, GFP_ATOMIC); - err = macvlan_broadcast_one(nskb, vlan->dev, eth); + err = macvlan_broadcast_one(nskb, vlan->dev, eth, + mode == MACVLAN_MODE_BRIDGE); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, 1); } @@ -178,6 +195,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_port *port; const struct macvlan_dev *vlan; + const struct macvlan_dev *src; struct net_device *dev; unsigned int len; @@ -186,7 +204,25 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) return skb; if (is_multicast_ether_addr(eth->h_dest)) { - macvlan_broadcast(skb, port); + src = macvlan_hash_lookup(port, eth->h_source); + if (!src) + /* frame comes from an external address */ + macvlan_broadcast(skb, port, NULL, + MACVLAN_MODE_PRIVATE | + MACVLAN_MODE_VEPA | + MACVLAN_MODE_BRIDGE); + else if (src->mode == MACVLAN_MODE_VEPA) + /* flood to everyone except source */ + macvlan_broadcast(skb, port, src->dev, + MACVLAN_MODE_VEPA | + MACVLAN_MODE_BRIDGE); + else if (src->mode == MACVLAN_MODE_BRIDGE) + /* + * flood only to VEPA ports, bridge ports + * already saw the frame on the way out. + */ + macvlan_broadcast(skb, port, src->dev, + MACVLAN_MODE_VEPA); return skb; } @@ -212,18 +248,46 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb) return NULL; } +static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct macvlan_dev *vlan = netdev_priv(dev); + const struct macvlan_port *port = vlan->port; + const struct macvlan_dev *dest; + + if (vlan->mode == MACVLAN_MODE_BRIDGE) { + const struct ethhdr *eth = (void *)skb->data; + + /* send to other bridge ports directly */ + if (is_multicast_ether_addr(eth->h_dest)) { + macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); + goto xmit_world; + } + + dest = macvlan_hash_lookup(port, eth->h_dest); + if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { + unsigned int length = skb->len + ETH_HLEN; + int ret = dev_forward_skb(dest->dev, skb); + macvlan_count_rx(dest, length, + ret == NET_RX_SUCCESS, 0); + + return NET_XMIT_SUCCESS; + } + } + +xmit_world: + skb->dev = vlan->lowerdev; + return dev_queue_xmit(skb); +} + static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { int i = skb_get_queue_mapping(skb); struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - const struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; - skb->dev = vlan->lowerdev; - ret = dev_queue_xmit(skb); - + ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS)) { txq->tx_packets++; txq->tx_bytes += len; -- cgit v1.2.3-70-g09d2 From 27c0b1a850cdea6298f573d835782f3337be913c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 26 Nov 2009 06:07:11 +0000 Subject: macvlan: export macvlan mode through netlink In order to support all three modes of macvlan at runtime, extend the existing netlink protocol to allow choosing the mode per macvlan slave interface. This depends on a matching patch to iproute2 in order to become accessible in user land. Signed-off-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 56 +++++++++++++++++++++++++++++++++++++++++++------ include/linux/if_link.h | 15 +++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d6bd84353f4..322112c7358 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -33,12 +33,6 @@ #define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) -enum macvlan_mode { - MACVLAN_MODE_PRIVATE = 1, - MACVLAN_MODE_VEPA = 2, - MACVLAN_MODE_BRIDGE = 4, -}; - struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; @@ -614,6 +608,17 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } + + if (data && data[IFLA_MACVLAN_MODE]) { + switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { + case MACVLAN_MODE_PRIVATE: + case MACVLAN_MODE_VEPA: + case MACVLAN_MODE_BRIDGE: + break; + default: + return -EINVAL; + } + } return 0; } @@ -678,6 +683,10 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, vlan->dev = dev; vlan->port = port; + vlan->mode = MACVLAN_MODE_VEPA; + if (data && data[IFLA_MACVLAN_MODE]) + vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); + err = register_netdevice(dev); if (err < 0) return err; @@ -699,6 +708,36 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head) macvlan_port_destroy(port->dev); } +static int macvlan_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + if (data && data[IFLA_MACVLAN_MODE]) + vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); + return 0; +} + +static size_t macvlan_get_size(const struct net_device *dev) +{ + return nla_total_size(4); +} + +static int macvlan_fill_info(struct sk_buff *skb, + const struct net_device *dev) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + NLA_PUT_U32(skb, IFLA_MACVLAN_MODE, vlan->mode); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { + [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, +}; + static struct rtnl_link_ops macvlan_link_ops __read_mostly = { .kind = "macvlan", .priv_size = sizeof(struct macvlan_dev), @@ -707,6 +746,11 @@ static struct rtnl_link_ops macvlan_link_ops __read_mostly = { .validate = macvlan_validate, .newlink = macvlan_newlink, .dellink = macvlan_dellink, + .maxtype = IFLA_MACVLAN_MAX, + .policy = macvlan_policy, + .changelink = macvlan_changelink, + .get_size = macvlan_get_size, + .fill_info = macvlan_fill_info, }; static int macvlan_device_event(struct notifier_block *unused, diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 1d3b2425f66..6674791622c 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -181,4 +181,19 @@ struct ifla_vlan_qos_mapping { __u32 to; }; +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ +}; + #endif /* _LINUX_IF_LINK_H */ -- cgit v1.2.3-70-g09d2 From fc4a7489663250360cd40d5adf06a08d1c5d54df Mon Sep 17 00:00:00 2001 From: Patrick Mullaney Date: Thu, 3 Dec 2009 15:59:22 -0800 Subject: netdevice: provide common routine for macvlan and vlan operstate management Provide common routine for the transition of operational state for a leaf device during a root device transition. Signed-off-by: Patrick Mullaney Acked-by: Arnd Bergmann Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 24 +++--------------------- include/linux/netdevice.h | 3 +++ net/8021q/vlan.c | 29 ++++------------------------- net/core/dev.c | 27 +++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 46 deletions(-) (limited to 'drivers/net/macvlan.c') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 93c3e6edf70..21a9c9ab4b3 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -582,25 +582,6 @@ static void macvlan_port_destroy(struct net_device *dev) kfree(port); } -static void macvlan_transfer_operstate(struct net_device *dev) -{ - struct macvlan_dev *vlan = netdev_priv(dev); - const struct net_device *lowerdev = vlan->lowerdev; - - if (lowerdev->operstate == IF_OPER_DORMANT) - netif_dormant_on(dev); - else - netif_dormant_off(dev); - - if (netif_carrier_ok(lowerdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } -} - static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) { if (tb[IFLA_ADDRESS]) { @@ -693,7 +674,7 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev, return err; list_add_tail(&vlan->list, &port->vlans); - macvlan_transfer_operstate(dev); + netif_stacked_transfer_operstate(lowerdev, dev); return 0; } @@ -768,7 +749,8 @@ static int macvlan_device_event(struct notifier_block *unused, switch (event) { case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) - macvlan_transfer_operstate(vlan->dev); + netif_stacked_transfer_operstate(vlan->lowerdev, + vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index daf13d36749..a3fccc85b1a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1981,6 +1981,9 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one, unsigned long mask); unsigned long netdev_fix_features(unsigned long features, const char *name); +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev); + static inline int net_gso_ok(int features, int gso_type) { int feature = gso_type << NETIF_F_GSO_SHIFT; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index ec3769295da..33f90e7362c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -184,27 +184,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) dev_put(real_dev); } -static void vlan_transfer_operstate(const struct net_device *dev, - struct net_device *vlandev) -{ - /* Have to respect userspace enforced dormant state - * of real device, also must allow supplicant running - * on VLAN device - */ - if (dev->operstate == IF_OPER_DORMANT) - netif_dormant_on(vlandev); - else - netif_dormant_off(vlandev); - - if (netif_carrier_ok(dev)) { - if (!netif_carrier_ok(vlandev)) - netif_carrier_on(vlandev); - } else { - if (netif_carrier_ok(vlandev)) - netif_carrier_off(vlandev); - } -} - int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id) { const char *name = real_dev->name; @@ -262,7 +241,7 @@ int register_vlan_dev(struct net_device *dev) /* Account for reference in struct vlan_dev_info */ dev_hold(real_dev); - vlan_transfer_operstate(real_dev, dev); + netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place @@ -453,7 +432,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, if (!vlandev) continue; - vlan_transfer_operstate(dev, vlandev); + netif_stacked_transfer_operstate(dev, vlandev); } break; @@ -511,7 +490,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_info(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs & ~IFF_UP); - vlan_transfer_operstate(dev, vlandev); + netif_stacked_transfer_operstate(dev, vlandev); } break; @@ -529,7 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan = vlan_dev_info(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP); - vlan_transfer_operstate(dev, vlandev); + netif_stacked_transfer_operstate(dev, vlandev); } break; diff --git a/net/core/dev.c b/net/core/dev.c index 0913a08a87d..c36a17aafcf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4900,6 +4900,33 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + /** * register_netdevice - register a network device * @dev: device to register -- cgit v1.2.3-70-g09d2