diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 20:01:30 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-10 20:01:30 -0800 |
commit | c5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch) | |
tree | 9830baf38832769e1cf621708889111bbe3c93df /drivers/net/vxlan.c | |
parent | 29afc4e9a408f2304e09c6dd0dbcfbd2356d0faa (diff) | |
parent | 9399f0c51489ae8c16d6559b82a452fdc1895e91 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) More iov_iter conversion work from Al Viro.
[ The "crypto: switch af_alg_make_sg() to iov_iter" commit was
wrong, and this pull actually adds an extra commit on top of the
branch I'm pulling to fix that up, so that the pre-merge state is
ok. - Linus ]
2) Various optimizations to the ipv4 forwarding information base trie
lookup implementation. From Alexander Duyck.
3) Remove sock_iocb altogether, from CHristoph Hellwig.
4) Allow congestion control algorithm selection via routing metrics.
From Daniel Borkmann.
5) Make ipv4 uncached route list per-cpu, from Eric Dumazet.
6) Handle rfs hash collisions more gracefully, also from Eric Dumazet.
7) Add xmit_more support to r8169, e1000, and e1000e drivers. From
Florian Westphal.
8) Transparent Ethernet Bridging support for GRO, from Jesse Gross.
9) Add BPF packet actions to packet scheduler, from Jiri Pirko.
10) Add support for uniqu flow IDs to openvswitch, from Joe Stringer.
11) New NetCP ethernet driver, from Muralidharan Karicheri and Wingman
Kwok.
12) More sanely handle out-of-window dupacks, which can result in
serious ACK storms. From Neal Cardwell.
13) Various rhashtable bug fixes and enhancements, from Herbert Xu,
Patrick McHardy, and Thomas Graf.
14) Support xmit_more in be2net, from Sathya Perla.
15) Group Policy extensions for vxlan, from Thomas Graf.
16) Remove Checksum Offload support for vxlan, from Tom Herbert.
17) Like ipv4, support lockless transmit over ipv6 UDP sockets. From
Vlad Yasevich.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1494+1 commits)
crypto: fix af_alg_make_sg() conversion to iov_iter
ipv4: Namespecify TCP PMTU mechanism
i40e: Fix for stats init function call in Rx setup
tcp: don't include Fast Open option in SYN-ACK on pure SYN-data
openvswitch: Only set TUNNEL_VXLAN_OPT if VXLAN-GBP metadata is set
ipv6: Make __ipv6_select_ident static
ipv6: Fix fragment id assignment on LE arches.
bridge: Fix inability to add non-vlan fdb entry
net: Mellanox: Delete unnecessary checks before the function call "vunmap"
cxgb4: Add support in cxgb4 to get expansion rom version via ethtool
ethtool: rename reserved1 memeber in ethtool_drvinfo for expansion ROM version
net: dsa: Remove redundant phy_attach()
IB/mlx4: Reset flow support for IB kernel ULPs
IB/mlx4: Always use the correct port for mirrored multicast attachments
net/bonding: Fix potential bad memory access during bonding events
tipc: remove tipc_snprintf
tipc: nl compat add noop and remove legacy nl framework
tipc: convert legacy nl stats show to nl compat
tipc: convert legacy nl net id get to nl compat
tipc: convert legacy nl net id set to nl compat
...
Diffstat (limited to 'drivers/net/vxlan.c')
-rw-r--r-- | drivers/net/vxlan.c | 440 |
1 files changed, 331 insertions, 109 deletions
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a8c755dcab1..0e57e862c39 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -61,12 +61,6 @@ #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ - /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. @@ -269,15 +263,20 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } -/* Find VXLAN socket based on network namespace, address family and UDP port */ -static struct vxlan_sock *vxlan_find_sock(struct net *net, - sa_family_t family, __be16 port) +/* Find VXLAN socket based on network namespace, address family and UDP port + * and enabled unshareable flags. + */ +static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, + __be16 port, u32 flags) { struct vxlan_sock *vs; + flags &= VXLAN_F_RCV_FLAGS; + hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && - inet_sk(vs->sock->sk)->sk.sk_family == family) + inet_sk(vs->sock->sk)->sk.sk_family == family && + vs->flags == flags) return vs; } return NULL; @@ -297,11 +296,12 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, - sa_family_t family, __be16 port) + sa_family_t family, __be16 port, + u32 flags) { struct vxlan_sock *vs; - vs = vxlan_find_sock(net, family, port); + vs = vxlan_find_sock(net, family, port, flags); if (!vs) return NULL; @@ -340,6 +340,11 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, ndm->ndm_flags = fdb->flags; ndm->ndm_type = RTN_UNICAST; + if (!net_eq(dev_net(vxlan->dev), vxlan->net) && + nla_put_s32(skb, NDA_LINK_NETNSID, + peernet2id(dev_net(vxlan->dev), vxlan->net))) + goto nla_put_failure; + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; @@ -364,7 +369,8 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; - return nlmsg_end(skb, nlh); + nlmsg_end(skb, nlh); + return 0; nla_put_failure: nlmsg_cancel(skb, nlh); @@ -379,6 +385,7 @@ static inline size_t vxlan_nlmsg_size(void) + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } @@ -545,15 +552,51 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, + unsigned int off, + struct vxlanhdr *vh, size_t hdrlen, + u32 data) +{ + size_t start, offset, plen; + + if (skb->remcsum_offload) + return vh; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return NULL; + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + /* Pull checksum that will be written */ + if (skb_gro_header_hard(skb, off + plen)) { + vh = skb_gro_header_slow(skb, off + plen, off); + if (!vh) + return NULL; + } + + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + + skb->remcsum_offload = 1; + + return vh; +} + +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_vx, off_eth; - const struct packet_offload *ptype; - __be16 type; + unsigned int hlen, off_vx; int flush = 1; + struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, + udp_offloads); + u32 flags; off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); @@ -563,15 +606,17 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff if (unlikely(!vh)) goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - off_eth = skb_gro_offset(skb); - hlen = off_eth + sizeof(*eh); - eh = skb_gro_header_fast(skb, off_eth); - if (skb_gro_header_hard(skb, hlen)) { - eh = skb_gro_header_slow(skb, hlen, off_eth); - if (unlikely(!eh)) + flags = ntohl(vh->vx_flags); + + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), + ntohl(vh->vx_vni)); + + if (!vh) goto out; } @@ -582,54 +627,27 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); - eh2 = (struct ethhdr *)(p->data + off_eth); - if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { + if (vh->vx_flags != vh2->vx_flags || + vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { - flush = 1; - goto out_unlock; - } + pp = eth_gro_receive(head, skb); - skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ - skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; return pp; } -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { - struct ethhdr *eh; - struct packet_offload *ptype; - __be16 type; - int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); - int err = -ENOSYS; - udp_tunnel_gro_complete(skb, nhoff); - eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); - - rcu_read_unlock(); - return err; + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } /* Notify netdevs that UDP port started listening */ @@ -991,7 +1009,7 @@ static bool vxlan_snoop(struct net_device *dev, if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", - src_mac, &rdst->remote_ip, &src_ip); + src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; f->updated = jiffies; @@ -1131,33 +1149,107 @@ static void vxlan_igmp_leave(struct work_struct *work) dev_put(vxlan->dev); } +static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, + size_t hdrlen, u32 data) +{ + size_t start, offset, plen; + + if (skb->remcsum_offload) { + /* Already processed in GRO path */ + skb->remcsum_offload = 0; + return vh; + } + + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; + offset = start + ((data & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check)); + + plen = hdrlen + offset + sizeof(u16); + + if (!pskb_may_pull(skb, plen)) + return NULL; + + vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); + + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset); + + return vh; +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct vxlan_sock *vs; struct vxlanhdr *vxh; + u32 flags, vni; + struct vxlan_metadata md = {0}; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) goto error; - /* Return packets with reserved bits set */ vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - if (vxh->vx_flags != htonl(VXLAN_FLAGS) || - (vxh->vx_vni & htonl(0xff))) { - netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); - goto error; + flags = ntohl(vxh->vx_flags); + vni = ntohl(vxh->vx_vni); + + if (flags & VXLAN_HF_VNI) { + flags &= ~VXLAN_HF_VNI; + } else { + /* VNI flag always required to be set */ + goto bad_flags; } if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) goto drop; + vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; - vs->rcv(vs, skb, vxh->vx_vni); + if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { + vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni); + if (!vxh) + goto drop; + + flags &= ~VXLAN_HF_RCO; + vni &= VXLAN_VID_MASK; + } + + /* For backwards compatibility, only allow reserved fields to be + * used by VXLAN extensions if explicitly requested. + */ + if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) { + struct vxlanhdr_gbp *gbp; + + gbp = (struct vxlanhdr_gbp *)vxh; + md.gbp = ntohs(gbp->policy_id); + + if (gbp->dont_learn) + md.gbp |= VXLAN_GBP_DONT_LEARN; + + if (gbp->policy_applied) + md.gbp |= VXLAN_GBP_POLICY_APPLIED; + + flags &= ~VXLAN_GBP_USED_BITS; + } + + if (flags || (vni & ~VXLAN_VID_MASK)) { + /* If there are any unprocessed flags remaining treat + * this as a malformed packet. This behavior diverges from + * VXLAN RFC (RFC7348) which stipulates that bits in reserved + * in reserved fields are to be ignored. The approach here + * maintains compatbility with previous stack code, and also + * is more robust and provides a little more security in + * adding extensions to VXLAN. + */ + + goto bad_flags; + } + + md.vni = vxh->vx_vni; + vs->rcv(vs, skb, &md); return 0; drop: @@ -1165,13 +1257,17 @@ drop: kfree_skb(skb); return 0; +bad_flags: + netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); + error: /* Return non vxlan pkt */ return 1; } -static void vxlan_rcv(struct vxlan_sock *vs, - struct sk_buff *skb, __be32 vx_vni) +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md) { struct iphdr *oip = NULL; struct ipv6hdr *oip6 = NULL; @@ -1182,7 +1278,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(vx_vni) >> 8; + vni = ntohl(md->vni) >> 8; /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1216,6 +1312,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, goto drop; skb_reset_network_header(skb); + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1565,20 +1662,54 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } +static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, + struct vxlan_metadata *md) +{ + struct vxlanhdr_gbp *gbp; + + if (!md->gbp) + return; + + gbp = (struct vxlanhdr_gbp *)vxh; + vxh->vx_flags |= htonl(VXLAN_HF_GBP); + + if (md->gbp & VXLAN_GBP_DONT_LEARN) + gbp->dont_learn = 1; + + if (md->gbp & VXLAN_GBP_POLICY_APPLIED) + gbp->policy_applied = 1; + + gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); +} + #if IS_ENABLED(CONFIG_IPV6) -static int vxlan6_xmit_skb(struct vxlan_sock *vs, - struct dst_entry *dst, struct sk_buff *skb, +static int vxlan6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, - __be16 src_port, __be16 dst_port, __be32 vni, - bool xnet) + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; - bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk); + bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX); + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) { err = -EINVAL; goto err; @@ -1588,7 +1719,7 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + sizeof(struct ipv6hdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1604,13 +1735,33 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, } vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = vni; + vxh->vx_flags = htonl(VXLAN_HF_VNI); + vxh->vx_vni = md->vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + + if (vxflags & VXLAN_F_GBP) + vxlan_build_gbp_hdr(vxh, vxflags, md); skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio, - ttl, src_port, dst_port); + udp_tunnel6_xmit_skb(dst, skb, dev, saddr, daddr, prio, + ttl, src_port, dst_port, + !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX)); return 0; err: dst_release(dst); @@ -1618,23 +1769,38 @@ err: } #endif -int vxlan_xmit_skb(struct vxlan_sock *vs, - struct rtable *rt, struct sk_buff *skb, +int vxlan_xmit_skb(struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, __be32 vni, bool xnet) + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; int err; - bool udp_sum = !vs->sock->sk->sk_no_check_tx; + bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM); + int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + u16 hdrlen = sizeof(struct vxlanhdr); + + if ((vxflags & VXLAN_F_REMCSUM_TX) && + skb->ip_summed == CHECKSUM_PARTIAL) { + int csum_start = skb_checksum_start_offset(skb); + + if (csum_start <= VXLAN_MAX_REMCSUM_START && + !(csum_start & VXLAN_RCO_SHIFT_MASK) && + (skb->csum_offset == offsetof(struct udphdr, check) || + skb->csum_offset == offsetof(struct tcphdr, check))) { + udp_sum = false; + type |= SKB_GSO_TUNNEL_REMCSUM; + } + } - skb = udp_tunnel_handle_offloads(skb, udp_sum); + skb = iptunnel_handle_offloads(skb, udp_sum, type); if (IS_ERR(skb)) return PTR_ERR(skb); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + VXLAN_HLEN + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1648,13 +1814,33 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, return -ENOMEM; vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = htonl(VXLAN_FLAGS); - vxh->vx_vni = vni; + vxh->vx_flags = htonl(VXLAN_HF_VNI); + vxh->vx_vni = md->vni; + + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> + VXLAN_RCO_SHIFT; + + if (skb->csum_offset == offsetof(struct udphdr, check)) + data |= VXLAN_RCO_UDP; + + vxh->vx_vni |= htonl(data); + vxh->vx_flags |= htonl(VXLAN_HF_RCO); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + } + + if (vxflags & VXLAN_F_GBP) + vxlan_build_gbp_hdr(vxh, vxflags, md); skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos, - ttl, df, src_port, dst_port, xnet); + return udp_tunnel_xmit_skb(rt, skb, src, dst, tos, + ttl, df, src_port, dst_port, xnet, + !(vxflags & VXLAN_F_UDP_CSUM)); } EXPORT_SYMBOL_GPL(vxlan_xmit_skb); @@ -1711,6 +1897,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; + struct vxlan_metadata md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; @@ -1772,7 +1959,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, ip_rt_put(rt); dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port); + dst->sa.sa_family, dst_port, + vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); @@ -1781,12 +1969,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - - err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb, - fl4.saddr, dst->sin.sin_addr.s_addr, - tos, ttl, df, src_port, dst_port, - htonl(vni << 8), - !net_eq(vxlan->net, dev_net(vxlan->dev))); + md.vni = htonl(vni << 8); + md.gbp = skb->mark; + + err = vxlan_xmit_skb(rt, skb, fl4.saddr, + dst->sin.sin_addr.s_addr, tos, ttl, df, + src_port, dst_port, &md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + vxlan->flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1830,7 +2020,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_release(ndst); dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port); + dst->sa.sa_family, dst_port, + vxlan->flags); if (!dst_vxlan) goto tx_error; vxlan_encap_bypass(skb, vxlan, dst_vxlan); @@ -1838,11 +2029,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); + md.vni = htonl(vni << 8); + md.gbp = skb->mark; - err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb, - dev, &fl6.saddr, &fl6.daddr, 0, ttl, - src_port, dst_port, htonl(vni << 8), - !net_eq(vxlan->net, dev_net(vxlan->dev))); + err = vxlan6_xmit_skb(ndst, skb, dev, &fl6.saddr, &fl6.daddr, + 0, ttl, src_port, dst_port, &md, + !net_eq(vxlan->net, dev_net(vxlan->dev)), + vxlan->flags); #endif } @@ -1998,7 +2191,7 @@ static int vxlan_init(struct net_device *dev) spin_lock(&vn->sock_lock); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port); + vxlan->dst_port, vxlan->flags); if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) { /* If we have a socket with same port already, reuse it */ vxlan_vs_add_dev(vs, vxlan); @@ -2242,6 +2435,9 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, + [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, + [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -2311,15 +2507,11 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, if (ipv6) { udp_conf.family = AF_INET6; - udp_conf.use_udp6_tx_checksums = - !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); } else { udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = INADDR_ANY; - udp_conf.use_udp_checksums = - !!(flags & VXLAN_F_UDP_CSUM); } udp_conf.local_udp_port = port; @@ -2363,6 +2555,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, atomic_set(&vs->refcnt, 1); vs->rcv = rcv; vs->data = data; + vs->flags = (flags & VXLAN_F_RCV_FLAGS); /* Initialize the vxlan udp offloads structure */ vs->udp_offloads.port = port; @@ -2401,7 +2594,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, return vs; spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); + vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags); if (vs && ((vs->rcv != rcv) || !atomic_add_unless(&vs->refcnt, 1, 0))) vs = ERR_PTR(-EBUSY); @@ -2557,8 +2750,19 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; + if (data[IFLA_VXLAN_REMCSUM_TX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) + vxlan->flags |= VXLAN_F_REMCSUM_TX; + + if (data[IFLA_VXLAN_REMCSUM_RX] && + nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) + vxlan->flags |= VXLAN_F_REMCSUM_RX; + + if (data[IFLA_VXLAN_GBP]) + vxlan->flags |= VXLAN_F_GBP; + if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, - vxlan->dst_port)) { + vxlan->dst_port, vxlan->flags)) { pr_info("duplicate VNI %u\n", vni); return -EEXIST; } @@ -2625,6 +2829,8 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ 0; } @@ -2690,18 +2896,33 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, - !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX))) + !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, + !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) || + nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, + !!(vxlan->flags & VXLAN_F_REMCSUM_RX))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, IFLA_VXLAN_GBP)) + goto nla_put_failure; + return 0; nla_put_failure: return -EMSGSIZE; } +static struct net *vxlan_get_link_net(const struct net_device *dev) +{ + struct vxlan_dev *vxlan = netdev_priv(dev); + + return vxlan->net; +} + static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, @@ -2713,6 +2934,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, + .get_link_net = vxlan_get_link_net, }; static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, |