diff options
Diffstat (limited to 'net/packet')
-rw-r--r-- | net/packet/Kconfig | 10 | ||||
-rw-r--r-- | net/packet/af_packet.c | 300 |
2 files changed, 211 insertions, 99 deletions
diff --git a/net/packet/Kconfig b/net/packet/Kconfig index 34ff93ff894..0060e3b396b 100644 --- a/net/packet/Kconfig +++ b/net/packet/Kconfig @@ -14,13 +14,3 @@ config PACKET be called af_packet. If unsure, say Y. - -config PACKET_MMAP - bool "Packet socket: mmapped IO" - depends on PACKET - help - If you say Y here, the Packet protocol driver will use an IO - mechanism that results in faster communication. - - If unsure, say N. - diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 939471ef8d5..031a5e6fb4a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -80,6 +80,7 @@ #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> +#include <linux/virtio_net.h> #ifdef CONFIG_INET #include <net/inet_common.h> @@ -156,7 +157,6 @@ struct packet_mreq_max { unsigned char mr_address[MAX_ADDR_LEN]; }; -#ifdef CONFIG_PACKET_MMAP static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing, int tx_ring); @@ -176,7 +176,6 @@ struct packet_ring_buffer { struct packet_sock; static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); -#endif static void packet_flush_mclist(struct sock *sk); @@ -184,26 +183,23 @@ struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; struct tpacket_stats stats; -#ifdef CONFIG_PACKET_MMAP struct packet_ring_buffer rx_ring; struct packet_ring_buffer tx_ring; int copy_thresh; -#endif spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned int running:1, /* prot_hook is attached*/ auxdata:1, - origdev:1; + origdev:1, + has_vnet_hdr:1; int ifindex; /* bound device */ __be16 num; struct packet_mclist *mclist; -#ifdef CONFIG_PACKET_MMAP atomic_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; unsigned int tp_loss:1; -#endif struct packet_type prot_hook ____cacheline_aligned_in_smp; }; @@ -217,8 +213,6 @@ struct packet_skb_cb { #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) -#ifdef CONFIG_PACKET_MMAP - static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union { @@ -313,8 +307,6 @@ static inline void packet_increment_head(struct packet_ring_buffer *buff) buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } -#endif - static inline struct packet_sock *pkt_sk(struct sock *sk) { return (struct packet_sock *)sk; @@ -638,7 +630,6 @@ drop: return 0; } -#ifdef CONFIG_PACKET_MMAP static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1054,7 +1045,30 @@ out: mutex_unlock(&po->pg_vec_lock); return err; } -#endif + +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, + size_t reserve, size_t len, + size_t linear, int noblock, + int *err) +{ + struct sk_buff *skb; + + /* Under a page? Don't bother with paged skb. */ + if (prepad + len < PAGE_SIZE || !linear) + linear = len; + + skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, + err); + if (!skb) + return NULL; + + skb_reserve(skb, reserve); + skb_put(skb, linear); + skb->data_len = len - linear; + skb->len += len - linear; + + return skb; +} static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) @@ -1066,14 +1080,17 @@ static int packet_snd(struct socket *sock, __be16 proto; unsigned char *addr; int ifindex, err, reserve = 0; + struct virtio_net_hdr vnet_hdr = { 0 }; + int offset = 0; + int vnet_hdr_len; + struct packet_sock *po = pkt_sk(sk); + unsigned short gso_type = 0; /* * Get and verify the address. */ if (saddr == NULL) { - struct packet_sock *po = pkt_sk(sk); - ifindex = po->ifindex; proto = po->num; addr = NULL; @@ -1100,25 +1117,74 @@ static int packet_snd(struct socket *sock, if (!(dev->flags & IFF_UP)) goto out_unlock; + if (po->has_vnet_hdr) { + vnet_hdr_len = sizeof(vnet_hdr); + + err = -EINVAL; + if (len < vnet_hdr_len) + goto out_unlock; + + len -= vnet_hdr_len; + + err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov, + vnet_hdr_len); + if (err < 0) + goto out_unlock; + + if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && + (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 > + vnet_hdr.hdr_len)) + vnet_hdr.hdr_len = vnet_hdr.csum_start + + vnet_hdr.csum_offset + 2; + + err = -EINVAL; + if (vnet_hdr.hdr_len > len) + goto out_unlock; + + if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + gso_type = SKB_GSO_TCPV6; + break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; + default: + goto out_unlock; + } + + if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) + gso_type |= SKB_GSO_TCP_ECN; + + if (vnet_hdr.gso_size == 0) + goto out_unlock; + + } + } + err = -EMSGSIZE; - if (len > dev->mtu+reserve) + if (!gso_type && (len > dev->mtu+reserve)) goto out_unlock; - skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev), - msg->msg_flags & MSG_DONTWAIT, &err); + err = -ENOBUFS; + skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev), + LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len, + msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; - skb_reserve(skb, LL_RESERVED_SPACE(dev)); - skb_reset_network_header(skb); + skb_set_network_header(skb, reserve); err = -EINVAL; if (sock->type == SOCK_DGRAM && - dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0) + (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0) goto out_free; /* Returns -EFAULT on error */ - err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; @@ -1127,6 +1193,25 @@ static int packet_snd(struct socket *sock, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + if (po->has_vnet_hdr) { + if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (!skb_partial_csum_set(skb, vnet_hdr.csum_start, + vnet_hdr.csum_offset)) { + err = -EINVAL; + goto out_free; + } + } + + skb_shinfo(skb)->gso_size = vnet_hdr.gso_size; + skb_shinfo(skb)->gso_type = gso_type; + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + + len += vnet_hdr_len; + } + /* * Now send it */ @@ -1151,13 +1236,11 @@ out: static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) { -#ifdef CONFIG_PACKET_MMAP struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); if (po->tx_ring.pg_vec) return tpacket_snd(po, msg); else -#endif return packet_snd(sock, msg, len); } @@ -1171,9 +1254,7 @@ static int packet_release(struct socket *sock) struct sock *sk = sock->sk; struct packet_sock *po; struct net *net; -#ifdef CONFIG_PACKET_MMAP struct tpacket_req req; -#endif if (!sk) return 0; @@ -1181,28 +1262,25 @@ static int packet_release(struct socket *sock) net = sock_net(sk); po = pkt_sk(sk); - write_lock_bh(&net->packet.sklist_lock); - sk_del_node_init(sk); + spin_lock_bh(&net->packet.sklist_lock); + sk_del_node_init_rcu(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); - write_unlock_bh(&net->packet.sklist_lock); - - /* - * Unhook packet receive handler. - */ + spin_unlock_bh(&net->packet.sklist_lock); + spin_lock(&po->bind_lock); if (po->running) { /* - * Remove the protocol hook + * Remove from protocol table */ - dev_remove_pack(&po->prot_hook); po->running = 0; po->num = 0; + __dev_remove_pack(&po->prot_hook); __sock_put(sk); } + spin_unlock(&po->bind_lock); packet_flush_mclist(sk); -#ifdef CONFIG_PACKET_MMAP memset(&req, 0, sizeof(req)); if (po->rx_ring.pg_vec) @@ -1210,12 +1288,11 @@ static int packet_release(struct socket *sock) if (po->tx_ring.pg_vec) packet_set_ring(sk, &req, 1, 1); -#endif + synchronize_net(); /* * Now the socket is dead. No more input will appear. */ - sock_orphan(sk); sock->sk = NULL; @@ -1399,10 +1476,11 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->running = 1; } - write_lock_bh(&net->packet.sklist_lock); - sk_add_node(sk, &net->packet.sklist); + spin_lock_bh(&net->packet.sklist_lock); + sk_add_node_rcu(sk, &net->packet.sklist); sock_prot_inuse_add(net, &packet_proto, 1); - write_unlock_bh(&net->packet.sklist_lock); + spin_unlock_bh(&net->packet.sklist_lock); + return 0; out: return err; @@ -1420,6 +1498,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; int copied, err; struct sockaddr_ll *sll; + int vnet_hdr_len = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) @@ -1451,6 +1530,48 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb == NULL) goto out; + if (pkt_sk(sk)->has_vnet_hdr) { + struct virtio_net_hdr vnet_hdr = { 0 }; + + err = -EINVAL; + vnet_hdr_len = sizeof(vnet_hdr); + if ((len -= vnet_hdr_len) < 0) + goto out_free; + + if (skb_is_gso(skb)) { + struct skb_shared_info *sinfo = skb_shinfo(skb); + + /* This is a hint as to how much should be linear. */ + vnet_hdr.hdr_len = skb_headlen(skb); + vnet_hdr.gso_size = sinfo->gso_size; + if (sinfo->gso_type & SKB_GSO_TCPV4) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (sinfo->gso_type & SKB_GSO_TCPV6) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; + else if (sinfo->gso_type & SKB_GSO_FCOE) + goto out_free; + else + BUG(); + if (sinfo->gso_type & SKB_GSO_TCP_ECN) + vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; + } else + vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet_hdr.csum_start = skb->csum_start - + skb_headroom(skb); + vnet_hdr.csum_offset = skb->csum_offset; + } /* else everything is zero */ + + err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr, + vnet_hdr_len); + if (err < 0) + goto out_free; + } + /* * If the address length field is there to be filled in, we fill * it in now. @@ -1502,7 +1623,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ - err = (flags&MSG_TRUNC) ? skb->len : copied; + err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); @@ -1613,7 +1734,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) goto done; err = -EINVAL; - if (mreq->mr_alen > dev->addr_len) + if (mreq->mr_alen != dev->addr_len) goto done; err = -ENOBUFS; @@ -1732,7 +1853,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return ret; } -#ifdef CONFIG_PACKET_MMAP case PACKET_RX_RING: case PACKET_TX_RING: { @@ -1740,6 +1860,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen < sizeof(req)) return -EINVAL; + if (pkt_sk(sk)->has_vnet_hdr) + return -EINVAL; if (copy_from_user(&req, optval, sizeof(req))) return -EFAULT; return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); @@ -1801,7 +1923,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->tp_loss = !!val; return 0; } -#endif case PACKET_AUXDATA: { int val; @@ -1826,6 +1947,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv po->origdev = !!val; return 0; } + case PACKET_VNET_HDR: + { + int val; + + if (sock->type != SOCK_RAW) + return -EINVAL; + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) + return -EBUSY; + if (optlen < sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + + po->has_vnet_hdr = !!val; + return 0; + } default: return -ENOPROTOOPT; } @@ -1876,7 +2013,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, data = &val; break; -#ifdef CONFIG_PACKET_MMAP + case PACKET_VNET_HDR: + if (len > sizeof(int)) + len = sizeof(int); + val = po->has_vnet_hdr; + + data = &val; + break; case PACKET_VERSION: if (len > sizeof(int)) len = sizeof(int); @@ -1912,7 +2055,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, val = po->tp_loss; data = &val; break; -#endif default: return -ENOPROTOOPT; } @@ -1932,8 +2074,8 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void struct net_device *dev = data; struct net *net = dev_net(dev); - read_lock(&net->packet.sklist_lock); - sk_for_each(sk, node, &net->packet.sklist) { + rcu_read_lock(); + sk_for_each_rcu(sk, node, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { @@ -1961,18 +2103,19 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void } break; case NETDEV_UP: - spin_lock(&po->bind_lock); - if (dev->ifindex == po->ifindex && po->num && - !po->running) { - dev_add_pack(&po->prot_hook); - sock_hold(sk); - po->running = 1; + if (dev->ifindex == po->ifindex) { + spin_lock(&po->bind_lock); + if (po->num && !po->running) { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + spin_unlock(&po->bind_lock); } - spin_unlock(&po->bind_lock); break; } } - read_unlock(&net->packet.sklist_lock); + rcu_read_unlock(); return NOTIFY_DONE; } @@ -2032,11 +2175,6 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, return 0; } -#ifndef CONFIG_PACKET_MMAP -#define packet_mmap sock_no_mmap -#define packet_poll datagram_poll -#else - static unsigned int packet_poll(struct file *file, struct socket *sock, poll_table *wait) { @@ -2318,8 +2456,6 @@ out: mutex_unlock(&po->pg_vec_lock); return err; } -#endif - static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, @@ -2374,40 +2510,26 @@ static struct notifier_block packet_netdev_notifier = { }; #ifdef CONFIG_PROC_FS -static inline struct sock *packet_seq_idx(struct net *net, loff_t off) -{ - struct sock *s; - struct hlist_node *node; - - sk_for_each(s, node, &net->packet.sklist) { - if (!off--) - return s; - } - return NULL; -} static void *packet_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(seq_file_net(seq)->packet.sklist_lock) + __acquires(RCU) { struct net *net = seq_file_net(seq); - read_lock(&net->packet.sklist_lock); - return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN; + + rcu_read_lock(); + return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); - ++*pos; - return (v == SEQ_START_TOKEN) - ? sk_head(&net->packet.sklist) - : sk_next((struct sock *)v) ; + return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) - __releases(seq_file_net(seq)->packet.sklist_lock) + __releases(RCU) { - struct net *net = seq_file_net(seq); - read_unlock(&net->packet.sklist_lock); + rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) @@ -2415,7 +2537,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); else { - struct sock *s = v; + struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, @@ -2457,9 +2579,9 @@ static const struct file_operations packet_seq_fops = { #endif -static int packet_net_init(struct net *net) +static int __net_init packet_net_init(struct net *net) { - rwlock_init(&net->packet.sklist_lock); + spin_lock_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops)) @@ -2468,7 +2590,7 @@ static int packet_net_init(struct net *net) return 0; } -static void packet_net_exit(struct net *net) +static void __net_exit packet_net_exit(struct net *net) { proc_net_remove(net, "packet"); } |