diff options
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- | net/packet/af_packet.c | 156 |
1 files changed, 100 insertions, 56 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8298e676f5a..91cb1d71f01 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -61,6 +61,7 @@ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -163,8 +164,13 @@ struct packet_mreq_max { static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing, int tx_ring); +#define PGV_FROM_VMALLOC 1 +struct pgv { + char *buffer; +}; + struct packet_ring_buffer { - char **pg_vec; + struct pgv *pg_vec; unsigned int head; unsigned int frames_per_block; unsigned int frame_size; @@ -217,6 +223,13 @@ struct packet_skb_cb { #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) +static inline __pure struct page *pgv_to_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + return virt_to_page(addr); +} + static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union { @@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) switch (po->tp_version) { case TPACKET_V1: h.h1->tp_status = status; - flush_dcache_page(virt_to_page(&h.h1->tp_status)); + flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: h.h2->tp_status = status; - flush_dcache_page(virt_to_page(&h.h2->tp_status)); + flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; default: pr_err("TPACKET version not supported\n"); @@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame) h.raw = frame; switch (po->tp_version) { case TPACKET_V1: - flush_dcache_page(virt_to_page(&h.h1->tp_status)); + flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return h.h1->tp_status; case TPACKET_V2: - flush_dcache_page(virt_to_page(&h.h2->tp_status)); + flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return h.h2->tp_status; default: pr_err("TPACKET version not supported\n"); @@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po, pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; - h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size); + h.raw = rb->pg_vec[pg_vec_pos].buffer + + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; @@ -503,7 +517,8 @@ out_free: return err; } -static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, +static inline unsigned int run_filter(const struct sk_buff *skb, + const struct sock *sk, unsigned int res) { struct sk_filter *filter; @@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk, rcu_read_lock_bh(); filter = rcu_dereference_bh(sk->sk_filter); if (filter != NULL) - res = sk_run_filter(skb, filter->insns, filter->len); + res = sk_run_filter(skb, filter->insns); rcu_read_unlock_bh(); return res; } /* - This function makes lazy skb cloning in hope that most of packets - are discarded by BPF. - - Note tricky part: we DO mangle shared skb! skb->data, skb->len - and skb->cb are mangled. It works because (and until) packets - falling here are owned by current CPU. Output packets are cloned - by dev_queue_xmit_nit(), input packets are processed by net_bh - sequencially, so that if we return skb to original state on exit, - we will not harm anyone. + * This function makes lazy skb cloning in hope that most of packets + * are discarded by BPF. + * + * Note tricky part: we DO mangle shared skb! skb->data, skb->len + * and skb->cb are mangled. It works because (and until) packets + * falling here are owned by current CPU. Output packets are cloned + * by dev_queue_xmit_nit(), input packets are processed by net_bh + * sequencially, so that if we return skb to original state on exit, + * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, @@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, if (dev->header_ops) { /* The device has an explicit notion of ll header, - exported to higher levels. - - Otherwise, the device hides datails of it frame - structure, so that corresponding packet head - never delivered to user. + * exported to higher levels. + * + * Otherwise, the device hides details of its frame + * structure, so that corresponding packet head is + * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); @@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, __packet_set_status(po, h.raw, status); smp_mb(); +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 { - struct page *p_start, *p_end; - u8 *h_end = h.raw + macoff + snaplen - 1; - - p_start = virt_to_page(h.raw); - p_end = virt_to_page(h_end); - while (p_start <= p_end) { - flush_dcache_page(p_start); - p_start++; - } + u8 *start, *end; + + end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); + for (start = h.raw; start < end; start += PAGE_SIZE) + flush_dcache_page(pgv_to_page(start)); } +#endif sk->sk_data_ready(sk, 0); @@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, } err = -EFAULT; - page = virt_to_page(data); offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); @@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, return -EFAULT; } + page = pgv_to_page(data); + data += len; flush_dcache_page(page); get_page(page); - skb_fill_page_desc(skb, - nr_frags, - page++, offset, len); + skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; @@ -1638,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = skb->csum_start - - skb_headroom(skb); + vnet_hdr.csum_start = skb_checksum_start_offset(skb); vnet_hdr.csum_offset = skb->csum_offset; } /* else everything is zero */ @@ -2325,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = { .close = packet_mm_close, }; -static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) +static void free_pg_vec(struct pgv *pg_vec, unsigned int order, + unsigned int len) { int i; for (i = 0; i < len; i++) { - if (likely(pg_vec[i])) - free_pages((unsigned long) pg_vec[i], order); + if (likely(pg_vec[i].buffer)) { + if (is_vmalloc_addr(pg_vec[i].buffer)) + vfree(pg_vec[i].buffer); + else + free_pages((unsigned long)pg_vec[i].buffer, + order); + pg_vec[i].buffer = NULL; + } } kfree(pg_vec); } static inline char *alloc_one_pg_vec_page(unsigned long order) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN; + char *buffer = NULL; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | + __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; - return (char *) __get_free_pages(gfp_flags, order); + buffer = (char *) __get_free_pages(gfp_flags, order); + + if (buffer) + return buffer; + + /* + * __get_free_pages failed, fall back to vmalloc + */ + buffer = vzalloc((1 << order) * PAGE_SIZE); + + if (buffer) + return buffer; + + /* + * vmalloc failed, lets dig into swap here + */ + gfp_flags &= ~__GFP_NORETRY; + buffer = (char *)__get_free_pages(gfp_flags, order); + if (buffer) + return buffer; + + /* + * complete and utter failure + */ + return NULL; } -static char **alloc_pg_vec(struct tpacket_req *req, int order) +static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; - char **pg_vec; + struct pgv *pg_vec; int i; - pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); + pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { - pg_vec[i] = alloc_one_pg_vec_page(order); - if (unlikely(!pg_vec[i])) + pg_vec[i].buffer = alloc_one_pg_vec_page(order); + if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } @@ -2371,7 +2415,7 @@ out_free_pgvec: static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing, int tx_ring) { - char **pg_vec = NULL; + struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); int was_running, order = 0; struct packet_ring_buffer *rb; @@ -2456,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, mutex_lock(&po->pg_vec_lock); if (closing || atomic_read(&po->mapped) == 0) { err = 0; -#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) spin_lock_bh(&rb_queue->lock); - pg_vec = XC(rb->pg_vec, pg_vec); + swap(rb->pg_vec, pg_vec); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); - order = XC(rb->pg_vec_order, order); - req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr); + swap(rb->pg_vec_order, order); + swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); -#undef XC if (atomic_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); @@ -2533,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock, continue; for (i = 0; i < rb->pg_vec_len; i++) { - struct page *page = virt_to_page(rb->pg_vec[i]); + struct page *page; + void *kaddr = rb->pg_vec[i].buffer; int pg_num; - for (pg_num = 0; pg_num < rb->pg_vec_pages; - pg_num++, page++) { + for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { + page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; + kaddr += PAGE_SIZE; } } } |