From da12c90e099789a63073fc82a19542ce54d4efb9 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 6 Jun 2013 14:49:11 +0800
Subject: netlink: Add compare function for netlink_table

As we know, netlink sockets are private resource of
net namespace, they can communicate with each other
only when they in the same net namespace. this works
well until we try to add namespace support for other
subsystems which use netlink.

Don't like ipv4 and route table.., it is not suited to
make these subsytems belong to net namespace, Such as
audit and crypto subsystems,they are more suitable to
user namespace.

So we must have the ability to make the netlink sockets
in same user namespace can communicate with each other.

This patch adds a new function pointer "compare" for
netlink_table, we can decide if the netlink sockets can
communicate with each other through this netlink_table
self-defined compare function.

The behavior isn't changed if we don't provide the compare
function for netlink_table.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/netlink.h')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6358da5eeee..f78b430f4af 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -46,6 +46,7 @@ struct netlink_kernel_cfg {
 	void		(*input)(struct sk_buff *skb);
 	struct mutex	*cb_mutex;
 	void		(*bind)(int group);
+	bool		(*compare)(struct net *net, struct sock *sk);
 };
 
 extern struct sock *__netlink_kernel_create(struct net *net, int unit,
-- 
cgit v1.2.3-70-g09d2


From bcbde0d449eda7afa8f63280b165c8300dbd00e2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 21 Jun 2013 19:38:07 +0200
Subject: net: netlink: virtual tap device management

Similarly to the networking receive path with ptype_all taps, we add
the possibility to register netdevices that are for ARPHRD_NETLINK to
the netlink subsystem, so that those can be used for netlink analyzers
resp. debuggers. We do not offer a direct callback function as out-of-tree
modules could do crap with it. Instead, a netdevice must be registered
properly and only receives a clone, managed by the netlink layer. Symbols
are exported as GPL-only.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  |  10 +++++
 net/netlink/af_netlink.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)

(limited to 'include/linux/netlink.h')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f78b430f4af..86fde81ac2e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -145,4 +145,14 @@ static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	return __netlink_dump_start(ssk, skb, nlh, control);
 }
 
+struct netlink_tap {
+	struct net_device *dev;
+	struct module *module;
+	struct list_head list;
+};
+
+extern int netlink_add_tap(struct netlink_tap *nt);
+extern int __netlink_remove_tap(struct netlink_tap *nt);
+extern int netlink_remove_tap(struct netlink_tap *nt);
+
 #endif	/* __LINUX_NETLINK_H */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 275d901d7e4..6967fbcca6c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -57,6 +57,7 @@
 #include <linux/audit.h>
 #include <linux/mutex.h>
 #include <linux/vmalloc.h>
+#include <linux/if_arp.h>
 #include <asm/cacheflush.h>
 
 #include <net/net_namespace.h>
@@ -101,6 +102,9 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
 
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
+static DEFINE_SPINLOCK(netlink_tap_lock);
+static struct list_head netlink_tap_all __read_mostly;
+
 static inline u32 netlink_group_mask(u32 group)
 {
 	return group ? 1 << (group - 1) : 0;
@@ -111,6 +115,100 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u
 	return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask];
 }
 
+int netlink_add_tap(struct netlink_tap *nt)
+{
+	if (unlikely(nt->dev->type != ARPHRD_NETLINK))
+		return -EINVAL;
+
+	spin_lock(&netlink_tap_lock);
+	list_add_rcu(&nt->list, &netlink_tap_all);
+	spin_unlock(&netlink_tap_lock);
+
+	if (nt->module)
+		__module_get(nt->module);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netlink_add_tap);
+
+int __netlink_remove_tap(struct netlink_tap *nt)
+{
+	bool found = false;
+	struct netlink_tap *tmp;
+
+	spin_lock(&netlink_tap_lock);
+
+	list_for_each_entry(tmp, &netlink_tap_all, list) {
+		if (nt == tmp) {
+			list_del_rcu(&nt->list);
+			found = true;
+			goto out;
+		}
+	}
+
+	pr_warn("__netlink_remove_tap: %p not found\n", nt);
+out:
+	spin_unlock(&netlink_tap_lock);
+
+	if (found && nt->module)
+		module_put(nt->module);
+
+	return found ? 0 : -ENODEV;
+}
+EXPORT_SYMBOL_GPL(__netlink_remove_tap);
+
+int netlink_remove_tap(struct netlink_tap *nt)
+{
+	int ret;
+
+	ret = __netlink_remove_tap(nt);
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(netlink_remove_tap);
+
+static int __netlink_deliver_tap_skb(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	struct sk_buff *nskb;
+	int ret = -ENOMEM;
+
+	dev_hold(dev);
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (nskb) {
+		nskb->dev = dev;
+		ret = dev_queue_xmit(nskb);
+		if (unlikely(ret > 0))
+			ret = net_xmit_errno(ret);
+	}
+
+	dev_put(dev);
+	return ret;
+}
+
+static void __netlink_deliver_tap(struct sk_buff *skb)
+{
+	int ret;
+	struct netlink_tap *tmp;
+
+	list_for_each_entry_rcu(tmp, &netlink_tap_all, list) {
+		ret = __netlink_deliver_tap_skb(skb, tmp->dev);
+		if (unlikely(ret))
+			break;
+	}
+}
+
+static void netlink_deliver_tap(struct sk_buff *skb)
+{
+	rcu_read_lock();
+
+	if (unlikely(!list_empty(&netlink_tap_all)))
+		__netlink_deliver_tap(skb);
+
+	rcu_read_unlock();
+}
+
 static void netlink_overrun(struct sock *sk)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -1518,6 +1616,8 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 {
 	int len = skb->len;
 
+	netlink_deliver_tap(skb);
+
 #ifdef CONFIG_NETLINK_MMAP
 	if (netlink_skb_is_mmaped(skb))
 		netlink_queue_mmaped_skb(sk, skb);
@@ -1578,6 +1678,11 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
 
 	ret = -ECONNREFUSED;
 	if (nlk->netlink_rcv != NULL) {
+		/* We could do a netlink_deliver_tap(skb) here as well
+		 * but since this is intended for the kernel only, we
+		 * should rather let it stay under the hood.
+		 */
+
 		ret = skb->len;
 		netlink_skb_set_owner_r(skb, sk);
 		NETLINK_CB(skb).sk = ssk;
@@ -2975,6 +3080,8 @@ static int __init netlink_proto_init(void)
 		nl_table[i].compare = netlink_compare;
 	}
 
+	INIT_LIST_HEAD(&netlink_tap_all);
+
 	netlink_add_usersock_entry();
 
 	sock_register(&netlink_family_ops);
-- 
cgit v1.2.3-70-g09d2


From 3a36515f729458c8efa0c124c7262d5843ad5c37 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Fri, 28 Jun 2013 03:04:23 +0200
Subject: netlink: fix splat in skb_clone with large messages

Since (c05cdb1 netlink: allow large data transfers from user-space),
netlink splats if it invokes skb_clone on large netlink skbs since:

* skb_shared_info was not correctly initialized.
* skb->destructor is not set in the cloned skb.

This was spotted by trinity:

[  894.990671] BUG: unable to handle kernel paging request at ffffc9000047b001
[  894.991034] IP: [<ffffffff81a212c4>] skb_clone+0x24/0xc0
[...]
[  894.991034] Call Trace:
[  894.991034]  [<ffffffff81ad299a>] nl_fib_input+0x6a/0x240
[  894.991034]  [<ffffffff81c3b7e6>] ? _raw_read_unlock+0x26/0x40
[  894.991034]  [<ffffffff81a5f189>] netlink_unicast+0x169/0x1e0
[  894.991034]  [<ffffffff81a601e1>] netlink_sendmsg+0x251/0x3d0

Fix it by:

1) introducing a new netlink_skb_clone function that is used in nl_fib_input,
   that sets our special skb->destructor in the cloned skb. Moreover, handle
   the release of the large cloned skb head area in the destructor path.

2) not allowing large skbuffs in the netlink broadcast path. I cannot find
   any reasonable use of the large data transfer using netlink in that path,
   moreover this helps to skip extra skb_clone handling.

I found two more netlink clients that are cloning the skbs, but they are
not in the sendmsg path. Therefore, the sole client cloning that I found
seems to be the fib frontend.

Thanks to Eric Dumazet for helping to address this issue.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  | 16 ++++++++++++++++
 net/ipv4/fib_frontend.c  |  2 +-
 net/netlink/af_netlink.c | 35 ++++++++++++++++++-----------------
 3 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'include/linux/netlink.h')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 86fde81ac2e..7a6c396a263 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -85,6 +85,22 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
 int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
 
+static inline struct sk_buff *
+netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	struct sk_buff *nskb;
+
+	nskb = skb_clone(skb, gfp_mask);
+	if (!nskb)
+		return NULL;
+
+	/* This is a large skb, set destructor callback to release head */
+	if (is_vmalloc_addr(skb->head))
+		nskb->destructor = skb->destructor;
+
+	return nskb;
+}
+
 /*
  *	skb should fit one page. This choice is good for headerless malloc.
  *	But we should limit to 8K so that userspace does not have to
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 05a4888dede..b3f627ac4ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -961,7 +961,7 @@ static void nl_fib_input(struct sk_buff *skb)
 	    nlmsg_len(nlh) < sizeof(*frn))
 		return;
 
-	skb = skb_clone(skb, GFP_KERNEL);
+	skb = netlink_skb_clone(skb, GFP_KERNEL);
 	if (skb == NULL)
 		return;
 	nlh = nlmsg_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 6967fbcca6c..0c61b59175d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -849,7 +849,10 @@ static void netlink_skb_destructor(struct sk_buff *skb)
 	}
 #endif
 	if (is_vmalloc_addr(skb->head)) {
-		vfree(skb->head);
+		if (!skb->cloned ||
+		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+			vfree(skb->head);
+
 		skb->head = NULL;
 	}
 	if (skb->sk != NULL)
@@ -1532,33 +1535,31 @@ struct sock *netlink_getsockbyfilp(struct file *filp)
 	return sock;
 }
 
-static struct sk_buff *netlink_alloc_large_skb(unsigned int size)
+static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
+					       int broadcast)
 {
 	struct sk_buff *skb;
 	void *data;
 
-	if (size <= NLMSG_GOODSIZE)
+	if (size <= NLMSG_GOODSIZE || broadcast)
 		return alloc_skb(size, GFP_KERNEL);
 
-	skb = alloc_skb_head(GFP_KERNEL);
-	if (skb == NULL)
-		return NULL;
+	size = SKB_DATA_ALIGN(size) +
+	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	data = vmalloc(size);
 	if (data == NULL)
-		goto err;
+		return NULL;
 
-	skb->head	= data;
-	skb->data	= data;
-	skb_reset_tail_pointer(skb);
-	skb->end	= skb->tail + size;
-	skb->len	= 0;
-	skb->destructor = netlink_skb_destructor;
+	skb = build_skb(data, size);
+	if (skb == NULL)
+		vfree(data);
+	else {
+		skb->head_frag = 0;
+		skb->destructor = netlink_skb_destructor;
+	}
 
 	return skb;
-err:
-	kfree_skb(skb);
-	return NULL;
 }
 
 /*
@@ -2244,7 +2245,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
 	err = -ENOBUFS;
-	skb = netlink_alloc_large_skb(len);
+	skb = netlink_alloc_large_skb(len, dst_group);
 	if (skb == NULL)
 		goto out;
 
-- 
cgit v1.2.3-70-g09d2