[NETLINK]: Synchronous message processing.

Let's recap the problem. The current asynchronous netlink kernel message processing is vulnerable to these attacks: 1) Hit and run: Attacker sends one or more messages and then exits before they're processed. This may confuse/disable the next netlink user that gets the netlink address of the attacker since it may receive the responses to the attacker's messages. Proposed solutions: a) Synchronous processing. b) Stream mode socket. c) Restrict/prohibit binding. 2) Starvation: Because various netlink rcv functions were written to not return until all messages have been processed on a socket, it is possible for these functions to execute for an arbitrarily long period of time. If this is successfully exploited it could also be used to hold rtnl forever. Proposed solutions: a) Synchronous processing. b) Stream mode socket. Firstly let's cross off solution c). It only solves the first problem and it has user-visible impacts. In particular, it'll break user space applications that expect to bind or communicate with specific netlink addresses (pid's). So we're left with a choice of synchronous processing versus SOCK_STREAM for netlink. For the moment I'm sticking with the synchronous approach as suggested by Alexey since it's simpler and I'd rather spend my time working on other things. However, it does have a number of deficiencies compared to the stream mode solution: 1) User-space to user-space netlink communication is still vulnerable. 2) Inefficient use of resources. This is especially true for rtnetlink since the lock is shared with other users such as networking drivers. The latter could hold the rtnl while communicating with hardware which causes the rtnetlink user to wait when it could be doing other things. 3) It is still possible to DoS all netlink users by flooding the kernel netlink receive queue. The attacker simply fills the receive socket with a single netlink message that fills up the entire queue. The attacker then continues to call sendmsg with the same message in a loop. Point 3) can be countered by retransmissions in user-space code, however it is pretty messy. In light of these problems (in particular, point 3), we should implement stream mode netlink at some point. In the mean time, here is a patch that implements synchronous processing. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Herbert Xu <herbert@gondor.apana.org.au> 2005-05-03 14:55:09 -0700
committer: David S. Miller <davem@davemloft.net> 2005-05-03 14:55:09 -0700
commit: 2a0a6ebee1d68552152ae8d4aeda91d806995dec (patch)
tree: a0b77861b3395b4728e75f2b2f92755e0a4777d3
parent: 96c36023434b7b6824b1da72a6b7b1ca61d7310c (diff)
7 files changed, 55 insertions, 48 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 0f84dd7af2c..ac26d4d960d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -427,7 +427,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 /* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
  * processed by audit_receive_msg.  Malformed skbs with wrong length are
  * discarded silently.  */
-static int audit_receive_skb(struct sk_buff *skb)
+static void audit_receive_skb(struct sk_buff *skb)
 {
 	int		err;
 	struct nlmsghdr	*nlh;
@@ -436,7 +436,7 @@ static int audit_receive_skb(struct sk_buff *skb)
 	while (skb->len >= NLMSG_SPACE(0)) {
 		nlh = (struct nlmsghdr *)skb->data;
 		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
-			return 0;
+			return;
 		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
 		if (rlen > skb->len)
 			rlen = skb->len;
@@ -446,23 +446,20 @@ static int audit_receive_skb(struct sk_buff *skb)
 			netlink_ack(skb, nlh, 0);
 		skb_pull(skb, rlen);
 	}
-	return 0;
 }
 
 /* Receive messages from netlink socket. */
 static void audit_receive(struct sock *sk, int length)
 {
 	struct sk_buff  *skb;
+	unsigned int qlen;
 
-	if (down_trylock(&audit_netlink_sem))
-		return;
+	down(&audit_netlink_sem);
 
-				/* FIXME: this must not cause starvation */
-	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
-		if (audit_receive_skb(skb) && skb->len)
-			skb_queue_head(&sk->sk_receive_queue, skb);
-		else
-			kfree_skb(skb);
+	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		audit_receive_skb(skb);
+		kfree_skb(skb);
 	}
 	up(&audit_netlink_sem);
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5fb70cfa108..6e1ab1e34b2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -609,26 +609,31 @@ static inline int rtnetlink_rcv_skb(struct sk_buff *skb)
 
 /*
  *  rtnetlink input queue processing routine:
- *	- try to acquire shared lock. If it is failed, defer processing.
+ *	- process as much as there was in the queue upon entry.
  *	- feed skbs to rtnetlink_rcv_skb, until it refuse a message,
- *	  that will occur, when a dump started and/or acquisition of
- *	  exclusive lock failed.
+ *	  that will occur, when a dump started.
  */
 
 static void rtnetlink_rcv(struct sock *sk, int len)
 {
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+
 	do {
 		struct sk_buff *skb;
 
-		if (rtnl_shlock_nowait())
-			return;
+		rtnl_lock();
+
+		if (qlen > skb_queue_len(&sk->sk_receive_queue))
+			qlen = skb_queue_len(&sk->sk_receive_queue);
 
-		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		while (qlen--) {
+			skb = skb_dequeue(&sk->sk_receive_queue);
 			if (rtnetlink_rcv_skb(skb)) {
-				if (skb->len)
+				if (skb->len) {
 					skb_queue_head(&sk->sk_receive_queue,
 						       skb);
-				else
+					qlen++;
+				} else
 					kfree_skb(skb);
 				break;
 			}
@@ -638,7 +643,7 @@ static void rtnetlink_rcv(struct sock *sk, int len)
 		up(&rtnl_sem);
 
 		netdev_run_todo();
-	} while (rtnl && rtnl->sk_receive_queue.qlen);
+	} while (qlen);
 }
 
 static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index f86a6259fd1..101ddef9ba9 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -119,8 +119,9 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
 static void dnrmg_receive_user_sk(struct sock *sk, int len)
 {
 	struct sk_buff *skb;
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
 
-	while((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
 		dnrmg_receive_user_skb(skb);
 		kfree_skb(skb);
 	}
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 9e40dffc204..e5746b67441 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -546,20 +546,18 @@ ipq_rcv_skb(struct sk_buff *skb)
 static void
 ipq_rcv_sk(struct sock *sk, int len)
 {
-	do {
-		struct sk_buff *skb;
+	struct sk_buff *skb;
+	unsigned int qlen;
 
-		if (down_trylock(&ipqnl_sem))
-			return;
+	down(&ipqnl_sem);
 			
-		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-			ipq_rcv_skb(skb);
-			kfree_skb(skb);
-		}
+	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		ipq_rcv_skb(skb);
+		kfree_skb(skb);
+	}
 		
-		up(&ipqnl_sem);
-
-	} while (ipqnl && ipqnl->sk_receive_queue.qlen);
+	up(&ipqnl_sem);
 }
 
 static int
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 313c1408da3..8faa8948f75 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -777,8 +777,9 @@ static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
 static void tcpdiag_rcv(struct sock *sk, int len)
 {
 	struct sk_buff *skb;
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
 
-	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+	while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) {
 		tcpdiag_rcv_skb(skb);
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index c54830b8959..750943e2d34 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -549,20 +549,18 @@ ipq_rcv_skb(struct sk_buff *skb)
 static void
 ipq_rcv_sk(struct sock *sk, int len)
 {
-	do {
-		struct sk_buff *skb;
+	struct sk_buff *skb;
+	unsigned int qlen;
 
-		if (down_trylock(&ipqnl_sem))
-			return;
+	down(&ipqnl_sem);
 			
-		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
-			ipq_rcv_skb(skb);
-			kfree_skb(skb);
-		}
+	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
+		skb = skb_dequeue(&sk->sk_receive_queue);
+		ipq_rcv_skb(skb);
+		kfree_skb(skb);
+	}
 		
-		up(&ipqnl_sem);
-
-	} while (ipqnl && ipqnl->sk_receive_queue.qlen);
+	up(&ipqnl_sem);
 }
 
 static int
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 52b5843937c..dab112f1dd8 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1008,17 +1008,24 @@ static int xfrm_user_rcv_skb(struct sk_buff *skb)
 
 static void xfrm_netlink_rcv(struct sock *sk, int len)
 {
+	unsigned int qlen = skb_queue_len(&sk->sk_receive_queue);
+
 	do {
 		struct sk_buff *skb;
 
 		down(&xfrm_cfg_sem);
 
-		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		if (qlen > skb_queue_len(&sk->sk_receive_queue))
+			qlen = skb_queue_len(&sk->sk_receive_queue);
+
+		while (qlen--) {
+			skb = skb_dequeue(&sk->sk_receive_queue);
 			if (xfrm_user_rcv_skb(skb)) {
-				if (skb->len)
+				if (skb->len) {
 					skb_queue_head(&sk->sk_receive_queue,
 						       skb);
-				else
+					qlen++;
+				} else
 					kfree_skb(skb);
 				break;
 			}
@@ -1027,7 +1034,7 @@ static void xfrm_netlink_rcv(struct sock *sk, int len)
 
 		up(&xfrm_cfg_sem);
 
-	} while (xfrm_nl && xfrm_nl->sk_receive_queue.qlen);
+	} while (qlen);
 }
 
 static int build_expire(struct sk_buff *skb, struct xfrm_state *x, int hard)
author	Herbert Xu <herbert@gondor.apana.org.au>	2005-05-03 14:55:09 -0700
committer	David S. Miller <davem@davemloft.net>	2005-05-03 14:55:09 -0700
commit	2a0a6ebee1d68552152ae8d4aeda91d806995dec (patch)
tree	a0b77861b3395b4728e75f2b2f92755e0a4777d3
parent	96c36023434b7b6824b1da72a6b7b1ca61d7310c (diff)