From 892c141e62982272b9c738b5520ad0e5e1ad7b42 Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Fri, 4 Aug 2006 23:08:56 -0700
Subject: [MLSXFRM]: Add security sid to sock

This adds security for IP sockets at the sock level. Security at the
sock level is needed to enforce the SELinux security policy for
security associations even when a sock is orphaned (such as in the TCP
LAST_ACK state).

This will also be used to enforce SELinux controls over data arriving
at or leaving a child socket while it's still waiting to be accepted.

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h | 12 ++++++++++++
 include/net/sock.h       | 13 +++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 6bc2aad494f..4d7fb59996b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -812,6 +812,8 @@ struct swap_info_struct;
  *      which is used to copy security attributes between local stream sockets.
  * @sk_free_security:
  *	Deallocate security structure.
+ * @sk_clone_security:
+ *	Clone/copy security structure.
  * @sk_getsid:
  *	Retrieve the LSM-specific sid for the sock to enable caching of network
  *	authorizations.
@@ -1332,6 +1334,7 @@ struct security_operations {
 	int (*socket_getpeersec_dgram) (struct socket *sock, struct sk_buff *skb, u32 *secid);
 	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
 	void (*sk_free_security) (struct sock *sk);
+	void (*sk_clone_security) (const struct sock *sk, struct sock *newsk);
 	unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
 #endif	/* CONFIG_SECURITY_NETWORK */
 
@@ -2885,6 +2888,11 @@ static inline void security_sk_free(struct sock *sk)
 	return security_ops->sk_free_security(sk);
 }
 
+static inline void security_sk_clone(const struct sock *sk, struct sock *newsk)
+{
+	return security_ops->sk_clone_security(sk, newsk);
+}
+
 static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
 {
 	return security_ops->sk_getsid(sk, fl, dir);
@@ -3011,6 +3019,10 @@ static inline void security_sk_free(struct sock *sk)
 {
 }
 
+static inline void security_sk_clone(const struct sock *sk, struct sock *newsk)
+{
+}
+
 static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
 {
 	return 0;
diff --git a/include/net/sock.h b/include/net/sock.h
index 324b3ea233d..91cdceb3c02 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -972,6 +972,19 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
+static inline void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+	void *sptr = nsk->sk_security;
+#endif
+
+	memcpy(nsk, osk, osk->sk_prot->obj_size);
+#ifdef CONFIG_SECURITY_NETWORK
+	nsk->sk_security = sptr;
+	security_sk_clone(osk, nsk);
+#endif
+}
+
 extern int sock_i_uid(struct sock *sk);
 extern unsigned long sock_i_ino(struct sock *sk);
 
-- 
cgit v1.2.3-70-g09d2


From b6340fcd761acf9249b3acbc95c4dc555d9beb07 Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Mon, 24 Jul 2006 23:28:37 -0700
Subject: [MLSXFRM]: Add security sid to flowi

This adds security to flow key for labeling of flows as also to allow
for making flow cache lookups based on the security label seemless.

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/secid.txt | 14 ++++++++++++++
 include/net/flow.h                 |  1 +
 2 files changed, 15 insertions(+)
 create mode 100644 Documentation/networking/secid.txt

(limited to 'include')

diff --git a/Documentation/networking/secid.txt b/Documentation/networking/secid.txt
new file mode 100644
index 00000000000..95ea0678433
--- /dev/null
+++ b/Documentation/networking/secid.txt
@@ -0,0 +1,14 @@
+flowi structure:
+
+The secid member in the flow structure is used in LSMs (e.g. SELinux) to indicate
+the label of the flow. This label of the flow is currently used in selecting
+matching labeled xfrm(s).
+
+If this is an outbound flow, the label is derived from the socket, if any, or
+the incoming packet this flow is being generated as a response to (e.g. tcp
+resets, timewait ack, etc.). It is also conceivable that the label could be
+derived from other sources such as process context, device, etc., in special
+cases, as may be appropriate.
+
+If this is an inbound flow, the label is derived from the IPSec security
+associations, if any, used by the packet.
diff --git a/include/net/flow.h b/include/net/flow.h
index 04d89f76345..1cee5a83433 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -78,6 +78,7 @@ struct flowi {
 #define fl_icmp_type	uli_u.icmpt.type
 #define fl_icmp_code	uli_u.icmpt.code
 #define fl_ipsec_spi	uli_u.spi
+	__u32           secid;	/* used by xfrm; see secid.txt */
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 #define FLOW_DIR_IN	0
-- 
cgit v1.2.3-70-g09d2


From e0d1caa7b0d5f02e4f34aa09c695d04251310c6c Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Mon, 24 Jul 2006 23:29:07 -0700
Subject: [MLSXFRM]: Flow based matching of xfrm policy and state

This implements a seemless mechanism for xfrm policy selection and
state matching based on the flow sid. This also includes the necessary
SELinux enforcement pieces.

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h        | 106 +++++++++++++++++----
 include/net/flow.h              |   4 +-
 net/core/flow.c                 |   7 +-
 net/xfrm/xfrm_policy.c          |  28 +++---
 net/xfrm/xfrm_state.c           |  12 ++-
 security/dummy.c                |  23 ++++-
 security/selinux/hooks.c        |   7 +-
 security/selinux/include/xfrm.h |  23 +++--
 security/selinux/xfrm.c         | 199 +++++++++++++++++++++++++++++++++-------
 9 files changed, 329 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 4d7fb59996b..2c4921d79d1 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,6 +31,7 @@
 #include <linux/msg.h>
 #include <linux/sched.h>
 #include <linux/key.h>
+#include <linux/xfrm.h>
 
 struct ctl_table;
 
@@ -825,9 +826,8 @@ struct swap_info_struct;
  *	used by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level policy update program (e.g., setkey).
- *	Allocate a security structure to the xp->security field.
- *	The security field is initialized to NULL when the xfrm_policy is
- *	allocated.
+ *	Allocate a security structure to the xp->security field; the security
+ *	field is initialized to NULL when the xfrm_policy is allocated.
  *	Return 0 if operation was successful (memory to allocate, legal context)
  * @xfrm_policy_clone_security:
  *	@old contains an existing xfrm_policy in the SPD.
@@ -846,9 +846,14 @@ struct swap_info_struct;
  *	Database by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level SA generation program (e.g., setkey or racoon).
- *	Allocate a security structure to the x->security field.  The
- *	security field is initialized to NULL when the xfrm_state is
- *	allocated.
+ *	@polsec contains the security context information associated with a xfrm
+ *	policy rule from which to take the base context. polsec must be NULL
+ *	when sec_ctx is specified.
+ *	@secid contains the secid from which to take the mls portion of the context.
+ *	Allocate a security structure to the x->security field; the security
+ *	field is initialized to NULL when the xfrm_state is allocated. Set the
+ *	context to correspond to either sec_ctx or polsec, with the mls portion
+ *	taken from secid in the latter case.
  *	Return 0 if operation was successful (memory to allocate, legal context).
  * @xfrm_state_free_security:
  *	@x contains the xfrm_state.
@@ -859,13 +864,26 @@ struct swap_info_struct;
  * @xfrm_policy_lookup:
  *	@xp contains the xfrm_policy for which the access control is being
  *	checked.
- *	@sk_sid contains the sock security label that is used to authorize
+ *	@fl_secid contains the flow security label that is used to authorize
  *	access to the policy xp.
  *	@dir contains the direction of the flow (input or output).
- *	Check permission when a sock selects a xfrm_policy for processing
+ *	Check permission when a flow selects a xfrm_policy for processing
  *	XFRMs on a packet.  The hook is called when selecting either a
  *	per-socket policy or a generic xfrm policy.
  *	Return 0 if permission is granted.
+ * @xfrm_state_pol_flow_match:
+ *	@x contains the state to match.
+ *	@xp contains the policy to check for a match.
+ *	@fl contains the flow to check for a match.
+ *	Return 1 if there is a match.
+ * @xfrm_flow_state_match:
+ *	@fl contains the flow key to match.
+ *	@xfrm points to the xfrm_state to match.
+ *	Return 1 if there is a match.
+ * @xfrm_decode_session:
+ *	@skb points to skb to decode.
+ *	@fl points to the flow key to set.
+ *	Return 0 if successful decoding.
  *
  * Security hooks affecting all Key Management operations
  *
@@ -1343,10 +1361,16 @@ struct security_operations {
 	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
 	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
 	int (*xfrm_policy_delete_security) (struct xfrm_policy *xp);
-	int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+	int (*xfrm_state_alloc_security) (struct xfrm_state *x,
+		struct xfrm_user_sec_ctx *sec_ctx, struct xfrm_sec_ctx *polsec,
+		u32 secid);
 	void (*xfrm_state_free_security) (struct xfrm_state *x);
 	int (*xfrm_state_delete_security) (struct xfrm_state *x);
-	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+	int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 fl_secid, u8 dir);
+	int (*xfrm_state_pol_flow_match)(struct xfrm_state *x,
+			struct xfrm_policy *xp, struct flowi *fl);
+	int (*xfrm_flow_state_match)(struct flowi *fl, struct xfrm_state *xfrm);
+	int (*xfrm_decode_session)(struct sk_buff *skb, struct flowi *fl);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
 	/* key management security hooks */
@@ -3050,9 +3074,18 @@ static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
 	return security_ops->xfrm_policy_delete_security(xp);
 }
 
-static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+static inline int security_xfrm_state_alloc(struct xfrm_state *x,
+			struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return security_ops->xfrm_state_alloc_security(x, sec_ctx, NULL, 0);
+}
+
+static inline int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
+				struct xfrm_sec_ctx *polsec, u32 secid)
 {
-	return security_ops->xfrm_state_alloc_security(x, sec_ctx);
+	if (!polsec)
+		return 0;
+	return security_ops->xfrm_state_alloc_security(x, NULL, polsec, secid);
 }
 
 static inline int security_xfrm_state_delete(struct xfrm_state *x)
@@ -3065,9 +3098,25 @@ static inline void security_xfrm_state_free(struct xfrm_state *x)
 	security_ops->xfrm_state_free_security(x);
 }
 
-static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir)
+{
+	return security_ops->xfrm_policy_lookup(xp, fl_secid, dir);
+}
+
+static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
+			struct xfrm_policy *xp, struct flowi *fl)
+{
+	return security_ops->xfrm_state_pol_flow_match(x, xp, fl);
+}
+
+static inline int security_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm)
+{
+	return security_ops->xfrm_flow_state_match(fl, xfrm);
+}
+
+static inline int security_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
 {
-	return security_ops->xfrm_policy_lookup(xp, sk_sid, dir);
+	return security_ops->xfrm_decode_session(skb, fl);
 }
 #else	/* CONFIG_SECURITY_NETWORK_XFRM */
 static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
@@ -3089,7 +3138,14 @@ static inline int security_xfrm_policy_delete(struct xfrm_policy *xp)
 	return 0;
 }
 
-static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+static inline int security_xfrm_state_alloc(struct xfrm_state *x,
+					struct xfrm_user_sec_ctx *sec_ctx)
+{
+	return 0;
+}
+
+static inline int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
+					struct xfrm_sec_ctx *polsec, u32 secid)
 {
 	return 0;
 }
@@ -3103,10 +3159,28 @@ static inline int security_xfrm_state_delete(struct xfrm_state *x)
 	return 0;
 }
 
-static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir)
 {
 	return 0;
 }
+
+static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
+			struct xfrm_policy *xp, struct flowi *fl)
+{
+	return 1;
+}
+
+static inline int security_xfrm_flow_state_match(struct flowi *fl,
+                                struct xfrm_state *xfrm)
+{
+	return 1;
+}
+
+static inline int security_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
 #ifdef CONFIG_KEYS
diff --git a/include/net/flow.h b/include/net/flow.h
index 1cee5a83433..21d988b2058 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,10 +86,10 @@ struct flowi {
 #define FLOW_DIR_FWD	2
 
 struct sock;
-typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 	 		       flow_resolve_t resolver);
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
diff --git a/net/core/flow.c b/net/core/flow.c
index 2191af5f26a..645241165e6 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -32,7 +32,6 @@ struct flow_cache_entry {
 	u8			dir;
 	struct flowi		key;
 	u32			genid;
-	u32			sk_sid;
 	void			*object;
 	atomic_t		*object_ref;
 };
@@ -165,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
+void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir,
 			flow_resolve_t resolver)
 {
 	struct flow_cache_entry *fle, **head;
@@ -189,7 +188,6 @@ void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    fle->sk_sid == sk_sid &&
 		    flow_key_compare(key, &fle->key) == 0) {
 			if (fle->genid == atomic_read(&flow_cache_genid)) {
 				void *ret = fle->object;
@@ -214,7 +212,6 @@ void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
-			fle->sk_sid = sk_sid;
 			memcpy(&fle->key, key, sizeof(*key));
 			fle->object = NULL;
 			flow_count(cpu)++;
@@ -226,7 +223,7 @@ nocache:
 		void *obj;
 		atomic_t *obj_ref;
 
-		resolver(key, sk_sid, family, dir, &obj, &obj_ref);
+		resolver(key, family, dir, &obj, &obj_ref);
 
 		if (fle) {
 			fle->genid = atomic_read(&flow_cache_genid);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 3da67ca2c3c..79405daadc5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -597,7 +597,7 @@ EXPORT_SYMBOL(xfrm_policy_walk);
 
 /* Find policy to apply to this flow. */
 
-static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
+static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 			       void **objp, atomic_t **obj_refp)
 {
 	struct xfrm_policy *pol;
@@ -613,7 +613,7 @@ static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir,
 		match = xfrm_selector_match(sel, fl, family);
 
 		if (match) {
- 			if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) {
+ 			if (!security_xfrm_policy_lookup(pol, fl->secid, dir)) {
 				xfrm_pol_hold(pol);
 				break;
 			}
@@ -641,7 +641,7 @@ static inline int policy_to_flow_dir(int dir)
 	};
 }
 
-static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid)
+static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl)
 {
 	struct xfrm_policy *pol;
 
@@ -652,7 +652,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc
  		int err = 0;
 
 		if (match)
-		  err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir));
+		  err = security_xfrm_policy_lookup(pol, fl->secid, policy_to_flow_dir(dir));
 
  		if (match && !err)
 			xfrm_pol_hold(pol);
@@ -862,19 +862,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 	u32 genid;
 	u16 family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
-	u32 sk_sid = security_sk_sid(sk, fl, dir);
+
+	fl->secid = security_sk_sid(sk, fl, dir);
 restart:
 	genid = atomic_read(&flow_cache_genid);
 	policy = NULL;
 	if (sk && sk->sk_policy[1])
-		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid);
+		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
 
 	if (!policy) {
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
 			return 0;
 
-		policy = flow_cache_lookup(fl, sk_sid, dst_orig->ops->family,
+		policy = flow_cache_lookup(fl, dst_orig->ops->family,
 					   dir, xfrm_policy_lookup);
 	}
 
@@ -1032,13 +1033,15 @@ int
 xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+	int err;
 
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 
 	afinfo->decode_session(skb, fl);
+	err = security_xfrm_decode_session(skb, fl);
 	xfrm_policy_put_afinfo(afinfo);
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL(xfrm_decode_session);
 
@@ -1058,14 +1061,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	struct xfrm_policy *pol;
 	struct flowi fl;
 	u8 fl_dir = policy_to_flow_dir(dir);
-	u32 sk_sid;
 
 	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 	nf_nat_decode_session(skb, &fl, family);
 
-	sk_sid = security_sk_sid(sk, &fl, fl_dir);
-
 	/* First, check used SA against their selectors. */
 	if (skb->sp) {
 		int i;
@@ -1079,10 +1079,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	pol = NULL;
 	if (sk && sk->sk_policy[dir])
-		pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid);
+		pol = xfrm_sk_policy_lookup(sk, dir, &fl);
 
 	if (!pol)
-		pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir,
+		pol = flow_cache_lookup(&fl, family, fl_dir,
 					xfrm_policy_lookup);
 
 	if (!pol)
@@ -1298,6 +1298,8 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 
 		if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family))
 			return 0;
+		if (fl && !security_xfrm_flow_state_match(fl, dst->xfrm))
+			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 0021aad5db4..be02bd981d1 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -367,7 +367,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			 */
 			if (x->km.state == XFRM_STATE_VALID) {
 				if (!xfrm_selector_match(&x->sel, fl, family) ||
-				    !xfrm_sec_ctx_match(pol->security, x->security))
+				    !security_xfrm_state_pol_flow_match(x, pol, fl))
 					continue;
 				if (!best ||
 				    best->km.dying > x->km.dying ||
@@ -379,7 +379,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			} else if (x->km.state == XFRM_STATE_ERROR ||
 				   x->km.state == XFRM_STATE_EXPIRED) {
  				if (xfrm_selector_match(&x->sel, fl, family) &&
-				    xfrm_sec_ctx_match(pol->security, x->security))
+				    security_xfrm_state_pol_flow_match(x, pol, fl))
 					error = -ESRCH;
 			}
 		}
@@ -403,6 +403,14 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 		 * to current session. */
 		xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family);
 
+		error = security_xfrm_state_alloc_acquire(x, pol->security, fl->secid);
+		if (error) {
+			x->km.state = XFRM_STATE_DEAD;
+			xfrm_state_put(x);
+			x = NULL;
+			goto out;
+		}
+
 		if (km_query(x, tmpl, pol) == 0) {
 			x->km.state = XFRM_STATE_ACQ;
 			list_add_tail(&x->bydst, xfrm_state_bydst+h);
diff --git a/security/dummy.c b/security/dummy.c
index bd3bc5faa9a..c1f10654871 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -835,7 +835,8 @@ static int dummy_xfrm_policy_delete_security(struct xfrm_policy *xp)
 	return 0;
 }
 
-static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx)
+static int dummy_xfrm_state_alloc_security(struct xfrm_state *x,
+	struct xfrm_user_sec_ctx *sec_ctx, struct xfrm_sec_ctx *pol, u32 secid)
 {
 	return 0;
 }
@@ -853,6 +854,23 @@ static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 {
 	return 0;
 }
+
+static int dummy_xfrm_state_pol_flow_match(struct xfrm_state *x,
+				struct xfrm_policy *xp, struct flowi *fl)
+{
+	return 1;
+}
+
+static int dummy_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm)
+{
+	return 1;
+}
+
+static int dummy_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	return 0;
+}
+
 #endif /* CONFIG_SECURITY_NETWORK_XFRM */
 static int dummy_register_security (const char *name, struct security_operations *ops)
 {
@@ -1076,6 +1094,9 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, xfrm_state_free_security);
 	set_to_dummy_if_null(ops, xfrm_state_delete_security);
 	set_to_dummy_if_null(ops, xfrm_policy_lookup);
+	set_to_dummy_if_null(ops, xfrm_state_pol_flow_match);
+	set_to_dummy_if_null(ops, xfrm_flow_state_match);
+	set_to_dummy_if_null(ops, xfrm_decode_session);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 #ifdef CONFIG_KEYS
 	set_to_dummy_if_null(ops, key_alloc);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d67abf77584..5c189da07bc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3468,7 +3468,7 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
-	err = selinux_xfrm_sock_rcv_skb(sock_sid, skb);
+	err = selinux_xfrm_sock_rcv_skb(sock_sid, skb, &ad);
 out:	
 	return err;
 }
@@ -3720,7 +3720,7 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
 	if (err)
 		goto out;
 
-	err = selinux_xfrm_postroute_last(isec->sid, skb);
+	err = selinux_xfrm_postroute_last(isec->sid, skb, &ad);
 out:
 	return err ? NF_DROP : NF_ACCEPT;
 }
@@ -4633,6 +4633,9 @@ static struct security_operations selinux_ops = {
 	.xfrm_state_free_security =	selinux_xfrm_state_free,
 	.xfrm_state_delete_security =	selinux_xfrm_state_delete,
 	.xfrm_policy_lookup = 		selinux_xfrm_policy_lookup,
+	.xfrm_state_pol_flow_match =	selinux_xfrm_state_pol_flow_match,
+	.xfrm_flow_state_match =	selinux_xfrm_flow_state_match,
+	.xfrm_decode_session =		selinux_xfrm_decode_session,
 #endif
 
 #ifdef CONFIG_KEYS
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index c96498a10eb..f51a3e84bd9 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -2,6 +2,7 @@
  * SELinux support for the XFRM LSM hooks
  *
  * Author : Trent Jaeger, <jaegert@us.ibm.com>
+ * Updated : Venkat Yekkirala, <vyekkirala@TrustedCS.com>
  */
 #ifndef _SELINUX_XFRM_H_
 #define _SELINUX_XFRM_H_
@@ -10,10 +11,16 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *
 int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
 void selinux_xfrm_policy_free(struct xfrm_policy *xp);
 int selinux_xfrm_policy_delete(struct xfrm_policy *xp);
-int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx);
+int selinux_xfrm_state_alloc(struct xfrm_state *x,
+	struct xfrm_user_sec_ctx *sec_ctx, struct xfrm_sec_ctx *pol, u32 secid);
 void selinux_xfrm_state_free(struct xfrm_state *x);
 int selinux_xfrm_state_delete(struct xfrm_state *x);
-int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir);
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir);
+int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
+			struct xfrm_policy *xp, struct flowi *fl);
+int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm);
+int selinux_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl);
+
 
 /*
  * Extract the security blob from the sock (it's actually on the socket)
@@ -39,17 +46,21 @@ static inline u32 selinux_no_sk_sid(struct flowi *fl)
 }
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
-int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb);
-int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb);
+int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
+			struct avc_audit_data *ad);
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
+			struct avc_audit_data *ad);
 u32 selinux_socket_getpeer_stream(struct sock *sk);
 u32 selinux_socket_getpeer_dgram(struct sk_buff *skb);
 #else
-static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
+			struct avc_audit_data *ad)
 {
 	return 0;
 }
 
-static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
+			struct avc_audit_data *ad)
 {
 	return 0;
 }
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index 6c985ced810..a502b0540e3 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -6,7 +6,12 @@
  *  Authors:  Serge Hallyn <sergeh@us.ibm.com>
  *	      Trent Jaeger <jaegert@us.ibm.com>
  *
+ *  Updated: Venkat Yekkirala <vyekkirala@TrustedCS.com>
+ *
+ *           Granular IPSec Associations for use in MLS environments.
+ *
  *  Copyright (C) 2005 International Business Machines Corporation
+ *  Copyright (C) 2006 Trusted Computer Solutions, Inc.
  *
  *	This program is free software; you can redistribute it and/or modify
  *	it under the terms of the GNU General Public License version 2,
@@ -67,10 +72,10 @@ static inline int selinux_authorizable_xfrm(struct xfrm_state *x)
 }
 
 /*
- * LSM hook implementation that authorizes that a socket can be used
- * with the corresponding xfrm_sec_ctx and direction.
+ * LSM hook implementation that authorizes that a flow can use
+ * a xfrm policy rule.
  */
-int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
+int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir)
 {
 	int rc = 0;
 	u32 sel_sid = SECINITSID_UNLABELED;
@@ -84,27 +89,129 @@ int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir)
 		sel_sid = ctx->ctx_sid;
 	}
 
-	rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION,
-			  ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM :
-			   ((dir == FLOW_DIR_OUT) ?  ASSOCIATION__SENDTO :
-			    (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))),
+	rc = avc_has_perm(fl_secid, sel_sid, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__POLMATCH,
 			  NULL);
 
 	return rc;
 }
 
+/*
+ * LSM hook implementation that authorizes that a state matches
+ * the given policy, flow combo.
+ */
+
+int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp,
+			struct flowi *fl)
+{
+	u32 state_sid;
+	u32 pol_sid;
+	int err;
+
+	if (x->security)
+		state_sid = x->security->ctx_sid;
+	else
+		state_sid = SECINITSID_UNLABELED;
+
+	if (xp->security)
+		pol_sid = xp->security->ctx_sid;
+	else
+		pol_sid = SECINITSID_UNLABELED;
+
+	err = avc_has_perm(state_sid, pol_sid, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__POLMATCH,
+			  NULL);
+
+	if (err)
+		return 0;
+
+	return selinux_xfrm_flow_state_match(fl, x);
+}
+
+/*
+ * LSM hook implementation that authorizes that a particular outgoing flow
+ * can use a given security association.
+ */
+
+int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm)
+{
+	int rc = 0;
+	u32 sel_sid = SECINITSID_UNLABELED;
+	struct xfrm_sec_ctx *ctx;
+
+	/* Context sid is either set to label or ANY_ASSOC */
+	if ((ctx = xfrm->security)) {
+		if (!selinux_authorizable_ctx(ctx))
+			return 0;
+
+		sel_sid = ctx->ctx_sid;
+	}
+
+	rc = avc_has_perm(fl->secid, sel_sid, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__SENDTO,
+			  NULL)? 0:1;
+
+	return rc;
+}
+
+/*
+ * LSM hook implementation that determines the sid for the session.
+ */
+
+int selinux_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	struct sec_path *sp;
+
+	fl->secid = SECSID_NULL;
+
+	if (skb == NULL)
+		return 0;
+
+	sp = skb->sp;
+	if (sp) {
+		int i, sid_set = 0;
+
+		for (i = sp->len-1; i >= 0; i--) {
+			struct xfrm_state *x = sp->xvec[i];
+			if (selinux_authorizable_xfrm(x)) {
+				struct xfrm_sec_ctx *ctx = x->security;
+
+				if (!sid_set) {
+					fl->secid = ctx->ctx_sid;
+					sid_set = 1;
+				}
+				else if (fl->secid != ctx->ctx_sid)
+					return -EINVAL;
+			}
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Security blob allocation for xfrm_policy and xfrm_state
  * CTX does not have a meaningful value on input
  */
-static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx)
+static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
+	struct xfrm_user_sec_ctx *uctx, struct xfrm_sec_ctx *pol, u32 sid)
 {
 	int rc = 0;
 	struct task_security_struct *tsec = current->security;
-	struct xfrm_sec_ctx *ctx;
+	struct xfrm_sec_ctx *ctx = NULL;
+	char *ctx_str = NULL;
+	u32 str_len;
+	u32 ctx_sid;
+
+	BUG_ON(uctx && pol);
+
+	if (pol)
+		goto from_policy;
 
 	BUG_ON(!uctx);
-	BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX);
+
+	if (uctx->ctx_doi != XFRM_SC_ALG_SELINUX)
+		return -EINVAL;
 
 	if (uctx->ctx_len >= PAGE_SIZE)
 		return -ENOMEM;
@@ -141,9 +248,41 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_us
 
 	return rc;
 
+from_policy:
+	BUG_ON(!pol);
+	rc = security_sid_mls_copy(pol->ctx_sid, sid, &ctx_sid);
+	if (rc)
+		goto out;
+
+	rc = security_sid_to_context(ctx_sid, &ctx_str, &str_len);
+	if (rc)
+		goto out;
+
+	*ctxp = ctx = kmalloc(sizeof(*ctx) +
+			      str_len,
+			      GFP_ATOMIC);
+
+	if (!ctx) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+
+	ctx->ctx_doi = XFRM_SC_DOI_LSM;
+	ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
+	ctx->ctx_sid = ctx_sid;
+	ctx->ctx_len = str_len;
+	memcpy(ctx->ctx_str,
+	       ctx_str,
+	       str_len);
+
+	goto out2;
+
 out:
 	*ctxp = NULL;
 	kfree(ctx);
+out2:
+	kfree(ctx_str);
 	return rc;
 }
 
@@ -157,7 +296,7 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *
 
 	BUG_ON(!xp);
 
-	err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx);
+	err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, NULL, 0);
 	return err;
 }
 
@@ -217,13 +356,14 @@ int selinux_xfrm_policy_delete(struct xfrm_policy *xp)
  * LSM hook implementation that allocs and transfers sec_ctx spec to
  * xfrm_state.
  */
-int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx)
+int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx,
+		struct xfrm_sec_ctx *pol, u32 secid)
 {
 	int err;
 
 	BUG_ON(!x);
 
-	err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx);
+	err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx, pol, secid);
 	return err;
 }
 
@@ -329,38 +469,30 @@ int selinux_xfrm_state_delete(struct xfrm_state *x)
  * we need to check for unlabelled access since this may not have
  * gone thru the IPSec process.
  */
-int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb)
+int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb,
+				struct avc_audit_data *ad)
 {
 	int i, rc = 0;
 	struct sec_path *sp;
+	u32 sel_sid = SECINITSID_UNLABELED;
 
 	sp = skb->sp;
 
 	if (sp) {
-		/*
-		 * __xfrm_policy_check does not approve unless xfrm_policy_ok
-		 * says that spi's match for policy and the socket.
-		 *
-		 *  Only need to verify the existence of an authorizable sp.
-		 */
 		for (i = 0; i < sp->len; i++) {
 			struct xfrm_state *x = sp->xvec[i];
 
-			if (x && selinux_authorizable_xfrm(x))
-				goto accept;
+			if (x && selinux_authorizable_xfrm(x)) {
+				struct xfrm_sec_ctx *ctx = x->security;
+				sel_sid = ctx->ctx_sid;
+				break;
+			}
 		}
 	}
 
-	/* check SELinux sock for unlabelled access */
-	rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
-			  ASSOCIATION__RECVFROM, NULL);
-	if (rc)
-		goto drop;
-
-accept:
-	return 0;
+	rc = avc_has_perm(isec_sid, sel_sid, SECCLASS_ASSOCIATION,
+			  ASSOCIATION__RECVFROM, ad);
 
-drop:
 	return rc;
 }
 
@@ -371,7 +503,8 @@ drop:
  * If we do have a authorizable security association, then it has already been
  * checked in xfrm_policy_lookup hook.
  */
-int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
+int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb,
+					struct avc_audit_data *ad)
 {
 	struct dst_entry *dst;
 	int rc = 0;
@@ -391,7 +524,7 @@ int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb)
 	}
 
 	rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION,
-			  ASSOCIATION__SENDTO, NULL);
+			  ASSOCIATION__SENDTO, ad);
 out:
 	return rc;
 }
-- 
cgit v1.2.3-70-g09d2


From beb8d13bed80f8388f1a9a107d07ddd342e627e8 Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Fri, 4 Aug 2006 23:12:42 -0700
Subject: [MLSXFRM]: Add flow labeling

This labels the flows that could utilize IPSec xfrms at the points the
flows are defined so that IPSec policy and SAs at the right label can
be used.

The following protos are currently not handled, but they should
continue to be able to use single-labeled IPSec like they currently
do.

ipmr
ip_gre
ipip
igmp
sit
sctp
ip6_tunnel (IPv6 over IPv6 tunnel device)
decnet

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h         | 38 +++++++++++++++++++++++++-------------
 include/net/route.h              |  3 +++
 net/dccp/ipv4.c                  |  1 +
 net/dccp/ipv6.c                  |  6 ++++++
 net/ipv4/af_inet.c               |  1 +
 net/ipv4/icmp.c                  |  2 ++
 net/ipv4/inet_connection_sock.c  |  1 +
 net/ipv4/ip_output.c             |  2 ++
 net/ipv4/netfilter/ipt_REJECT.c  |  1 +
 net/ipv4/raw.c                   |  1 +
 net/ipv4/syncookies.c            |  1 +
 net/ipv4/udp.c                   |  1 +
 net/ipv6/af_inet6.c              |  1 +
 net/ipv6/datagram.c              |  2 ++
 net/ipv6/icmp.c                  |  2 ++
 net/ipv6/inet6_connection_sock.c |  1 +
 net/ipv6/ndisc.c                 |  1 +
 net/ipv6/netfilter/ip6t_REJECT.c |  1 +
 net/ipv6/raw.c                   |  1 +
 net/ipv6/tcp_ipv6.c              |  7 +++++++
 net/ipv6/udp.c                   |  2 ++
 net/xfrm/xfrm_policy.c           |  3 +--
 security/dummy.c                 |  7 +++----
 security/selinux/hooks.c         |  8 ++++----
 security/selinux/include/xfrm.h  | 14 +-------------
 security/selinux/xfrm.c          | 11 +++++++----
 26 files changed, 79 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 2c4921d79d1..f3909d189fe 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/key.h>
 #include <linux/xfrm.h>
+#include <net/flow.h>
 
 struct ctl_table;
 
@@ -815,8 +816,8 @@ struct swap_info_struct;
  *	Deallocate security structure.
  * @sk_clone_security:
  *	Clone/copy security structure.
- * @sk_getsid:
- *	Retrieve the LSM-specific sid for the sock to enable caching of network
+ * @sk_getsecid:
+ *	Retrieve the LSM-specific secid for the sock to enable caching of network
  *	authorizations.
  *
  * Security hooks for XFRM operations.
@@ -882,8 +883,9 @@ struct swap_info_struct;
  *	Return 1 if there is a match.
  * @xfrm_decode_session:
  *	@skb points to skb to decode.
- *	@fl points to the flow key to set.
- *	Return 0 if successful decoding.
+ *	@secid points to the flow key secid to set.
+ *	@ckall says if all xfrms used should be checked for same secid.
+ *	Return 0 if ckall is zero or all xfrms used have the same secid.
  *
  * Security hooks affecting all Key Management operations
  *
@@ -1353,7 +1355,7 @@ struct security_operations {
 	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
 	void (*sk_free_security) (struct sock *sk);
 	void (*sk_clone_security) (const struct sock *sk, struct sock *newsk);
-	unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir);
+	void (*sk_getsecid) (struct sock *sk, u32 *secid);
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
@@ -1370,7 +1372,7 @@ struct security_operations {
 	int (*xfrm_state_pol_flow_match)(struct xfrm_state *x,
 			struct xfrm_policy *xp, struct flowi *fl);
 	int (*xfrm_flow_state_match)(struct flowi *fl, struct xfrm_state *xfrm);
-	int (*xfrm_decode_session)(struct sk_buff *skb, struct flowi *fl);
+	int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall);
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
 	/* key management security hooks */
@@ -2917,9 +2919,9 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk)
 	return security_ops->sk_clone_security(sk, newsk);
 }
 
-static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl)
 {
-	return security_ops->sk_getsid(sk, fl, dir);
+	security_ops->sk_getsecid(sk, &fl->secid);
 }
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct socket * sock,
@@ -3047,9 +3049,8 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk)
 {
 }
 
-static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir)
+static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl)
 {
-	return 0;
 }
 #endif	/* CONFIG_SECURITY_NETWORK */
 
@@ -3114,9 +3115,16 @@ static inline int security_xfrm_flow_state_match(struct flowi *fl, struct xfrm_s
 	return security_ops->xfrm_flow_state_match(fl, xfrm);
 }
 
-static inline int security_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+static inline int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
+{
+	return security_ops->xfrm_decode_session(skb, secid, 1);
+}
+
+static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl)
 {
-	return security_ops->xfrm_decode_session(skb, fl);
+	int rc = security_ops->xfrm_decode_session(skb, &fl->secid, 0);
+
+	BUG_ON(rc);
 }
 #else	/* CONFIG_SECURITY_NETWORK_XFRM */
 static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
@@ -3176,11 +3184,15 @@ static inline int security_xfrm_flow_state_match(struct flowi *fl,
 	return 1;
 }
 
-static inline int security_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+static inline int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
 {
 	return 0;
 }
 
+static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl)
+{
+}
+
 #endif	/* CONFIG_SECURITY_NETWORK_XFRM */
 
 #ifdef CONFIG_KEYS
diff --git a/include/net/route.h b/include/net/route.h
index c4a068692dc..7f93ac0e089 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -32,6 +32,7 @@
 #include <linux/route.h>
 #include <linux/ip.h>
 #include <linux/cache.h>
+#include <linux/security.h>
 
 #ifndef __KERNEL__
 #warning This file is not supposed to be used outside of kernel.
@@ -166,6 +167,7 @@ static inline int ip_route_connect(struct rtable **rp, u32 dst,
 		ip_rt_put(*rp);
 		*rp = NULL;
 	}
+	security_sk_classify_flow(sk, &fl);
 	return ip_route_output_flow(rp, &fl, sk, 0);
 }
 
@@ -182,6 +184,7 @@ static inline int ip_route_newports(struct rtable **rp, u8 protocol,
 		fl.proto = protocol;
 		ip_rt_put(*rp);
 		*rp = NULL;
+		security_sk_classify_flow(sk, &fl);
 		return ip_route_output_flow(rp, &fl, sk, 0);
 	}
 	return 0;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 7f56f7e8f57..386498053b1 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -678,6 +678,7 @@ static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
 			   	     }
 			  };
 
+	security_skb_classify_flow(skb, &fl);
 	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 		return NULL;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 610c722ac27..53d255c0143 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -201,6 +201,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl.oif = sk->sk_bound_dev_if;
 	fl.fl_ip_dport = usin->sin6_port;
 	fl.fl_ip_sport = inet->sport;
+	security_sk_classify_flow(sk, &fl);
 
 	if (np->opt != NULL && np->opt->srcrt != NULL) {
 		const struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
@@ -322,6 +323,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			fl.oif = sk->sk_bound_dev_if;
 			fl.fl_ip_dport = inet->dport;
 			fl.fl_ip_sport = inet->sport;
+			security_sk_classify_flow(sk, &fl);
 
 			err = ip6_dst_lookup(sk, &dst, &fl);
 			if (err) {
@@ -422,6 +424,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 	fl.oif = ireq6->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
+	security_sk_classify_flow(sk, &fl);
 
 	if (dst == NULL) {
 		opt = np->opt;
@@ -566,6 +569,7 @@ static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
 	fl.oif = inet6_iif(rxskb);
 	fl.fl_ip_dport = dh->dccph_dport;
 	fl.fl_ip_sport = dh->dccph_sport;
+	security_skb_classify_flow(rxskb, &fl);
 
 	/* sk = NULL, but it is safe for now. RST socket required. */
 	if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
@@ -622,6 +626,7 @@ static void dccp_v6_reqsk_send_ack(struct sk_buff *rxskb,
 	fl.oif = inet6_iif(rxskb);
 	fl.fl_ip_dport = dh->dccph_dport;
 	fl.fl_ip_sport = dh->dccph_sport;
+	security_skb_classify_flow(rxskb, &fl);
 
 	if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
 		if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
@@ -842,6 +847,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 		fl.fl_ip_sport = inet_sk(sk)->sport;
+		security_sk_classify_flow(sk, &fl);
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
 			goto out;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c84a32070f8..fc40da3b6d3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1074,6 +1074,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 		},
 	};
 						
+	security_sk_classify_flow(sk, &fl);
 	err = ip_route_output_flow(&rt, &fl, sk, 0);
 }
 	if (!err)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4c86ac3d882..6ad797c1416 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -406,6 +406,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 						.saddr = rt->rt_spec_dst,
 						.tos = RT_TOS(skb->nh.iph->tos) } },
 				    .proto = IPPROTO_ICMP };
+		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(&rt, &fl))
 			goto out_unlock;
 	}
@@ -560,6 +561,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
 				}
 			}
 		};
+		security_skb_classify_flow(skb_in, &fl);
 		if (ip_route_output_key(&rt, &fl))
 			goto out_unlock;
 	}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e50a1bfd7cc..772b4eac78b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -327,6 +327,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 				       { .sport = inet_sk(sk)->sport,
 					 .dport = ireq->rmt_port } } };
 
+	security_sk_classify_flow(sk, &fl);
 	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 		return NULL;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a2ede167e04..308bdeac345 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -328,6 +328,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 			 * keep trying until route appears or the connection times
 			 * itself out.
 			 */
+			security_sk_classify_flow(sk, &fl);
 			if (ip_route_output_flow(&rt, &fl, sk, 0))
 				goto no_route;
 		}
@@ -1366,6 +1367,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 					       { .sport = skb->h.th->dest,
 					         .dport = skb->h.th->source } },
 				    .proto = sk->sk_protocol };
+		security_skb_classify_flow(skb, &fl);
 		if (ip_route_output_key(&rt, &fl))
 			return;
 	}
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 269bc2067cb..7f905bf2bde 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -90,6 +90,7 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
 	fl.proto = IPPROTO_TCP;
 	fl.fl_ip_sport = tcph->dest;
 	fl.fl_ip_dport = tcph->source;
+	security_skb_classify_flow(skb, &fl);
 
 	xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0);
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 62b2762a242..fe44cb50a1c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -484,6 +484,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		if (!inet->hdrincl)
 			raw_probe_proto_opt(&fl, msg);
 
+		security_sk_classify_flow(sk, &fl);
 		err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
 	}
 	if (err)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e20be3331f6..307dc3c0d63 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -259,6 +259,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 				    .uli_u = { .ports =
 					       { .sport = skb->h.th->dest,
 						 .dport = skb->h.th->source } } };
+		security_sk_classify_flow(sk, &fl);
 		if (ip_route_output_key(&rt, &fl)) {
 			reqsk_free(req);
 			goto out; 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f136cec96d9..a4d005eccc7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -603,6 +603,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				    .uli_u = { .ports =
 					       { .sport = inet->sport,
 						 .dport = dport } } };
+		security_sk_classify_flow(sk, &fl);
 		err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
 		if (err)
 			goto out;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index ac85e9c532c..82a1b1a328d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -637,6 +637,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet->dport;
 		fl.fl_ip_sport = inet->sport;
+		security_sk_classify_flow(sk, &fl);
 
 		if (np->opt && np->opt->srcrt) {
 			struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 3b55b4c8e2d..c73508e090a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -156,6 +156,8 @@ ipv4_connected:
 	if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST))
 		fl.oif = np->mcast_oif;
 
+	security_sk_classify_flow(sk, &fl);
+
 	if (flowlabel) {
 		if (flowlabel->opt && flowlabel->opt->srcrt) {
 			struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 356a8a7ef22..dbfce089e91 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -358,6 +358,7 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	fl.oif = iif;
 	fl.fl_icmp_type = type;
 	fl.fl_icmp_code = code;
+	security_skb_classify_flow(skb, &fl);
 
 	if (icmpv6_xmit_lock())
 		return;
@@ -472,6 +473,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 		ipv6_addr_copy(&fl.fl6_src, saddr);
 	fl.oif = skb->dev->ifindex;
 	fl.fl_icmp_type = ICMPV6_ECHO_REPLY;
+	security_skb_classify_flow(skb, &fl);
 
 	if (icmpv6_xmit_lock())
 		return;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index bf491077b82..7a51a258615 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -157,6 +157,7 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
 	fl.oif = sk->sk_bound_dev_if;
 	fl.fl_ip_sport = inet->sport;
 	fl.fl_ip_dport = inet->dport;
+	security_sk_classify_flow(sk, &fl);
 
 	if (np->opt && np->opt->srcrt) {
 		struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b50055b9278..67cfc3813c3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -419,6 +419,7 @@ static inline void ndisc_flow_init(struct flowi *fl, u8 type,
 	fl->proto	 	= IPPROTO_ICMPV6;
 	fl->fl_icmp_type	= type;
 	fl->fl_icmp_code	= 0;
+	security_sk_classify_flow(ndisc_socket->sk, fl);
 }
 
 static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh,
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 8629ba195d2..c4eba1aeb32 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -96,6 +96,7 @@ static void send_reset(struct sk_buff *oldskb)
 	ipv6_addr_copy(&fl.fl6_dst, &oip6h->saddr);
 	fl.fl_ip_sport = otcph.dest;
 	fl.fl_ip_dport = otcph.source;
+	security_skb_classify_flow(oldskb, &fl);
 	dst = ip6_route_output(NULL, &fl);
 	if (dst == NULL)
 		return;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 15b862d8aca..d5040e17229 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -759,6 +759,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
 		fl.oif = np->mcast_oif;
+	security_sk_classify_flow(sk, &fl);
 
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 802a1a6b103..46922e57e31 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -251,6 +251,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		final_p = &final;
 	}
 
+	security_sk_classify_flow(sk, &fl);
+
 	err = ip6_dst_lookup(sk, &dst, &fl);
 	if (err)
 		goto failure;
@@ -374,6 +376,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			fl.oif = sk->sk_bound_dev_if;
 			fl.fl_ip_dport = inet->dport;
 			fl.fl_ip_sport = inet->sport;
+			security_skb_classify_flow(skb, &fl);
 
 			if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
 				sk->sk_err_soft = -err;
@@ -467,6 +470,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	fl.oif = treq->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
+	security_sk_classify_flow(sk, &fl);
 
 	if (dst == NULL) {
 		opt = np->opt;
@@ -625,6 +629,7 @@ static void tcp_v6_send_reset(struct sk_buff *skb)
 	fl.oif = inet6_iif(skb);
 	fl.fl_ip_dport = t1->dest;
 	fl.fl_ip_sport = t1->source;
+	security_skb_classify_flow(skb, &fl);
 
 	/* sk = NULL, but it is safe for now. RST socket required. */
 	if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
@@ -691,6 +696,7 @@ static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32
 	fl.oif = inet6_iif(skb);
 	fl.fl_ip_dport = t1->dest;
 	fl.fl_ip_sport = t1->source;
+	security_skb_classify_flow(skb, &fl);
 
 	if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) {
 		if (xfrm_lookup(&buff->dst, &fl, NULL, 0) >= 0) {
@@ -923,6 +929,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 		fl.fl_ip_sport = inet_sk(sk)->sport;
+		security_sk_classify_flow(sk, &fl);
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
 			goto out;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3d54f246411..82c7c9cde2a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -782,6 +782,8 @@ do_udp_sendmsg:
 		connected = 0;
 	}
 
+	security_sk_classify_flow(sk, fl);
+
 	err = ip6_sk_dst_lookup(sk, &dst, fl);
 	if (err)
 		goto out;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 79405daadc5..32c963c9057 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -863,7 +863,6 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 	u16 family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 
-	fl->secid = security_sk_sid(sk, fl, dir);
 restart:
 	genid = atomic_read(&flow_cache_genid);
 	policy = NULL;
@@ -1039,7 +1038,7 @@ xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family
 		return -EAFNOSUPPORT;
 
 	afinfo->decode_session(skb, fl);
-	err = security_xfrm_decode_session(skb, fl);
+	err = security_xfrm_decode_session(skb, &fl->secid);
 	xfrm_policy_put_afinfo(afinfo);
 	return err;
 }
diff --git a/security/dummy.c b/security/dummy.c
index c1f10654871..c0ff6b9bfd7 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -809,9 +809,8 @@ static inline void dummy_sk_clone_security (const struct sock *sk, struct sock *
 {
 }
 
-static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir)
+static inline void dummy_sk_getsecid(struct sock *sk, u32 *secid)
 {
-	return 0;
 }
 #endif	/* CONFIG_SECURITY_NETWORK */
 
@@ -866,7 +865,7 @@ static int dummy_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm
 	return 1;
 }
 
-static int dummy_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+static int dummy_xfrm_decode_session(struct sk_buff *skb, u32 *fl, int ckall)
 {
 	return 0;
 }
@@ -1083,7 +1082,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, sk_alloc_security);
 	set_to_dummy_if_null(ops, sk_free_security);
 	set_to_dummy_if_null(ops, sk_clone_security);
-	set_to_dummy_if_null(ops, sk_getsid);
+	set_to_dummy_if_null(ops, sk_getsecid);
  #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef  CONFIG_SECURITY_NETWORK_XFRM
 	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5c189da07bc..4e5989d584c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3561,14 +3561,14 @@ static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
 	newssec->peer_sid = ssec->peer_sid;
 }
 
-static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir)
+static void selinux_sk_getsecid(struct sock *sk, u32 *secid)
 {
 	if (!sk)
-		return selinux_no_sk_sid(fl);
+		*secid = SECINITSID_ANY_SOCKET;
 	else {
 		struct sk_security_struct *sksec = sk->sk_security;
 
-		return sksec->sid;
+		*secid = sksec->sid;
 	}
 }
 
@@ -4622,7 +4622,7 @@ static struct security_operations selinux_ops = {
 	.sk_alloc_security =		selinux_sk_alloc_security,
 	.sk_free_security =		selinux_sk_free_security,
 	.sk_clone_security =		selinux_sk_clone_security,
-	.sk_getsid = 			selinux_sk_getsid_security,
+	.sk_getsecid = 			selinux_sk_getsecid,
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index f51a3e84bd9..8e45c1d588a 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -19,7 +19,7 @@ int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 fl_secid, u8 dir);
 int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x,
 			struct xfrm_policy *xp, struct flowi *fl);
 int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm);
-int selinux_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl);
+int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *fl, int ckall);
 
 
 /*
@@ -33,18 +33,6 @@ static inline struct inode_security_struct *get_sock_isec(struct sock *sk)
 	return SOCK_INODE(sk->sk_socket)->i_security;
 }
 
-
-static inline u32 selinux_no_sk_sid(struct flowi *fl)
-{
-	/* NOTE: no sock occurs on ICMP reply, forwards, ... */
-	/* icmp_reply: authorize as kernel packet */
-	if (fl && fl->proto == IPPROTO_ICMP) {
-		return SECINITSID_KERNEL;
-	}
-
-	return SECINITSID_ANY_SOCKET;
-}
-
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb,
 			struct avc_audit_data *ad);
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index a502b0540e3..c750ef7af66 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -158,11 +158,11 @@ int selinux_xfrm_flow_state_match(struct flowi *fl, struct xfrm_state *xfrm)
  * LSM hook implementation that determines the sid for the session.
  */
 
-int selinux_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
+int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall)
 {
 	struct sec_path *sp;
 
-	fl->secid = SECSID_NULL;
+	*sid = SECSID_NULL;
 
 	if (skb == NULL)
 		return 0;
@@ -177,10 +177,13 @@ int selinux_xfrm_decode_session(struct sk_buff *skb, struct flowi *fl)
 				struct xfrm_sec_ctx *ctx = x->security;
 
 				if (!sid_set) {
-					fl->secid = ctx->ctx_sid;
+					*sid = ctx->ctx_sid;
 					sid_set = 1;
+
+					if (!ckall)
+						break;
 				}
-				else if (fl->secid != ctx->ctx_sid)
+				else if (*sid != ctx->ctx_sid)
 					return -EINVAL;
 			}
 		}
-- 
cgit v1.2.3-70-g09d2


From cb969f072b6d67770b559617f14e767f47e77ece Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Mon, 24 Jul 2006 23:32:20 -0700
Subject: [MLSXFRM]: Default labeling of socket specific IPSec policies

This defaults the label of socket-specific IPSec policies to be the
same as the socket they are set on.

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h        | 19 ++++++++++++++++---
 include/net/xfrm.h              |  2 +-
 net/key/af_key.c                | 15 +++++++++++----
 net/xfrm/xfrm_state.c           |  2 +-
 net/xfrm/xfrm_user.c            | 13 +++++++++++--
 security/dummy.c                |  3 ++-
 security/selinux/include/xfrm.h |  3 ++-
 security/selinux/xfrm.c         | 33 ++++++++++++++++++++++-----------
 8 files changed, 66 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index f3909d189fe..8e3dc6c51a6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -827,8 +827,10 @@ struct swap_info_struct;
  *	used by the XFRM system.
  *	@sec_ctx contains the security context information being provided by
  *	the user-level policy update program (e.g., setkey).
+ *	@sk refers to the sock from which to derive the security context.
  *	Allocate a security structure to the xp->security field; the security
- *	field is initialized to NULL when the xfrm_policy is allocated.
+ *	field is initialized to NULL when the xfrm_policy is allocated. Only
+ *	one of sec_ctx or sock can be specified.
  *	Return 0 if operation was successful (memory to allocate, legal context)
  * @xfrm_policy_clone_security:
  *	@old contains an existing xfrm_policy in the SPD.
@@ -1359,7 +1361,8 @@ struct security_operations {
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
-	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+	int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp,
+			struct xfrm_user_sec_ctx *sec_ctx, struct sock *sk);
 	int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new);
 	void (*xfrm_policy_free_security) (struct xfrm_policy *xp);
 	int (*xfrm_policy_delete_security) (struct xfrm_policy *xp);
@@ -3057,7 +3060,12 @@ static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl)
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
 {
-	return security_ops->xfrm_policy_alloc_security(xp, sec_ctx);
+	return security_ops->xfrm_policy_alloc_security(xp, sec_ctx, NULL);
+}
+
+static inline int security_xfrm_sock_policy_alloc(struct xfrm_policy *xp, struct sock *sk)
+{
+	return security_ops->xfrm_policy_alloc_security(xp, NULL, sk);
 }
 
 static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
@@ -3132,6 +3140,11 @@ static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm
 	return 0;
 }
 
+static inline int security_xfrm_sock_policy_alloc(struct xfrm_policy *xp, struct sock *sk)
+{
+	return 0;
+}
+
 static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new)
 {
 	return 0;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3ecd9fa1ed4..00bf86e6e82 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -362,7 +362,7 @@ struct xfrm_mgr
 	char			*id;
 	int			(*notify)(struct xfrm_state *x, struct km_event *c);
 	int			(*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp, int dir);
-	struct xfrm_policy	*(*compile_policy)(u16 family, int opt, u8 *data, int len, int *dir);
+	struct xfrm_policy	*(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir);
 	int			(*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
 	int			(*notify_policy)(struct xfrm_policy *x, int dir, struct km_event *c);
 };
diff --git a/net/key/af_key.c b/net/key/af_key.c
index a065e1a6777..797c744a843 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2843,14 +2843,14 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
 	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL);
 }
 
-static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
+static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
                                                 u8 *data, int len, int *dir)
 {
 	struct xfrm_policy *xp;
 	struct sadb_x_policy *pol = (struct sadb_x_policy*)data;
 	struct sadb_x_sec_ctx *sec_ctx;
 
-	switch (family) {
+	switch (sk->sk_family) {
 	case AF_INET:
 		if (opt != IP_IPSEC_POLICY) {
 			*dir = -EOPNOTSUPP;
@@ -2891,7 +2891,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 	xp->lft.hard_byte_limit = XFRM_INF;
 	xp->lft.soft_packet_limit = XFRM_INF;
 	xp->lft.hard_packet_limit = XFRM_INF;
-	xp->family = family;
+	xp->family = sk->sk_family;
 
 	xp->xfrm_nr = 0;
 	if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC &&
@@ -2907,8 +2907,10 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 		p += pol->sadb_x_policy_len*8;
 		sec_ctx = (struct sadb_x_sec_ctx *)p;
 		if (len < pol->sadb_x_policy_len*8 +
-		    sec_ctx->sadb_x_sec_len)
+		    sec_ctx->sadb_x_sec_len) {
+			*dir = -EINVAL;
 			goto out;
+		}
 		if ((*dir = verify_sec_ctx_len(p)))
 			goto out;
 		uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx);
@@ -2918,6 +2920,11 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt,
 		if (*dir)
 			goto out;
 	}
+	else {
+		*dir = security_xfrm_sock_policy_alloc(xp, sk);
+		if (*dir)
+			goto out;
+	}
 
 	*dir = pol->sadb_x_policy_dir-1;
 	return xp;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index be02bd981d1..1c796087ee7 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1026,7 +1026,7 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
 	err = -EINVAL;
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list) {
-		pol = km->compile_policy(sk->sk_family, optname, data,
+		pol = km->compile_policy(sk, optname, data,
 					 optlen, &err);
 		if (err >= 0)
 			break;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index dac8db1088b..f70e158874d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1757,7 +1757,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
 /* User gives us xfrm_user_policy_info followed by an array of 0
  * or more templates.
  */
-static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
+static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 					       u8 *data, int len, int *dir)
 {
 	struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data;
@@ -1765,7 +1765,7 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
 	struct xfrm_policy *xp;
 	int nr;
 
-	switch (family) {
+	switch (sk->sk_family) {
 	case AF_INET:
 		if (opt != IP_XFRM_POLICY) {
 			*dir = -EOPNOTSUPP;
@@ -1807,6 +1807,15 @@ static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
 	copy_from_user_policy(xp, p);
 	copy_templates(xp, ut, nr);
 
+	if (!xp->security) {
+		int err = security_xfrm_sock_policy_alloc(xp, sk);
+		if (err) {
+			kfree(xp);
+			*dir = err;
+			return NULL;
+		}
+	}
+
 	*dir = p->dir;
 
 	return xp;
diff --git a/security/dummy.c b/security/dummy.c
index c0ff6b9bfd7..66cc0640493 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -815,7 +815,8 @@ static inline void dummy_sk_getsecid(struct sock *sk, u32 *secid)
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
-static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx)
+static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp,
+		struct xfrm_user_sec_ctx *sec_ctx, struct sock *sk)
 {
 	return 0;
 }
diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h
index 8e45c1d588a..1822c73e508 100644
--- a/security/selinux/include/xfrm.h
+++ b/security/selinux/include/xfrm.h
@@ -7,7 +7,8 @@
 #ifndef _SELINUX_XFRM_H_
 #define _SELINUX_XFRM_H_
 
-int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx);
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp,
+		struct xfrm_user_sec_ctx *sec_ctx, struct sock *sk);
 int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new);
 void selinux_xfrm_policy_free(struct xfrm_policy *xp);
 int selinux_xfrm_policy_delete(struct xfrm_policy *xp);
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index c750ef7af66..d3690f98513 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -208,10 +208,8 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
 
 	BUG_ON(uctx && pol);
 
-	if (pol)
-		goto from_policy;
-
-	BUG_ON(!uctx);
+	if (!uctx)
+		goto not_from_user;
 
 	if (uctx->ctx_doi != XFRM_SC_ALG_SELINUX)
 		return -EINVAL;
@@ -251,11 +249,14 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp,
 
 	return rc;
 
-from_policy:
-	BUG_ON(!pol);
-	rc = security_sid_mls_copy(pol->ctx_sid, sid, &ctx_sid);
-	if (rc)
-		goto out;
+not_from_user:
+	if (pol) {
+		rc = security_sid_mls_copy(pol->ctx_sid, sid, &ctx_sid);
+		if (rc)
+			goto out;
+	}
+	else
+		ctx_sid = sid;
 
 	rc = security_sid_to_context(ctx_sid, &ctx_str, &str_len);
 	if (rc)
@@ -293,13 +294,23 @@ out2:
  * LSM hook implementation that allocs and transfers uctx spec to
  * xfrm_policy.
  */
-int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx)
+int selinux_xfrm_policy_alloc(struct xfrm_policy *xp,
+		struct xfrm_user_sec_ctx *uctx, struct sock *sk)
 {
 	int err;
+	u32 sid;
 
 	BUG_ON(!xp);
+	BUG_ON(uctx && sk);
+
+	if (sk) {
+		struct sk_security_struct *ssec = sk->sk_security;
+		sid = ssec->sid;
+	}
+	else
+		sid = SECSID_NULL;
 
-	err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, NULL, 0);
+	err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx, NULL, sid);
 	return err;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4237c75c0a35535d7f9f2bfeeb4b4df1e068a0bf Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Mon, 24 Jul 2006 23:32:50 -0700
Subject: [MLSXFRM]: Auto-labeling of child sockets

This automatically labels the TCP, Unix stream, and dccp child sockets
as well as openreqs to be at the same MLS level as the peer. This will
result in the selection of appropriately labeled IPSec Security
Associations.

This also uses the sock's sid (as opposed to the isec sid) in SELinux
enforcement of secmark in rcv_skb and postroute_last hooks.

Signed-off-by: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h        |  55 ++++++++++++++++
 include/net/request_sock.h      |   1 +
 include/net/sock.h              |   1 +
 net/dccp/ipv4.c                 |   3 +
 net/dccp/ipv6.c                 |   7 +-
 net/ipv4/inet_connection_sock.c |   4 +-
 net/ipv4/syncookies.c           |   6 +-
 net/ipv4/tcp_ipv4.c             |   3 +
 net/ipv6/tcp_ipv6.c             |   6 +-
 security/dummy.c                |  24 +++++++
 security/selinux/hooks.c        | 137 +++++++++++++++++++++++++++-------------
 security/selinux/xfrm.c         |   1 -
 12 files changed, 197 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 8e3dc6c51a6..bb4c80fdfe7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -90,6 +90,7 @@ extern int cap_netlink_recv(struct sk_buff *skb, int cap);
 struct nfsctl_arg;
 struct sched_param;
 struct swap_info_struct;
+struct request_sock;
 
 /* bprm_apply_creds unsafe reasons */
 #define LSM_UNSAFE_SHARE	1
@@ -819,6 +820,14 @@ struct swap_info_struct;
  * @sk_getsecid:
  *	Retrieve the LSM-specific secid for the sock to enable caching of network
  *	authorizations.
+ * @sock_graft:
+ *	Sets the socket's isec sid to the sock's sid.
+ * @inet_conn_request:
+ *	Sets the openreq's sid to socket's sid with MLS portion taken from peer sid.
+ * @inet_csk_clone:
+ *	Sets the new child socket's sid to the openreq sid.
+ * @req_classify_flow:
+ *	Sets the flow's sid to the openreq sid.
  *
  * Security hooks for XFRM operations.
  *
@@ -1358,6 +1367,11 @@ struct security_operations {
 	void (*sk_free_security) (struct sock *sk);
 	void (*sk_clone_security) (const struct sock *sk, struct sock *newsk);
 	void (*sk_getsecid) (struct sock *sk, u32 *secid);
+	void (*sock_graft)(struct sock* sk, struct socket *parent);
+	int (*inet_conn_request)(struct sock *sk, struct sk_buff *skb,
+					struct request_sock *req);
+	void (*inet_csk_clone)(struct sock *newsk, const struct request_sock *req);
+	void (*req_classify_flow)(const struct request_sock *req, struct flowi *fl);
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
@@ -2926,6 +2940,28 @@ static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl)
 {
 	security_ops->sk_getsecid(sk, &fl->secid);
 }
+
+static inline void security_req_classify_flow(const struct request_sock *req, struct flowi *fl)
+{
+	security_ops->req_classify_flow(req, fl);
+}
+
+static inline void security_sock_graft(struct sock* sk, struct socket *parent)
+{
+	security_ops->sock_graft(sk, parent);
+}
+
+static inline int security_inet_conn_request(struct sock *sk,
+			struct sk_buff *skb, struct request_sock *req)
+{
+	return security_ops->inet_conn_request(sk, skb, req);
+}
+
+static inline void security_inet_csk_clone(struct sock *newsk,
+			const struct request_sock *req)
+{
+	security_ops->inet_csk_clone(newsk, req);
+}
 #else	/* CONFIG_SECURITY_NETWORK */
 static inline int security_unix_stream_connect(struct socket * sock,
 					       struct socket * other, 
@@ -3055,6 +3091,25 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk)
 static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl)
 {
 }
+
+static inline void security_req_classify_flow(const struct request_sock *req, struct flowi *fl)
+{
+}
+
+static inline void security_sock_graft(struct sock* sk, struct socket *parent)
+{
+}
+
+static inline int security_inet_conn_request(struct sock *sk,
+			struct sk_buff *skb, struct request_sock *req)
+{
+	return 0;
+}
+
+static inline void security_inet_csk_clone(struct sock *newsk,
+			const struct request_sock *req)
+{
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index c5d7f920c35..8e165ca16bd 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -53,6 +53,7 @@ struct request_sock {
 	unsigned long			expires;
 	struct request_sock_ops		*rsk_ops;
 	struct sock			*sk;
+	u32				secid;
 };
 
 static inline struct request_sock *reqsk_alloc(struct request_sock_ops *ops)
diff --git a/include/net/sock.h b/include/net/sock.h
index 91cdceb3c02..337ebec84c7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -969,6 +969,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	sk->sk_sleep = &parent->wait;
 	parent->sk = sk;
 	sk->sk_socket = parent;
+	security_sock_graft(sk, parent);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 386498053b1..171d363876e 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -501,6 +501,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	dccp_openreq_init(req, &dp, skb);
 
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
 	ireq = inet_rsk(req);
 	ireq->loc_addr = daddr;
 	ireq->rmt_addr = saddr;
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 53d255c0143..231bc7c7e74 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -424,7 +424,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
 	fl.oif = ireq6->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
-	security_sk_classify_flow(sk, &fl);
+	security_req_classify_flow(req, &fl);
 
 	if (dst == NULL) {
 		opt = np->opt;
@@ -626,7 +626,7 @@ static void dccp_v6_reqsk_send_ack(struct sk_buff *rxskb,
 	fl.oif = inet6_iif(rxskb);
 	fl.fl_ip_dport = dh->dccph_dport;
 	fl.fl_ip_sport = dh->dccph_sport;
-	security_skb_classify_flow(rxskb, &fl);
+	security_req_classify_flow(req, &fl);
 
 	if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
 		if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
@@ -709,6 +709,9 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	dccp_openreq_init(req, &dp, skb);
 
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
 	ireq6 = inet6_rsk(req);
 	ireq = inet_rsk(req);
 	ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 772b4eac78b..07204391d08 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -327,7 +327,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
 				       { .sport = inet_sk(sk)->sport,
 					 .dport = ireq->rmt_port } } };
 
-	security_sk_classify_flow(sk, &fl);
+	security_req_classify_flow(req, &fl);
 	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
 		return NULL;
@@ -510,6 +510,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
 
 		/* Deinitialize accept_queue to trap illegal accesses. */
 		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+
+		security_inet_csk_clone(newsk, req);
 	}
 	return newsk;
 }
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 307dc3c0d63..661e0a4bca7 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -214,6 +214,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	if (!req)
 		goto out;
 
+	if (security_inet_conn_request(sk, skb, req)) {
+		reqsk_free(req);
+		goto out;
+	}
 	ireq = inet_rsk(req);
 	treq = tcp_rsk(req);
 	treq->rcv_isn		= htonl(skb->h.th->seq) - 1;
@@ -259,7 +263,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 				    .uli_u = { .ports =
 					       { .sport = skb->h.th->dest,
 						 .dport = skb->h.th->source } } };
-		security_sk_classify_flow(sk, &fl);
+		security_req_classify_flow(req, &fl);
 		if (ip_route_output_key(&rt, &fl)) {
 			reqsk_free(req);
 			goto out; 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4b04c3edd4a..43f6740244f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -798,6 +798,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_openreq_init(req, &tmp_opt, skb);
 
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
 	ireq = inet_rsk(req);
 	ireq->loc_addr = daddr;
 	ireq->rmt_addr = saddr;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 46922e57e31..302786a11cd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -470,7 +470,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	fl.oif = treq->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
-	security_sk_classify_flow(sk, &fl);
+	security_req_classify_flow(req, &fl);
 
 	if (dst == NULL) {
 		opt = np->opt;
@@ -826,6 +826,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_rsk(req)->snt_isn = isn;
 
+	security_inet_conn_request(sk, skb, req);
+
 	if (tcp_v6_send_synack(sk, req, NULL))
 		goto drop;
 
@@ -929,7 +931,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 		fl.fl_ip_sport = inet_sk(sk)->sport;
-		security_sk_classify_flow(sk, &fl);
+		security_req_classify_flow(req, &fl);
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
 			goto out;
diff --git a/security/dummy.c b/security/dummy.c
index 66cc0640493..1c45f8e4aad 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -812,6 +812,26 @@ static inline void dummy_sk_clone_security (const struct sock *sk, struct sock *
 static inline void dummy_sk_getsecid(struct sock *sk, u32 *secid)
 {
 }
+
+static inline void dummy_sock_graft(struct sock* sk, struct socket *parent)
+{
+}
+
+static inline int dummy_inet_conn_request(struct sock *sk,
+			struct sk_buff *skb, struct request_sock *req)
+{
+	return 0;
+}
+
+static inline void dummy_inet_csk_clone(struct sock *newsk,
+			const struct request_sock *req)
+{
+}
+
+static inline void dummy_req_classify_flow(const struct request_sock *req,
+			struct flowi *fl)
+{
+}
 #endif	/* CONFIG_SECURITY_NETWORK */
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
@@ -1084,6 +1104,10 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, sk_free_security);
 	set_to_dummy_if_null(ops, sk_clone_security);
 	set_to_dummy_if_null(ops, sk_getsecid);
+	set_to_dummy_if_null(ops, sock_graft);
+	set_to_dummy_if_null(ops, inet_conn_request);
+	set_to_dummy_if_null(ops, inet_csk_clone);
+	set_to_dummy_if_null(ops, req_classify_flow);
  #endif	/* CONFIG_SECURITY_NETWORK */
 #ifdef  CONFIG_SECURITY_NETWORK_XFRM
 	set_to_dummy_if_null(ops, xfrm_policy_alloc_security);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4e5989d584c..1dc935f7b91 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3328,8 +3328,9 @@ static int selinux_socket_unix_stream_connect(struct socket *sock,
 	/* server child socket */
 	ssec = newsk->sk_security;
 	ssec->peer_sid = isec->sid;
-	
-	return 0;
+	err = security_sid_mls_copy(other_isec->sid, ssec->peer_sid, &ssec->sid);
+
+	return err;
 }
 
 static int selinux_socket_unix_may_send(struct socket *sock,
@@ -3355,11 +3356,29 @@ static int selinux_socket_unix_may_send(struct socket *sock,
 }
 
 static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
-		struct avc_audit_data *ad, u32 sock_sid, u16 sock_class,
-		u16 family, char *addrp, int len)
+		struct avc_audit_data *ad, u16 family, char *addrp, int len)
 {
 	int err = 0;
 	u32 netif_perm, node_perm, node_sid, if_sid, recv_perm = 0;
+	struct socket *sock;
+	u16 sock_class = 0;
+	u32 sock_sid = 0;
+
+ 	read_lock_bh(&sk->sk_callback_lock);
+ 	sock = sk->sk_socket;
+ 	if (sock) {
+ 		struct inode *inode;
+ 		inode = SOCK_INODE(sock);
+ 		if (inode) {
+ 			struct inode_security_struct *isec;
+ 			isec = inode->i_security;
+ 			sock_sid = isec->sid;
+ 			sock_class = isec->sclass;
+ 		}
+ 	}
+ 	read_unlock_bh(&sk->sk_callback_lock);
+ 	if (!sock_sid)
+  		goto out;
 
 	if (!skb->dev)
 		goto out;
@@ -3419,12 +3438,10 @@ out:
 static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	u16 family;
-	u16 sock_class = 0;
 	char *addrp;
 	int len, err = 0;
-	u32 sock_sid = 0;
-	struct socket *sock;
 	struct avc_audit_data ad;
+	struct sk_security_struct *sksec = sk->sk_security;
 
 	family = sk->sk_family;
 	if (family != PF_INET && family != PF_INET6)
@@ -3434,22 +3451,6 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (family == PF_INET6 && skb->protocol == ntohs(ETH_P_IP))
 		family = PF_INET;
 
- 	read_lock_bh(&sk->sk_callback_lock);
- 	sock = sk->sk_socket;
- 	if (sock) {
- 		struct inode *inode;
- 		inode = SOCK_INODE(sock);
- 		if (inode) {
- 			struct inode_security_struct *isec;
- 			isec = inode->i_security;
- 			sock_sid = isec->sid;
- 			sock_class = isec->sclass;
- 		}
- 	}
- 	read_unlock_bh(&sk->sk_callback_lock);
- 	if (!sock_sid)
-  		goto out;
-
 	AVC_AUDIT_DATA_INIT(&ad, NET);
 	ad.u.net.netif = skb->dev ? skb->dev->name : "[unknown]";
 	ad.u.net.family = family;
@@ -3459,16 +3460,15 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto out;
 
 	if (selinux_compat_net)
-		err = selinux_sock_rcv_skb_compat(sk, skb, &ad, sock_sid,
-						  sock_class, family,
+		err = selinux_sock_rcv_skb_compat(sk, skb, &ad, family,
 						  addrp, len);
 	else
-		err = avc_has_perm(sock_sid, skb->secmark, SECCLASS_PACKET,
+		err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
 				   PACKET__RECV, &ad);
 	if (err)
 		goto out;
 
-	err = selinux_xfrm_sock_rcv_skb(sock_sid, skb, &ad);
+	err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
 out:	
 	return err;
 }
@@ -3572,6 +3572,49 @@ static void selinux_sk_getsecid(struct sock *sk, u32 *secid)
 	}
 }
 
+void selinux_sock_graft(struct sock* sk, struct socket *parent)
+{
+	struct inode_security_struct *isec = SOCK_INODE(parent)->i_security;
+	struct sk_security_struct *sksec = sk->sk_security;
+
+	isec->sid = sksec->sid;
+}
+
+int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
+					   struct request_sock *req)
+{
+	struct sk_security_struct *sksec = sk->sk_security;
+	int err;
+	u32 newsid = 0;
+	u32 peersid;
+
+	err = selinux_xfrm_decode_session(skb, &peersid, 0);
+	BUG_ON(err);
+
+	err = security_sid_mls_copy(sksec->sid, peersid, &newsid);
+	if (err)
+		return err;
+
+	req->secid = newsid;
+	return 0;
+}
+
+void selinux_inet_csk_clone(struct sock *newsk, const struct request_sock *req)
+{
+	struct sk_security_struct *newsksec = newsk->sk_security;
+
+	newsksec->sid = req->secid;
+	/* NOTE: Ideally, we should also get the isec->sid for the
+	   new socket in sync, but we don't have the isec available yet.
+	   So we will wait until sock_graft to do it, by which
+	   time it will have been created and available. */
+}
+
+void selinux_req_classify_flow(const struct request_sock *req, struct flowi *fl)
+{
+	fl->secid = req->secid;
+}
+
 static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb)
 {
 	int err = 0;
@@ -3611,12 +3654,24 @@ out:
 #ifdef CONFIG_NETFILTER
 
 static int selinux_ip_postroute_last_compat(struct sock *sk, struct net_device *dev,
-					    struct inode_security_struct *isec,
 					    struct avc_audit_data *ad,
 					    u16 family, char *addrp, int len)
 {
-	int err;
+	int err = 0;
 	u32 netif_perm, node_perm, node_sid, if_sid, send_perm = 0;
+	struct socket *sock;
+	struct inode *inode;
+	struct inode_security_struct *isec;
+
+	sock = sk->sk_socket;
+	if (!sock)
+		goto out;
+
+	inode = SOCK_INODE(sock);
+	if (!inode)
+		goto out;
+
+	isec = inode->i_security;
 	
 	err = sel_netif_sids(dev, &if_sid, NULL);
 	if (err)
@@ -3681,26 +3736,16 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
 	char *addrp;
 	int len, err = 0;
 	struct sock *sk;
-	struct socket *sock;
-	struct inode *inode;
 	struct sk_buff *skb = *pskb;
-	struct inode_security_struct *isec;
 	struct avc_audit_data ad;
 	struct net_device *dev = (struct net_device *)out;
+	struct sk_security_struct *sksec;
 
 	sk = skb->sk;
 	if (!sk)
 		goto out;
 
-	sock = sk->sk_socket;
-	if (!sock)
-		goto out;
-
-	inode = SOCK_INODE(sock);
-	if (!inode)
-		goto out;
-
-	isec = inode->i_security;
+	sksec = sk->sk_security;
 
 	AVC_AUDIT_DATA_INIT(&ad, NET);
 	ad.u.net.netif = dev->name;
@@ -3711,16 +3756,16 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum,
 		goto out;
 
 	if (selinux_compat_net)
-		err = selinux_ip_postroute_last_compat(sk, dev, isec, &ad,
+		err = selinux_ip_postroute_last_compat(sk, dev, &ad,
 						       family, addrp, len);
 	else
-		err = avc_has_perm(isec->sid, skb->secmark, SECCLASS_PACKET,
+		err = avc_has_perm(sksec->sid, skb->secmark, SECCLASS_PACKET,
 				   PACKET__SEND, &ad);
 
 	if (err)
 		goto out;
 
-	err = selinux_xfrm_postroute_last(isec->sid, skb, &ad);
+	err = selinux_xfrm_postroute_last(sksec->sid, skb, &ad);
 out:
 	return err ? NF_DROP : NF_ACCEPT;
 }
@@ -4623,6 +4668,10 @@ static struct security_operations selinux_ops = {
 	.sk_free_security =		selinux_sk_free_security,
 	.sk_clone_security =		selinux_sk_clone_security,
 	.sk_getsecid = 			selinux_sk_getsecid,
+	.sock_graft =			selinux_sock_graft,
+	.inet_conn_request =		selinux_inet_conn_request,
+	.inet_csk_clone =		selinux_inet_csk_clone,
+	.req_classify_flow =		selinux_req_classify_flow,
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
 	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c
index d3690f98513..3e742b850af 100644
--- a/security/selinux/xfrm.c
+++ b/security/selinux/xfrm.c
@@ -271,7 +271,6 @@ not_from_user:
 		goto out;
 	}
 
-
 	ctx->ctx_doi = XFRM_SC_DOI_LSM;
 	ctx->ctx_alg = XFRM_SC_ALG_SELINUX;
 	ctx->ctx_sid = ctx_sid;
-- 
cgit v1.2.3-70-g09d2


From 11a03f78fbf15a866ba3bf6359a75cdfd1ced703 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Thu, 3 Aug 2006 16:46:20 -0700
Subject: [NetLabel]: core network changes

Changes to the core network stack to support the NetLabel subsystem.  This
includes changes to the IPv4 option handling to support CIPSO labels.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ip.h       |   1 +
 include/net/cipso_ipv4.h | 250 ++++++++++++++++++++++++++++++++++++++++
 include/net/inet_sock.h  |   2 +-
 include/net/netlabel.h   | 291 +++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ah4.c           |   2 +-
 net/ipv4/ip_options.c    |  19 ++++
 6 files changed, 563 insertions(+), 2 deletions(-)
 create mode 100644 include/net/cipso_ipv4.h
 create mode 100644 include/net/netlabel.h

(limited to 'include')

diff --git a/include/linux/ip.h b/include/linux/ip.h
index 4b55cf1df73..2f4600146f8 100644
--- a/include/linux/ip.h
+++ b/include/linux/ip.h
@@ -57,6 +57,7 @@
 #define IPOPT_SEC	(2 |IPOPT_CONTROL|IPOPT_COPY)
 #define IPOPT_LSRR	(3 |IPOPT_CONTROL|IPOPT_COPY)
 #define IPOPT_TIMESTAMP	(4 |IPOPT_MEASUREMENT)
+#define IPOPT_CIPSO	(6 |IPOPT_CONTROL|IPOPT_COPY)
 #define IPOPT_RR	(7 |IPOPT_CONTROL)
 #define IPOPT_SID	(8 |IPOPT_CONTROL|IPOPT_COPY)
 #define IPOPT_SSRR	(9 |IPOPT_CONTROL|IPOPT_COPY)
diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
new file mode 100644
index 00000000000..c7175e72580
--- /dev/null
+++ b/include/net/cipso_ipv4.h
@@ -0,0 +1,250 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188, copies of both documents can be found in the Documentation
+ * directory.  While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _CIPSO_IPV4_H
+#define _CIPSO_IPV4_H
+
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <net/netlabel.h>
+
+/* known doi values */
+#define CIPSO_V4_DOI_UNKNOWN          0x00000000
+
+/* tag types */
+#define CIPSO_V4_TAG_INVALID          0
+#define CIPSO_V4_TAG_RBITMAP          1
+#define CIPSO_V4_TAG_ENUM             2
+#define CIPSO_V4_TAG_RANGE            5
+#define CIPSO_V4_TAG_PBITMAP          6
+#define CIPSO_V4_TAG_FREEFORM         7
+
+/* doi mapping types */
+#define CIPSO_V4_MAP_UNKNOWN          0
+#define CIPSO_V4_MAP_STD              1
+#define CIPSO_V4_MAP_PASS             2
+
+/* limits */
+#define CIPSO_V4_MAX_REM_LVLS         256
+#define CIPSO_V4_INV_LVL              0x80000000
+#define CIPSO_V4_MAX_LOC_LVLS         (CIPSO_V4_INV_LVL - 1)
+#define CIPSO_V4_MAX_REM_CATS         65536
+#define CIPSO_V4_INV_CAT              0x80000000
+#define CIPSO_V4_MAX_LOC_CATS         (CIPSO_V4_INV_CAT - 1)
+
+/*
+ * CIPSO DOI definitions
+ */
+
+/* DOI definition struct */
+#define CIPSO_V4_TAG_MAXCNT           5
+struct cipso_v4_doi {
+	u32 doi;
+	u32 type;
+	union {
+		struct cipso_v4_std_map_tbl *std;
+	} map;
+	u8 tags[CIPSO_V4_TAG_MAXCNT];
+
+	u32 valid;
+	struct list_head list;
+	struct rcu_head rcu;
+	struct list_head dom_list;
+};
+
+/* Standard CIPSO mapping table */
+/* NOTE: the highest order bit (i.e. 0x80000000) is an 'invalid' flag, if the
+ *       bit is set then consider that value as unspecified, meaning the
+ *       mapping for that particular level/category is invalid */
+struct cipso_v4_std_map_tbl {
+	struct {
+		u32 *cipso;
+		u32 *local;
+		u32 cipso_size;
+		u32 local_size;
+	} lvl;
+	struct {
+		u32 *cipso;
+		u32 *local;
+		u32 cipso_size;
+		u32 local_size;
+	} cat;
+};
+
+/*
+ * Sysctl Variables
+ */
+
+#ifdef CONFIG_NETLABEL
+extern int cipso_v4_cache_enabled;
+extern int cipso_v4_cache_bucketsize;
+extern int cipso_v4_rbm_optfmt;
+extern int cipso_v4_rbm_strictvalid;
+#endif
+
+/*
+ * Helper Functions
+ */
+
+#define CIPSO_V4_OPTEXIST(x) (IPCB(x)->opt.cipso != 0)
+#define CIPSO_V4_OPTPTR(x) ((x)->nh.raw + IPCB(x)->opt.cipso)
+
+/*
+ * DOI List Functions
+ */
+
+#ifdef CONFIG_NETLABEL
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def);
+int cipso_v4_doi_remove(u32 doi, void (*callback) (struct rcu_head * head));
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi);
+struct sk_buff *cipso_v4_doi_dump_all(size_t headroom);
+struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom);
+int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain);
+int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
+			       const char *domain);
+#else
+static inline int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
+{
+	return -ENOSYS;
+}
+
+static inline int cipso_v4_doi_remove(u32 doi,
+				    void (*callback) (struct rcu_head * head))
+{
+	return 0;
+}
+
+static inline struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+	return NULL;
+}
+
+static inline struct sk_buff *cipso_v4_doi_dump_all(size_t headroom)
+{
+	return NULL;
+}
+
+static inline struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom)
+{
+	return NULL;
+}
+
+static inline int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def,
+					  const char *domain)
+{
+	return -ENOSYS;
+}
+
+static inline int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
+					     const char *domain)
+{
+	return 0;
+}
+#endif /* CONFIG_NETLABEL */
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+#ifdef CONFIG_NETLABEL
+void cipso_v4_cache_invalidate(void);
+int cipso_v4_cache_add(const struct sk_buff *skb,
+		       const struct netlbl_lsm_secattr *secattr);
+#else
+static inline void cipso_v4_cache_invalidate(void)
+{
+	return;
+}
+
+static inline int cipso_v4_cache_add(const struct sk_buff *skb,
+				     const struct netlbl_lsm_secattr *secattr)
+{
+	return 0;
+}
+#endif /* CONFIG_NETLABEL */
+
+/*
+ * Protocol Handling Functions
+ */
+
+#ifdef CONFIG_NETLABEL
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway);
+int cipso_v4_socket_setopt(struct socket *sock,
+			   unsigned char *opt,
+			   u32 opt_len);
+int cipso_v4_socket_setattr(const struct socket *sock,
+			    const struct cipso_v4_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr);
+int cipso_v4_socket_getopt(const struct socket *sock,
+			   unsigned char **opt,
+			   u32 *opt_len);
+int cipso_v4_socket_getattr(const struct socket *sock,
+			    struct netlbl_lsm_secattr *secattr);
+int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
+			    struct netlbl_lsm_secattr *secattr);
+int cipso_v4_validate(unsigned char **option);
+#else
+static inline void cipso_v4_error(struct sk_buff *skb,
+				  int error,
+				  u32 gateway)
+{
+	return;
+}
+
+static inline int cipso_v4_socket_setattr(const struct socket *sock,
+				  const struct cipso_v4_doi *doi_def,
+				  const struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline int cipso_v4_socket_getattr(const struct socket *sock,
+					  struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
+					  struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline int cipso_v4_validate(unsigned char **option)
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_NETLABEL */
+
+#endif /* _CIPSO_IPV4_H */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 1f4a9a60d4c..f4caad56cd0 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -51,7 +51,7 @@ struct ip_options {
 			ts_needtime:1,
 			ts_needaddr:1;
 	unsigned char	router_alert;
-	unsigned char	__pad1;
+	unsigned char	cipso;
 	unsigned char	__pad2;
 	unsigned char	__data[0];
 };
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
new file mode 100644
index 00000000000..7cae730832c
--- /dev/null
+++ b/include/net/netlabel.h
@@ -0,0 +1,291 @@
+/*
+ * NetLabel System
+ *
+ * The NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _NETLABEL_H
+#define _NETLABEL_H
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+
+/*
+ * NetLabel - A management interface for maintaining network packet label
+ *            mapping tables for explicit packet labling protocols.
+ *
+ * Network protocols such as CIPSO and RIPSO require a label translation layer
+ * to convert the label on the packet into something meaningful on the host
+ * machine.  In the current Linux implementation these mapping tables live
+ * inside the kernel; NetLabel provides a mechanism for user space applications
+ * to manage these mapping tables.
+ *
+ * NetLabel makes use of the Generic NETLINK mechanism as a transport layer to
+ * send messages between kernel and user space.  The general format of a
+ * NetLabel message is shown below:
+ *
+ *  +-----------------+-------------------+--------- --- -- -
+ *  | struct nlmsghdr | struct genlmsghdr | payload
+ *  +-----------------+-------------------+--------- --- -- -
+ *
+ * The 'nlmsghdr' and 'genlmsghdr' structs should be dealt with like normal.
+ * The payload is dependent on the subsystem specified in the
+ * 'nlmsghdr->nlmsg_type' and should be defined below, supporting functions
+ * should be defined in the corresponding net/netlabel/netlabel_<subsys>.h|c
+ * file.  All of the fields in the NetLabel payload are NETLINK attributes, the
+ * length of each field is the length of the NETLINK attribute payload, see
+ * include/net/netlink.h for more information on NETLINK attributes.
+ *
+ */
+
+/*
+ * NetLabel NETLINK protocol
+ */
+
+#define NETLBL_PROTO_VERSION            1
+
+/* NetLabel NETLINK types/families */
+#define NETLBL_NLTYPE_NONE              0
+#define NETLBL_NLTYPE_MGMT              1
+#define NETLBL_NLTYPE_MGMT_NAME         "NLBL_MGMT"
+#define NETLBL_NLTYPE_RIPSO             2
+#define NETLBL_NLTYPE_RIPSO_NAME        "NLBL_RIPSO"
+#define NETLBL_NLTYPE_CIPSOV4           3
+#define NETLBL_NLTYPE_CIPSOV4_NAME      "NLBL_CIPSOv4"
+#define NETLBL_NLTYPE_CIPSOV6           4
+#define NETLBL_NLTYPE_CIPSOV6_NAME      "NLBL_CIPSOv6"
+#define NETLBL_NLTYPE_UNLABELED         5
+#define NETLBL_NLTYPE_UNLABELED_NAME    "NLBL_UNLBL"
+
+/* NetLabel return codes */
+#define NETLBL_E_OK                     0
+
+/*
+ * Helper functions
+ */
+
+#define NETLBL_LEN_U8                   nla_total_size(sizeof(u8))
+#define NETLBL_LEN_U16                  nla_total_size(sizeof(u16))
+#define NETLBL_LEN_U32                  nla_total_size(sizeof(u32))
+
+/**
+ * netlbl_netlink_alloc_skb - Allocate a NETLINK message buffer
+ * @head: the amount of headroom in bytes
+ * @body: the desired size (minus headroom) in bytes
+ * @gfp_flags: the alloc flags to pass to alloc_skb()
+ *
+ * Description:
+ * Allocate a NETLINK message buffer based on the sizes given in @head and
+ * @body.  If @head is greater than zero skb_reserve() is called to reserve
+ * @head bytes at the start of the buffer.  Returns a valid sk_buff pointer on
+ * success, NULL on failure.
+ *
+ */
+static inline struct sk_buff *netlbl_netlink_alloc_skb(size_t head,
+						       size_t body,
+						       int gfp_flags)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(NLMSG_ALIGN(head + body), gfp_flags);
+	if (skb == NULL)
+		return NULL;
+	if (head > 0) {
+		skb_reserve(skb, head);
+		if (skb_tailroom(skb) < body) {
+			kfree_skb(skb);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+/*
+ * NetLabel - Kernel API for accessing the network packet label mappings.
+ *
+ * The following functions are provided for use by other kernel modules,
+ * specifically kernel LSM modules, to provide a consistent, transparent API
+ * for dealing with explicit packet labeling protocols such as CIPSO and
+ * RIPSO.  The functions defined here are implemented in the
+ * net/netlabel/netlabel_kapi.c file.
+ *
+ */
+
+/* Domain mapping definition struct */
+struct netlbl_dom_map;
+
+/* Domain mapping operations */
+int netlbl_domhsh_remove(const char *domain);
+
+/* LSM security attributes */
+struct netlbl_lsm_cache {
+	void (*free) (const void *data);
+	void *data;
+};
+struct netlbl_lsm_secattr {
+	char *domain;
+
+	u32 mls_lvl;
+	u32 mls_lvl_vld;
+	unsigned char *mls_cat;
+	size_t mls_cat_len;
+
+	struct netlbl_lsm_cache cache;
+};
+
+/*
+ * LSM security attribute operations
+ */
+
+
+/**
+ * netlbl_secattr_init - Initialize a netlbl_lsm_secattr struct
+ * @secattr: the struct to initialize
+ *
+ * Description:
+ * Initialize an already allocated netlbl_lsm_secattr struct.  Returns zero on
+ * success, negative values on error.
+ *
+ */
+static inline int netlbl_secattr_init(struct netlbl_lsm_secattr *secattr)
+{
+	memset(secattr, 0, sizeof(*secattr));
+	return 0;
+}
+
+/**
+ * netlbl_secattr_destroy - Clears a netlbl_lsm_secattr struct
+ * @secattr: the struct to clear
+ * @clear_cache: cache clear flag
+ *
+ * Description:
+ * Destroys the @secattr struct, including freeing all of the internal buffers.
+ * If @clear_cache is true then free the cache fields, otherwise leave them
+ * intact.  The struct must be reset with a call to netlbl_secattr_init()
+ * before reuse.
+ *
+ */
+static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr,
+					  u32 clear_cache)
+{
+	if (clear_cache && secattr->cache.data != NULL && secattr->cache.free)
+		secattr->cache.free(secattr->cache.data);
+	kfree(secattr->domain);
+	kfree(secattr->mls_cat);
+}
+
+/**
+ * netlbl_secattr_alloc - Allocate and initialize a netlbl_lsm_secattr struct
+ * @flags: the memory allocation flags
+ *
+ * Description:
+ * Allocate and initialize a netlbl_lsm_secattr struct.  Returns a valid
+ * pointer on success, or NULL on failure.
+ *
+ */
+static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(int flags)
+{
+	return kzalloc(sizeof(struct netlbl_lsm_secattr), flags);
+}
+
+/**
+ * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct
+ * @secattr: the struct to free
+ * @clear_cache: cache clear flag
+ *
+ * Description:
+ * Frees @secattr including all of the internal buffers.  If @clear_cache is
+ * true then free the cache fields, otherwise leave them intact.
+ *
+ */
+static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr,
+				       u32 clear_cache)
+{
+	netlbl_secattr_destroy(secattr, clear_cache);
+	kfree(secattr);
+}
+
+/*
+ * LSM protocol operations
+ */
+
+#ifdef CONFIG_NETLABEL
+int netlbl_socket_setattr(const struct socket *sock,
+			  const struct netlbl_lsm_secattr *secattr);
+int netlbl_socket_getattr(const struct socket *sock,
+			  struct netlbl_lsm_secattr *secattr);
+int netlbl_skbuff_getattr(const struct sk_buff *skb,
+			  struct netlbl_lsm_secattr *secattr);
+void netlbl_skbuff_err(struct sk_buff *skb, int error);
+#else
+static inline int netlbl_socket_setattr(const struct socket *sock,
+				     const struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline int netlbl_socket_getattr(const struct socket *sock,
+					struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline int netlbl_skbuff_getattr(const struct sk_buff *skb,
+					struct netlbl_lsm_secattr *secattr)
+{
+	return -ENOSYS;
+}
+
+static inline void netlbl_skbuff_err(struct sk_buff *skb, int error)
+{
+	return;
+}
+#endif /* CONFIG_NETLABEL */
+
+/*
+ * LSM label mapping cache operations
+ */
+
+#ifdef CONFIG_NETLABEL
+void netlbl_cache_invalidate(void);
+int netlbl_cache_add(const struct sk_buff *skb,
+		     const struct netlbl_lsm_secattr *secattr);
+#else
+static inline void netlbl_cache_invalidate(void)
+{
+	return;
+}
+
+static inline int netlbl_cache_add(const struct sk_buff *skb,
+				   const struct netlbl_lsm_secattr *secattr)
+{
+	return 0;
+}
+#endif /* CONFIG_NETLABEL */
+
+#endif /* _NETLABEL_H */
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 2b98943e6b0..008e69d2e42 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -35,7 +35,7 @@ static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
 		switch (*optptr) {
 		case IPOPT_SEC:
 		case 0x85:	/* Some "Extended Security" crap. */
-		case 0x86:	/* Another "Commercial Security" crap. */
+		case IPOPT_CIPSO:
 		case IPOPT_RA:
 		case 0x80|21:	/* RFC1770 */
 			break;
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 406056edc02..e0a93b4fa8c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -24,6 +24,7 @@
 #include <net/ip.h>
 #include <net/icmp.h>
 #include <net/route.h>
+#include <net/cipso_ipv4.h>
 
 /* 
  * Write options to IP header, record destination address to
@@ -194,6 +195,13 @@ int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb)
 			dopt->is_strictroute = sopt->is_strictroute;
 		}
 	}
+	if (sopt->cipso) {
+		optlen  = sptr[sopt->cipso+1];
+		dopt->cipso = dopt->optlen+sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->cipso, optlen);
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
 	while (dopt->optlen & 3) {
 		*dptr++ = IPOPT_END;
 		dopt->optlen++;
@@ -434,6 +442,17 @@ int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
 			if (optptr[2] == 0 && optptr[3] == 0)
 				opt->router_alert = optptr - iph;
 			break;
+		      case IPOPT_CIPSO:
+		        if (opt->cipso) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			opt->cipso = optptr - iph;
+		        if (cipso_v4_validate(&optptr)) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
 		      case IPOPT_SEC:
 		      case IPOPT_SID:
 		      default:
-- 
cgit v1.2.3-70-g09d2


From 446fda4f26822b2d42ab3396aafcedf38a9ff2b6 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Thu, 3 Aug 2006 16:48:06 -0700
Subject: [NetLabel]: CIPSOv4 engine

Add support for the Commercial IP Security Option (CIPSO) to the IPv4
network stack.  CIPSO has become a de-facto standard for
trusted/labeled networking amongst existing Trusted Operating Systems
such as Trusted Solaris, HP-UX CMW, etc.  This implementation is
designed to be used with the NetLabel subsystem to provide explicit
packet labeling to LSM developers.

The CIPSO/IPv4 packet labeling works by the LSM calling a NetLabel API
function which attaches a CIPSO label (IPv4 option) to a given socket;
this in turn attaches the CIPSO label to every packet leaving the
socket without any extra processing on the outbound side.  On the
inbound side the individual packet's sk_buff is examined through a
call to a NetLabel API function to determine if a CIPSO/IPv4 label is
present and if so the security attributes of the CIPSO label are
returned to the caller of the NetLabel API function.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |    4 +
 net/ipv4/Makefile          |    1 +
 net/ipv4/cipso_ipv4.c      | 1607 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/sysctl_net_ipv4.c |   35 +
 4 files changed, 1647 insertions(+)
 create mode 100644 net/ipv4/cipso_ipv4.c

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index e4b1a4d4dcf..af61d923540 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -411,6 +411,10 @@ enum
 	NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
 	NET_TCP_DMA_COPYBREAK=116,
 	NET_TCP_SLOW_START_AFTER_IDLE=117,
+	NET_CIPSOV4_CACHE_ENABLE=118,
+	NET_CIPSOV4_CACHE_BUCKET_SIZE=119,
+	NET_CIPSOV4_RBM_OPTFMT=120,
+	NET_CIPSOV4_RBM_STRICTVALID=121,
 };
 
 enum {
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4878fc5be85..f66049e28ae 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
 obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 00000000000..b82a101c95c
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,1607 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188, copies of both documents can be found in the Documentation
+ * directory.  While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * Author: Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/cipso_ipv4.h>
+#include <asm/bug.h>
+
+struct cipso_v4_domhsh_entry {
+	char *domain;
+	u32 valid;
+	struct list_head list;
+	struct rcu_head rcu;
+};
+
+/* List of available DOI definitions */
+/* XXX - Updates should be minimal so having a single lock for the
+ * cipso_v4_doi_list and the cipso_v4_doi_list->dom_list should be
+ * okay. */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list);
+
+/* Label mapping cache */
+int cipso_v4_cache_enabled = 1;
+int cipso_v4_cache_bucketsize = 10;
+#define CIPSO_V4_CACHE_BUCKETBITS     7
+#define CIPSO_V4_CACHE_BUCKETS        (1 << CIPSO_V4_CACHE_BUCKETBITS)
+#define CIPSO_V4_CACHE_REORDERLIMIT   10
+struct cipso_v4_map_cache_bkt {
+	spinlock_t lock;
+	u32 size;
+	struct list_head list;
+};
+struct cipso_v4_map_cache_entry {
+	u32 hash;
+	unsigned char *key;
+	size_t key_len;
+
+	struct netlbl_lsm_cache lsm_data;
+
+	u32 activity;
+	struct list_head list;
+};
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL;
+
+/* Restricted bitmap (tag #1) flags */
+int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_strictvalid = 1;
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end.  Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
+				u32 bitmap_len,
+				u32 offset,
+				u8 state)
+{
+	u32 bit_spot;
+	u32 byte_offset;
+	unsigned char bitmask;
+	unsigned char byte;
+
+	/* gcc always rounds to zero when doing integer division */
+	byte_offset = offset / 8;
+	byte = bitmap[byte_offset];
+	bit_spot = offset;
+	bitmask = 0x80 >> (offset % 8);
+
+	while (bit_spot < bitmap_len) {
+		if ((state && (byte & bitmask) == bitmask) ||
+		    (state == 0 && (byte & bitmask) == 0))
+			return bit_spot;
+
+		bit_spot++;
+		bitmask >>= 1;
+		if (bitmask == 0) {
+			byte = bitmap[++byte_offset];
+			bitmask = 0x80;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask.  Returns zero on success, negative values
+ * on error.
+ */
+static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
+				   u32 bit,
+				   u8 state)
+{
+	u32 byte_spot;
+	u8 bitmask;
+
+	/* gcc always rounds to zero when doing integer division */
+	byte_spot = bit / 8;
+	bitmask = 0x80 >> (bit % 8);
+	if (state)
+		bitmap[byte_spot] |= bitmask;
+	else
+		bitmap[byte_spot] &= ~bitmask;
+}
+
+/**
+ * cipso_v4_doi_domhsh_free - Frees a domain list entry
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to a domain list entry can be released
+ * safely.
+ *
+ */
+static void cipso_v4_doi_domhsh_free(struct rcu_head *entry)
+{
+	struct cipso_v4_domhsh_entry *ptr;
+
+	ptr = container_of(entry, struct cipso_v4_domhsh_entry, rcu);
+	kfree(ptr->domain);
+	kfree(ptr);
+}
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+	if (entry->lsm_data.free)
+		entry->lsm_data.free(entry->lsm_data.data);
+	kfree(entry->key);
+	kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function.  Returns a 32-bit hash value.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Initializes the CIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file.  Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int cipso_v4_cache_init(void)
+{
+	u32 iter;
+
+	cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
+				 sizeof(struct cipso_v4_map_cache_bkt),
+				 GFP_KERNEL);
+	if (cipso_v4_cache == NULL)
+		return -ENOMEM;
+
+	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+		spin_lock_init(&cipso_v4_cache[iter].lock);
+		cipso_v4_cache[iter].size = 0;
+		INIT_LIST_HEAD(&cipso_v4_cache[iter].list);
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+	struct cipso_v4_map_cache_entry *entry, *tmp_entry;
+	u32 iter;
+
+	for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+		spin_lock(&cipso_v4_cache[iter].lock);
+		list_for_each_entry_safe(entry,
+					 tmp_entry,
+					 &cipso_v4_cache[iter].list, list) {
+			list_del(&entry->list);
+			cipso_v4_cache_entry_free(entry);
+		}
+		cipso_v4_cache[iter].size = 0;
+		spin_unlock(&cipso_v4_cache[iter].lock);
+	}
+
+	return;
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key.  If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes.  The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ *  1. The cache entry's activity counter is incremented
+ *  2. The previous (higher ranking) entry's activity counter is decremented
+ *  3. If the difference between the two activity counters is geater than
+ *     CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+				u32 key_len,
+				struct netlbl_lsm_secattr *secattr)
+{
+	u32 bkt;
+	struct cipso_v4_map_cache_entry *entry;
+	struct cipso_v4_map_cache_entry *prev_entry = NULL;
+	u32 hash;
+
+	if (!cipso_v4_cache_enabled)
+		return -ENOENT;
+
+	hash = cipso_v4_map_cache_hash(key, key_len);
+	bkt = hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+	spin_lock(&cipso_v4_cache[bkt].lock);
+	list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) {
+		if (entry->hash == hash &&
+		    entry->key_len == key_len &&
+		    memcmp(entry->key, key, key_len) == 0) {
+			entry->activity += 1;
+			secattr->cache.free = entry->lsm_data.free;
+			secattr->cache.data = entry->lsm_data.data;
+			if (prev_entry == NULL) {
+				spin_unlock(&cipso_v4_cache[bkt].lock);
+				return 0;
+			}
+
+			if (prev_entry->activity > 0)
+				prev_entry->activity -= 1;
+			if (entry->activity > prev_entry->activity &&
+			    entry->activity - prev_entry->activity >
+			    CIPSO_V4_CACHE_REORDERLIMIT) {
+				__list_del(entry->list.prev, entry->list.next);
+				__list_add(&entry->list,
+					   prev_entry->list.prev,
+					   &prev_entry->list);
+			}
+
+			spin_unlock(&cipso_v4_cache[bkt].lock);
+			return 0;
+		}
+		prev_entry = entry;
+	}
+	spin_unlock(&cipso_v4_cache[bkt].lock);
+
+	return -ENOENT;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache.  Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first.  It is important to note that there is
+ * currently no checking for duplicate keys.  Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int cipso_v4_cache_add(const struct sk_buff *skb,
+		       const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	u32 bkt;
+	struct cipso_v4_map_cache_entry *entry = NULL;
+	struct cipso_v4_map_cache_entry *old_entry = NULL;
+	unsigned char *cipso_ptr;
+	u32 cipso_ptr_len;
+
+	if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+		return 0;
+
+	cipso_ptr = CIPSO_V4_OPTPTR(skb);
+	cipso_ptr_len = cipso_ptr[1];
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->key = kmalloc(cipso_ptr_len, GFP_ATOMIC);
+	if (entry->key == NULL) {
+		ret_val = -ENOMEM;
+		goto cache_add_failure;
+	}
+	memcpy(entry->key, cipso_ptr, cipso_ptr_len);
+	entry->key_len = cipso_ptr_len;
+	entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
+	entry->lsm_data.free = secattr->cache.free;
+	entry->lsm_data.data = secattr->cache.data;
+
+	bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETBITS - 1);
+	spin_lock(&cipso_v4_cache[bkt].lock);
+	if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+		list_add(&entry->list, &cipso_v4_cache[bkt].list);
+		cipso_v4_cache[bkt].size += 1;
+	} else {
+		old_entry = list_entry(cipso_v4_cache[bkt].list.prev,
+				       struct cipso_v4_map_cache_entry, list);
+		list_del(&old_entry->list);
+		list_add(&entry->list, &cipso_v4_cache[bkt].list);
+		cipso_v4_cache_entry_free(old_entry);
+	}
+	spin_unlock(&cipso_v4_cache[bkt].lock);
+
+	return 0;
+
+cache_add_failure:
+	if (entry)
+		cipso_v4_cache_entry_free(entry);
+	return ret_val;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi.  The caller is responsibile for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+	struct cipso_v4_doi *iter;
+
+	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+		if (iter->doi == doi && iter->valid)
+			return iter;
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see cipso_ipv4.h for details).  Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL || doi_def->doi == CIPSO_V4_DOI_UNKNOWN)
+		return -EINVAL;
+
+	doi_def->valid = 1;
+	INIT_RCU_HEAD(&doi_def->rcu);
+	INIT_LIST_HEAD(&doi_def->dom_list);
+
+	rcu_read_lock();
+	if (cipso_v4_doi_search(doi_def->doi) != NULL)
+		goto doi_add_failure_rlock;
+	spin_lock(&cipso_v4_doi_list_lock);
+	if (cipso_v4_doi_search(doi_def->doi) != NULL)
+		goto doi_add_failure_slock;
+	list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+	rcu_read_unlock();
+
+	return 0;
+
+doi_add_failure_slock:
+	spin_unlock(&cipso_v4_doi_list_lock);
+doi_add_failure_rlock:
+	rcu_read_unlock();
+	return -EEXIST;
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @callback: the DOI cleanup/free callback
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine, @callback is called to
+ * free any memory.  The NetLabel routines will be called to release their own
+ * LSM domain mappings as well as our own domain list.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, void (*callback) (struct rcu_head * head))
+{
+	struct cipso_v4_doi *doi_def;
+	struct cipso_v4_domhsh_entry *dom_iter;
+
+	rcu_read_lock();
+	if (cipso_v4_doi_search(doi) != NULL) {
+		spin_lock(&cipso_v4_doi_list_lock);
+		doi_def = cipso_v4_doi_search(doi);
+		if (doi_def == NULL) {
+			spin_unlock(&cipso_v4_doi_list_lock);
+			rcu_read_unlock();
+			return -ENOENT;
+		}
+		doi_def->valid = 0;
+		list_del_rcu(&doi_def->list);
+		spin_unlock(&cipso_v4_doi_list_lock);
+		list_for_each_entry_rcu(dom_iter, &doi_def->dom_list, list)
+			if (dom_iter->valid)
+				netlbl_domhsh_remove(dom_iter->domain);
+		cipso_v4_cache_invalidate();
+		rcu_read_unlock();
+
+		call_rcu(&doi_def->rcu, callback);
+		return 0;
+	}
+	rcu_read_unlock();
+
+	return -ENOENT;
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a pointer to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller.  Otherwise NULL is returned.  The caller must ensure that
+ * rcu_read_lock() is held while accessing the returned definition.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+	return cipso_v4_doi_search(doi);
+}
+
+/**
+ * cipso_v4_doi_dump_all - Dump all the CIPSO DOI definitions into a sk_buff
+ * @headroom: the amount of headroom to allocate for the sk_buff
+ *
+ * Description:
+ * Dump a list of all the configured DOI values into a sk_buff.  The returned
+ * sk_buff has room at the front of the sk_buff for @headroom bytes.  See
+ * net/netlabel/netlabel_cipso_v4.h for the LISTALL message format.  This
+ * function may fail if another process is changing the DOI list at the same
+ * time.  Returns a pointer to a sk_buff on success, NULL on error.
+ *
+ */
+struct sk_buff *cipso_v4_doi_dump_all(size_t headroom)
+{
+	struct sk_buff *skb = NULL;
+	struct cipso_v4_doi *iter;
+	u32 doi_cnt = 0;
+	ssize_t buf_len;
+
+	buf_len = NETLBL_LEN_U32;
+	rcu_read_lock();
+	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+		if (iter->valid) {
+			doi_cnt += 1;
+			buf_len += 2 * NETLBL_LEN_U32;
+		}
+
+	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
+	if (skb == NULL)
+		goto doi_dump_all_failure;
+
+	if (nla_put_u32(skb, NLA_U32, doi_cnt) != 0)
+		goto doi_dump_all_failure;
+	buf_len -= NETLBL_LEN_U32;
+	list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list)
+		if (iter->valid) {
+			if (buf_len < 2 * NETLBL_LEN_U32)
+				goto doi_dump_all_failure;
+			if (nla_put_u32(skb, NLA_U32, iter->doi) != 0)
+				goto doi_dump_all_failure;
+			if (nla_put_u32(skb, NLA_U32, iter->type) != 0)
+				goto doi_dump_all_failure;
+			buf_len -= 2 * NETLBL_LEN_U32;
+		}
+	rcu_read_unlock();
+
+	return skb;
+
+doi_dump_all_failure:
+	rcu_read_unlock();
+	kfree(skb);
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_dump - Dump a CIPSO DOI definition into a sk_buff
+ * @doi: the DOI value
+ * @headroom: the amount of headroom to allocate for the sk_buff
+ *
+ * Description:
+ * Lookup the DOI definition matching @doi and dump it's contents into a
+ * sk_buff.  The returned sk_buff has room at the front of the sk_buff for
+ * @headroom bytes.  See net/netlabel/netlabel_cipso_v4.h for the LIST message
+ * format.  This function may fail if another process is changing the DOI list
+ * at the same time.  Returns a pointer to a sk_buff on success, NULL on error.
+ *
+ */
+struct sk_buff *cipso_v4_doi_dump(u32 doi, size_t headroom)
+{
+	struct sk_buff *skb = NULL;
+	struct cipso_v4_doi *iter;
+	u32 tag_cnt = 0;
+	u32 lvl_cnt = 0;
+	u32 cat_cnt = 0;
+	ssize_t buf_len;
+	ssize_t tmp;
+
+	rcu_read_lock();
+	iter = cipso_v4_doi_getdef(doi);
+	if (iter == NULL)
+		goto doi_dump_failure;
+	buf_len = NETLBL_LEN_U32;
+	switch (iter->type) {
+	case CIPSO_V4_MAP_PASS:
+		buf_len += NETLBL_LEN_U32;
+		while(tag_cnt < CIPSO_V4_TAG_MAXCNT &&
+		      iter->tags[tag_cnt] != CIPSO_V4_TAG_INVALID) {
+			tag_cnt += 1;
+			buf_len += NETLBL_LEN_U8;
+		}
+		break;
+	case CIPSO_V4_MAP_STD:
+		buf_len += 3 * NETLBL_LEN_U32;
+		while (tag_cnt < CIPSO_V4_TAG_MAXCNT &&
+		       iter->tags[tag_cnt] != CIPSO_V4_TAG_INVALID) {
+			tag_cnt += 1;
+			buf_len += NETLBL_LEN_U8;
+		}
+		for (tmp = 0; tmp < iter->map.std->lvl.local_size; tmp++)
+			if (iter->map.std->lvl.local[tmp] !=
+			    CIPSO_V4_INV_LVL) {
+				lvl_cnt += 1;
+				buf_len += NETLBL_LEN_U32 + NETLBL_LEN_U8;
+			}
+		for (tmp = 0; tmp < iter->map.std->cat.local_size; tmp++)
+			if (iter->map.std->cat.local[tmp] !=
+			    CIPSO_V4_INV_CAT) {
+				cat_cnt += 1;
+				buf_len += NETLBL_LEN_U32 + NETLBL_LEN_U16;
+			}
+		break;
+	}
+
+	skb = netlbl_netlink_alloc_skb(headroom, buf_len, GFP_ATOMIC);
+	if (skb == NULL)
+		goto doi_dump_failure;
+
+	if (nla_put_u32(skb, NLA_U32, iter->type) != 0)
+		goto doi_dump_failure;
+	buf_len -= NETLBL_LEN_U32;
+	if (iter != cipso_v4_doi_getdef(doi))
+		goto doi_dump_failure;
+	switch (iter->type) {
+	case CIPSO_V4_MAP_PASS:
+		if (nla_put_u32(skb, NLA_U32, tag_cnt) != 0)
+			goto doi_dump_failure;
+		buf_len -= NETLBL_LEN_U32;
+		for (tmp = 0;
+		     tmp < CIPSO_V4_TAG_MAXCNT &&
+			     iter->tags[tmp] != CIPSO_V4_TAG_INVALID;
+		     tmp++) {
+			if (buf_len < NETLBL_LEN_U8)
+				goto doi_dump_failure;
+			if (nla_put_u8(skb, NLA_U8, iter->tags[tmp]) != 0)
+				goto doi_dump_failure;
+			buf_len -= NETLBL_LEN_U8;
+		}
+		break;
+	case CIPSO_V4_MAP_STD:
+		if (nla_put_u32(skb, NLA_U32, tag_cnt) != 0)
+			goto doi_dump_failure;
+		if (nla_put_u32(skb, NLA_U32, lvl_cnt) != 0)
+			goto doi_dump_failure;
+		if (nla_put_u32(skb, NLA_U32, cat_cnt) != 0)
+			goto doi_dump_failure;
+		buf_len -= 3 * NETLBL_LEN_U32;
+		for (tmp = 0;
+		     tmp < CIPSO_V4_TAG_MAXCNT &&
+			     iter->tags[tmp] != CIPSO_V4_TAG_INVALID;
+		     tmp++) {
+			if (buf_len < NETLBL_LEN_U8)
+				goto doi_dump_failure;
+			if (nla_put_u8(skb, NLA_U8, iter->tags[tmp]) != 0)
+				goto doi_dump_failure;
+			buf_len -= NETLBL_LEN_U8;
+		}
+		for (tmp = 0; tmp < iter->map.std->lvl.local_size; tmp++)
+			if (iter->map.std->lvl.local[tmp] !=
+			    CIPSO_V4_INV_LVL) {
+				if (buf_len < NETLBL_LEN_U32 + NETLBL_LEN_U8)
+					goto doi_dump_failure;
+				if (nla_put_u32(skb, NLA_U32, tmp) != 0)
+					goto doi_dump_failure;
+				if (nla_put_u8(skb,
+					   NLA_U8,
+					   iter->map.std->lvl.local[tmp]) != 0)
+					goto doi_dump_failure;
+				buf_len -= NETLBL_LEN_U32 + NETLBL_LEN_U8;
+			}
+		for (tmp = 0; tmp < iter->map.std->cat.local_size; tmp++)
+			if (iter->map.std->cat.local[tmp] !=
+			    CIPSO_V4_INV_CAT) {
+				if (buf_len < NETLBL_LEN_U32 + NETLBL_LEN_U16)
+					goto doi_dump_failure;
+				if (nla_put_u32(skb, NLA_U32, tmp) != 0)
+					goto doi_dump_failure;
+				if (nla_put_u16(skb,
+					   NLA_U16,
+					   iter->map.std->cat.local[tmp]) != 0)
+					goto doi_dump_failure;
+				buf_len -= NETLBL_LEN_U32 + NETLBL_LEN_U16;
+			}
+		break;
+	}
+	rcu_read_unlock();
+
+	return skb;
+
+doi_dump_failure:
+	rcu_read_unlock();
+	kfree(skb);
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_domhsh_add - Adds a domain entry to a DOI definition
+ * @doi_def: the DOI definition
+ * @domain: the domain to add
+ *
+ * Description:
+ * Adds the @domain to the the DOI specified by @doi_def, this function
+ * should only be called by external functions (i.e. NetLabel).  This function
+ * does allocate memory.  Returns zero on success, negative values on failure.
+ *
+ */
+int cipso_v4_doi_domhsh_add(struct cipso_v4_doi *doi_def, const char *domain)
+{
+	struct cipso_v4_domhsh_entry *iter;
+	struct cipso_v4_domhsh_entry *new_dom;
+
+	new_dom = kzalloc(sizeof(*new_dom), GFP_KERNEL);
+	if (new_dom == NULL)
+		return -ENOMEM;
+	if (domain) {
+		new_dom->domain = kstrdup(domain, GFP_KERNEL);
+		if (new_dom->domain == NULL) {
+			kfree(new_dom);
+			return -ENOMEM;
+		}
+	}
+	new_dom->valid = 1;
+	INIT_RCU_HEAD(&new_dom->rcu);
+
+	rcu_read_lock();
+	spin_lock(&cipso_v4_doi_list_lock);
+	list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
+		if (iter->valid &&
+		    ((domain != NULL && iter->domain != NULL &&
+		      strcmp(iter->domain, domain) == 0) ||
+		     (domain == NULL && iter->domain == NULL))) {
+			spin_unlock(&cipso_v4_doi_list_lock);
+			rcu_read_unlock();
+			kfree(new_dom->domain);
+			kfree(new_dom);
+			return -EEXIST;
+		}
+	list_add_tail_rcu(&new_dom->list, &doi_def->dom_list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/**
+ * cipso_v4_doi_domhsh_remove - Removes a domain entry from a DOI definition
+ * @doi_def: the DOI definition
+ * @domain: the domain to remove
+ *
+ * Description:
+ * Removes the @domain from the DOI specified by @doi_def, this function
+ * should only be called by external functions (i.e. NetLabel).   Returns zero
+ * on success and negative values on error.
+ *
+ */
+int cipso_v4_doi_domhsh_remove(struct cipso_v4_doi *doi_def,
+			       const char *domain)
+{
+	struct cipso_v4_domhsh_entry *iter;
+
+	rcu_read_lock();
+	spin_lock(&cipso_v4_doi_list_lock);
+	list_for_each_entry_rcu(iter, &doi_def->dom_list, list)
+		if (iter->valid &&
+		    ((domain != NULL && iter->domain != NULL &&
+		      strcmp(iter->domain, domain) == 0) ||
+		     (domain == NULL && iter->domain == NULL))) {
+			iter->valid = 0;
+			list_del_rcu(&iter->list);
+			spin_unlock(&cipso_v4_doi_list_lock);
+			rcu_read_unlock();
+			call_rcu(&iter->rcu, cipso_v4_doi_domhsh_free);
+
+			return 0;
+		}
+	spin_unlock(&cipso_v4_doi_list_lock);
+	rcu_read_unlock();
+
+	return -ENOENT;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		return 0;
+	case CIPSO_V4_MAP_STD:
+		if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+			return 0;
+		break;
+	}
+
+	return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition.  Returns zero on success,
+ * negative values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+				 u32 host_lvl,
+				 u32 *net_lvl)
+{
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		*net_lvl = host_lvl;
+		return 0;
+	case CIPSO_V4_MAP_STD:
+		if (host_lvl < doi_def->map.std->lvl.local_size) {
+			*net_lvl = doi_def->map.std->lvl.local[host_lvl];
+			return 0;
+		}
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO level to the correct local MLS
+ * level using the given DOI definition.  Returns zero on success, negative
+ * values otherwise.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+				 u32 net_lvl,
+				 u32 *host_lvl)
+{
+	struct cipso_v4_std_map_tbl *map_tbl;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		*host_lvl = net_lvl;
+		return 0;
+	case CIPSO_V4_MAP_STD:
+		map_tbl = doi_def->map.std;
+		if (net_lvl < map_tbl->lvl.cipso_size &&
+		    map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) {
+			*host_lvl = doi_def->map.std->lvl.cipso[net_lvl];
+			return 0;
+		}
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *bitmap,
+				      u32 bitmap_len)
+{
+	int cat = -1;
+	u32 bitmap_len_bits = bitmap_len * 8;
+	u32 cipso_cat_size = doi_def->map.std->cat.cipso_size;
+	u32 *cipso_array = doi_def->map.std->cat.cipso;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		return 0;
+	case CIPSO_V4_MAP_STD:
+		for (;;) {
+			cat = cipso_v4_bitmap_walk(bitmap,
+						   bitmap_len_bits,
+						   cat + 1,
+						   1);
+			if (cat < 0)
+				break;
+			if (cat >= cipso_cat_size ||
+			    cipso_array[cat] >= CIPSO_V4_INV_CAT)
+				return -EFAULT;
+		}
+
+		if (cat == -1)
+			return 0;
+		break;
+	}
+
+	return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @host_cat: the category bitmap in host format
+ * @host_cat_len: the length of the host's category bitmap in bytes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *host_cat,
+				     u32 host_cat_len,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int host_spot = -1;
+	u32 net_spot;
+	u32 net_spot_max = 0;
+	u32 host_clen_bits = host_cat_len * 8;
+	u32 net_clen_bits = net_cat_len * 8;
+	u32 host_cat_size = doi_def->map.std->cat.local_size;
+	u32 *host_cat_array = doi_def->map.std->cat.local;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		net_spot_max = host_cat_len - 1;
+		while (net_spot_max > 0 && host_cat[net_spot_max] == 0)
+			net_spot_max--;
+		if (net_spot_max > net_cat_len)
+			return -EINVAL;
+		memcpy(net_cat, host_cat, net_spot_max);
+		return net_spot_max;
+	case CIPSO_V4_MAP_STD:
+		for (;;) {
+			host_spot = cipso_v4_bitmap_walk(host_cat,
+							 host_clen_bits,
+							 host_spot + 1,
+							 1);
+			if (host_spot < 0)
+				break;
+			if (host_spot >= host_cat_size)
+				return -EPERM;
+
+			net_spot = host_cat_array[host_spot];
+			if (net_spot >= net_clen_bits)
+				return -ENOSPC;
+			cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+
+			if (net_spot > net_spot_max)
+				net_spot_max = net_spot;
+		}
+
+		if (host_spot == -2)
+			return -EFAULT;
+
+		if (++net_spot_max % 8)
+			return net_spot_max / 8 + 1;
+		return net_spot_max / 8;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @host_cat: the zero'd out category bitmap in host format
+ * @host_cat_len: the length of the host's category bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the host bitmap on success, negative values otherwise.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     unsigned char *host_cat,
+				     u32 host_cat_len)
+{
+	u32 host_spot;
+	u32 host_spot_max = 0;
+	int net_spot = -1;
+	u32 net_clen_bits = net_cat_len * 8;
+	u32 host_clen_bits = host_cat_len * 8;
+	u32 net_cat_size = doi_def->map.std->cat.cipso_size;
+	u32 *net_cat_array = doi_def->map.std->cat.cipso;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		if (net_cat_len > host_cat_len)
+			return -EINVAL;
+		memcpy(host_cat, net_cat, net_cat_len);
+		return net_cat_len;
+	case CIPSO_V4_MAP_STD:
+		for (;;) {
+			net_spot = cipso_v4_bitmap_walk(net_cat,
+							net_clen_bits,
+							net_spot + 1,
+							1);
+			if (net_spot < 0)
+				break;
+			if (net_spot >= net_cat_size ||
+			    net_cat_array[net_spot] >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+
+			host_spot = net_cat_array[net_spot];
+			if (host_spot >= host_clen_bits)
+				return -ENOSPC;
+			cipso_v4_bitmap_setbit(host_cat, host_spot, 1);
+
+			if (host_spot > host_spot_max)
+				host_spot_max = host_spot;
+		}
+
+		if (net_spot == -2)
+			return -EFAULT;
+
+		if (++host_spot_max % 8)
+			return host_spot_max / 8 + 1;
+		return host_spot_max / 8;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+#define CIPSO_V4_HDR_LEN              6
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @len: the total tag length in bytes
+ * @buf: the CIPSO option buffer
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buffer.  Return zero on success,
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+			       u32 len,
+			       unsigned char *buf)
+{
+	if (CIPSO_V4_HDR_LEN + len > 40)
+		return -ENOSPC;
+
+	buf[0] = IPOPT_CIPSO;
+	buf[1] = CIPSO_V4_HDR_LEN + len;
+	*(u32 *)&buf[2] = htonl(doi_def->doi);
+
+	return 0;
+}
+
+#define CIPSO_V4_TAG1_CAT_LEN         30
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1.  The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char **buffer,
+			       u32 *buffer_len)
+{
+	int ret_val = -EPERM;
+	unsigned char *buf = NULL;
+	u32 buf_len;
+	u32 level;
+
+	if (secattr->mls_cat) {
+		buf = kzalloc(CIPSO_V4_HDR_LEN + 4 + CIPSO_V4_TAG1_CAT_LEN,
+			      GFP_ATOMIC);
+		if (buf == NULL)
+			return -ENOMEM;
+
+		ret_val = cipso_v4_map_cat_rbm_hton(doi_def,
+						    secattr->mls_cat,
+						    secattr->mls_cat_len,
+						    &buf[CIPSO_V4_HDR_LEN + 4],
+						    CIPSO_V4_TAG1_CAT_LEN);
+		if (ret_val < 0)
+			goto gentag_failure;
+
+		/* This will send packets using the "optimized" format when
+		 * possibile as specified in  section 3.4.2.6 of the
+		 * CIPSO draft. */
+		if (cipso_v4_rbm_optfmt && (ret_val > 0 && ret_val < 10))
+			ret_val = 10;
+
+		buf_len = 4 + ret_val;
+	} else {
+		buf = kzalloc(CIPSO_V4_HDR_LEN + 4, GFP_ATOMIC);
+		if (buf == NULL)
+			return -ENOMEM;
+		buf_len = 4;
+	}
+
+	ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->mls_lvl, &level);
+	if (ret_val != 0)
+		goto gentag_failure;
+
+	ret_val = cipso_v4_gentag_hdr(doi_def, buf_len, buf);
+	if (ret_val != 0)
+		goto gentag_failure;
+
+	buf[CIPSO_V4_HDR_LEN] = 0x01;
+	buf[CIPSO_V4_HDR_LEN + 1] = buf_len;
+	buf[CIPSO_V4_HDR_LEN + 3] = level;
+
+	*buffer = buf;
+	*buffer_len = CIPSO_V4_HDR_LEN + buf_len;
+
+	return 0;
+
+gentag_failure:
+	kfree(buf);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr.  Return zero on success, negatives values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u8 tag_len = tag[1];
+	u32 level;
+
+	ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (ret_val != 0)
+		return ret_val;
+	secattr->mls_lvl = level;
+	secattr->mls_lvl_vld = 1;
+
+	if (tag_len > 4) {
+		switch (doi_def->type) {
+		case CIPSO_V4_MAP_PASS:
+			secattr->mls_cat_len = tag_len - 4;
+			break;
+		case CIPSO_V4_MAP_STD:
+			secattr->mls_cat_len =
+				doi_def->map.std->cat.local_size;
+			break;
+		}
+		secattr->mls_cat = kzalloc(secattr->mls_cat_len, GFP_ATOMIC);
+		if (secattr->mls_cat == NULL)
+			return -ENOMEM;
+
+		ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def,
+						    &tag[4],
+						    tag_len - 4,
+						    secattr->mls_cat,
+						    secattr->mls_cat_len);
+		if (ret_val < 0) {
+			kfree(secattr->mls_cat);
+			return ret_val;
+		}
+		secattr->mls_cat_len = ret_val;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details.  If the option is valid then a zero value is returned and
+ * the value of @option is unchanged.  If the option is invalid then a
+ * non-zero value is returned and @option is adjusted to point to the
+ * offending portion of the option.  From the IETF draft ...
+ *
+ *  "If any field within the CIPSO options, such as the DOI identifier, is not
+ *   recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ *   (type 12) is generated and returned.  The ICMP code field is set to 'bad
+ *   parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ *   that is unrecognized."
+ *
+ */
+int cipso_v4_validate(unsigned char **option)
+{
+	unsigned char *opt = *option;
+	unsigned char *tag;
+	unsigned char opt_iter;
+	unsigned char err_offset = 0;
+	u8 opt_len;
+	u8 tag_len;
+	struct cipso_v4_doi *doi_def = NULL;
+	u32 tag_iter;
+
+	/* caller already checks for length values that are too large */
+	opt_len = opt[1];
+	if (opt_len < 8) {
+		err_offset = 1;
+		goto validate_return;
+	}
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_getdef(ntohl(*((u32 *)&opt[2])));
+	if (doi_def == NULL) {
+		err_offset = 2;
+		goto validate_return_locked;
+	}
+
+	opt_iter = 6;
+	tag = opt + opt_iter;
+	while (opt_iter < opt_len) {
+		for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+			if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+			    ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+
+		tag_len = tag[1];
+		if (tag_len > (opt_len - opt_iter)) {
+			err_offset = opt_iter + 1;
+			goto validate_return_locked;
+		}
+
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			if (tag_len < 4) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			/* We are already going to do all the verification
+			 * necessary at the socket layer so from our point of
+			 * view it is safe to turn these checks off (and less
+			 * work), however, the CIPSO draft says we should do
+			 * all the CIPSO validations here but it doesn't
+			 * really specify _exactly_ what we need to validate
+			 * ... so, just make it a sysctl tunable. */
+			if (cipso_v4_rbm_strictvalid) {
+				if (cipso_v4_map_lvl_valid(doi_def,
+							   tag[3]) < 0) {
+					err_offset = opt_iter + 3;
+					goto validate_return_locked;
+				}
+				if (tag_len > 4 &&
+				    cipso_v4_map_cat_rbm_valid(doi_def,
+							    &tag[4],
+							    tag_len - 4) < 0) {
+					err_offset = opt_iter + 4;
+					goto validate_return_locked;
+				}
+			}
+			break;
+		default:
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+
+		tag += tag_len;
+		opt_iter += tag_len;
+	}
+
+validate_return_locked:
+	rcu_read_unlock();
+validate_return:
+	*option = opt + err_offset;
+	return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct reponse for a bad packet
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host.  From the IETF draft ...
+ *
+ *  "If the contents of the CIPSO [option] are valid but the security label is
+ *   outside of the configured host or port label range, the datagram is
+ *   discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ *   returned.  The code field of the ICMP is set to 'communication with
+ *   destination network administratively prohibited' (code 9) or to
+ *   'communication with destination host administratively prohibited'
+ *   (code 10).  The value of the code is dependent on whether the originator
+ *   of the ICMP message is acting as a CIPSO host or a CIPSO gateway.  The
+ *   recipient of the ICMP message MUST be able to handle either value.  The
+ *   same procedure is performed if a CIPSO [option] can not be added to an
+ *   IP packet because it is too large to fit in the IP options area."
+ *
+ *  "If the error is triggered by receipt of an ICMP message, the message is
+ *   discarded and no response is permitted (consistent with general ICMP
+ *   processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+	if (skb->nh.iph->protocol == IPPROTO_ICMP || error != -EACCES)
+		return;
+
+	if (gateway)
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+	else
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+}
+
+/**
+ * cipso_v4_socket_setattr - Add a CIPSO option to a socket
+ * @sock: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function.  This function requires
+ * exclusive access to @sock->sk, which means it either needs to be in the
+ * process of being created or locked via lock_sock(sock->sk).  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int cipso_v4_socket_setattr(const struct socket *sock,
+			    const struct cipso_v4_doi *doi_def,
+			    const struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -EPERM;
+	u32 iter;
+	unsigned char *buf = NULL;
+	u32 buf_len = 0;
+	u32 opt_len;
+	struct ip_options *opt = NULL;
+	struct sock *sk;
+	struct inet_sock *sk_inet;
+	struct inet_connection_sock *sk_conn;
+
+	/* In the case of sock_create_lite(), the sock->sk field is not
+	 * defined yet but it is not a problem as the only users of these
+	 * "lite" PF_INET sockets are functions which do an accept() call
+	 * afterwards so we will label the socket as part of the accept(). */
+	sk = sock->sk;
+	if (sk == NULL)
+		return 0;
+
+	/* XXX - This code assumes only one tag per CIPSO option which isn't
+	 * really a good assumption to make but since we only support the MAC
+	 * tags right now it is a safe assumption. */
+	iter = 0;
+	do {
+		switch (doi_def->tags[iter]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			ret_val = cipso_v4_gentag_rbm(doi_def,
+						      secattr,
+						      &buf,
+						      &buf_len);
+			break;
+		default:
+			ret_val = -EPERM;
+			goto socket_setattr_failure;
+		}
+
+		iter++;
+	} while (ret_val != 0 &&
+		 iter < CIPSO_V4_TAG_MAXCNT &&
+		 doi_def->tags[iter] != CIPSO_V4_TAG_INVALID);
+	if (ret_val != 0)
+		goto socket_setattr_failure;
+
+	/* We can't use ip_options_get() directly because it makes a call to
+	 * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+	 * we can't block here. */
+	opt_len = (buf_len + 3) & ~3;
+	opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+	if (opt == NULL) {
+		ret_val = -ENOMEM;
+		goto socket_setattr_failure;
+	}
+	memcpy(opt->__data, buf, buf_len);
+	opt->optlen = opt_len;
+	opt->is_data = 1;
+	kfree(buf);
+	buf = NULL;
+	ret_val = ip_options_compile(opt, NULL);
+	if (ret_val != 0)
+		goto socket_setattr_failure;
+
+	sk_inet = inet_sk(sk);
+	if (sk_inet->is_icsk) {
+		sk_conn = inet_csk(sk);
+		if (sk_inet->opt)
+			sk_conn->icsk_ext_hdr_len -= sk_inet->opt->optlen;
+		sk_conn->icsk_ext_hdr_len += opt->optlen;
+		sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+	}
+	opt = xchg(&sk_inet->opt, opt);
+	kfree(opt);
+
+	return 0;
+
+socket_setattr_failure:
+	kfree(buf);
+	kfree(opt);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_socket_getattr - Get the security attributes from a socket
+ * @sock: the socket
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sock to see if there is a CIPSO option attached to the socket and if
+ * there is return the CIPSO security attributes in @secattr.  Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int cipso_v4_socket_getattr(const struct socket *sock,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	struct sock *sk;
+	struct inet_sock *sk_inet;
+	unsigned char *cipso_ptr;
+	u32 doi;
+	struct cipso_v4_doi *doi_def;
+
+	sk = sock->sk;
+	lock_sock(sk);
+	sk_inet = inet_sk(sk);
+	if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0)
+		goto socket_getattr_return;
+	cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso -
+		sizeof(struct iphdr);
+	ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr);
+	if (ret_val == 0)
+		goto socket_getattr_return;
+
+	doi = ntohl(*(u32 *)&cipso_ptr[2]);
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_getdef(doi);
+	if (doi_def == NULL) {
+		rcu_read_unlock();
+		goto socket_getattr_return;
+	}
+	switch (cipso_ptr[6]) {
+	case CIPSO_V4_TAG_RBITMAP:
+		ret_val = cipso_v4_parsetag_rbm(doi_def,
+						&cipso_ptr[6],
+						secattr);
+		break;
+	}
+	rcu_read_unlock();
+
+socket_getattr_return:
+	release_sock(sk);
+	return ret_val;
+}
+
+/**
+ * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse the given packet's CIPSO option and return the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val = -ENOMSG;
+	unsigned char *cipso_ptr;
+	u32 doi;
+	struct cipso_v4_doi *doi_def;
+
+	if (!CIPSO_V4_OPTEXIST(skb))
+		return -ENOMSG;
+	cipso_ptr = CIPSO_V4_OPTPTR(skb);
+	if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0)
+		return 0;
+
+	doi = ntohl(*(u32 *)&cipso_ptr[2]);
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_getdef(doi);
+	if (doi_def == NULL)
+		goto skbuff_getattr_return;
+	switch (cipso_ptr[6]) {
+	case CIPSO_V4_TAG_RBITMAP:
+		ret_val = cipso_v4_parsetag_rbm(doi_def,
+						&cipso_ptr[6],
+						secattr);
+		break;
+	}
+
+skbuff_getattr_return:
+	rcu_read_unlock();
+	return ret_val;
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use.  Returns zero on success
+ * and negative values on failure.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+	int ret_val;
+
+	ret_val = cipso_v4_cache_init();
+	if (ret_val != 0)
+		panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+		      ret_val);
+
+	return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 70cea9d08a3..19b2071ff31 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -17,6 +17,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/cipso_ipv4.h>
 
 /* From af_inet.c */
 extern int sysctl_ip_nonlocal_bind;
@@ -697,6 +698,40 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+#ifdef CONFIG_NETLABEL
+	{
+		.ctl_name	= NET_CIPSOV4_CACHE_ENABLE,
+		.procname	= "cipso_cache_enable",
+		.data		= &cipso_v4_cache_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_CIPSOV4_CACHE_BUCKET_SIZE,
+		.procname	= "cipso_cache_bucket_size",
+		.data		= &cipso_v4_cache_bucketsize,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_CIPSOV4_RBM_OPTFMT,
+		.procname	= "cipso_rbm_optfmt",
+		.data		= &cipso_v4_rbm_optfmt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_CIPSOV4_RBM_STRICTVALID,
+		.procname	= "cipso_rbm_strictvalid",
+		.data		= &cipso_v4_rbm_strictvalid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif /* CONFIG_NETLABEL */
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-70-g09d2


From 7420ed23a4f77480b5b7b3245e5da30dd24b7575 Mon Sep 17 00:00:00 2001
From: Venkat Yekkirala <vyekkirala@TrustedCS.com>
Date: Fri, 4 Aug 2006 23:17:57 -0700
Subject: [NetLabel]: SELinux support

Add NetLabel support to the SELinux LSM and modify the
socket_post_create() LSM hook to return an error code.  The most
significant part of this patch is the addition of NetLabel hooks into
the following SELinux LSM hooks:

 * selinux_file_permission()
 * selinux_socket_sendmsg()
 * selinux_socket_post_create()
 * selinux_socket_sock_rcv_skb()
 * selinux_socket_getpeersec_stream()
 * selinux_socket_getpeersec_dgram()
 * selinux_sock_graft()
 * selinux_inet_conn_request()

The basic reasoning behind this patch is that outgoing packets are
"NetLabel'd" by labeling their socket and the NetLabel security
attributes are checked via the additional hook in
selinux_socket_sock_rcv_skb().  NetLabel itself is only a labeling
mechanism, similar to filesystem extended attributes, it is up to the
SELinux enforcement mechanism to perform the actual access checks.

In addition to the changes outlined above this patch also includes
some changes to the extended bitmap (ebitmap) and multi-level security
(mls) code to import and export SELinux TE/MLS attributes into and out
of NetLabel.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h                    |  25 +-
 net/socket.c                                |  13 +-
 security/dummy.c                            |   6 +-
 security/selinux/hooks.c                    |  56 +++-
 security/selinux/include/objsec.h           |   8 +
 security/selinux/include/selinux_netlabel.h | 125 +++++++
 security/selinux/ss/ebitmap.c               | 144 ++++++++
 security/selinux/ss/ebitmap.h               |   6 +
 security/selinux/ss/mls.c                   | 156 +++++++++
 security/selinux/ss/mls.h                   |  21 ++
 security/selinux/ss/services.c              | 488 ++++++++++++++++++++++++++++
 11 files changed, 1020 insertions(+), 28 deletions(-)
 create mode 100644 security/selinux/include/selinux_netlabel.h

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index bb4c80fdfe7..9f56fb8a4a6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1341,8 +1341,8 @@ struct security_operations {
 	int (*unix_may_send) (struct socket * sock, struct socket * other);
 
 	int (*socket_create) (int family, int type, int protocol, int kern);
-	void (*socket_post_create) (struct socket * sock, int family,
-				    int type, int protocol, int kern);
+	int (*socket_post_create) (struct socket * sock, int family,
+				   int type, int protocol, int kern);
 	int (*socket_bind) (struct socket * sock,
 			    struct sockaddr * address, int addrlen);
 	int (*socket_connect) (struct socket * sock,
@@ -2824,13 +2824,13 @@ static inline int security_socket_create (int family, int type,
 	return security_ops->socket_create(family, type, protocol, kern);
 }
 
-static inline void security_socket_post_create(struct socket * sock, 
-					       int family,
-					       int type, 
-					       int protocol, int kern)
+static inline int security_socket_post_create(struct socket * sock,
+					      int family,
+					      int type,
+					      int protocol, int kern)
 {
-	security_ops->socket_post_create(sock, family, type,
-					 protocol, kern);
+	return security_ops->socket_post_create(sock, family, type,
+						protocol, kern);
 }
 
 static inline int security_socket_bind(struct socket * sock, 
@@ -2982,11 +2982,12 @@ static inline int security_socket_create (int family, int type,
 	return 0;
 }
 
-static inline void security_socket_post_create(struct socket * sock, 
-					       int family,
-					       int type, 
-					       int protocol, int kern)
+static inline int security_socket_post_create(struct socket * sock,
+					      int family,
+					      int type,
+					      int protocol, int kern)
 {
+	return 0;
 }
 
 static inline int security_socket_bind(struct socket * sock, 
diff --git a/net/socket.c b/net/socket.c
index 6d261bf206f..6756e57e1ff 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -973,11 +973,18 @@ int sock_create_lite(int family, int type, int protocol, struct socket **res)
 		goto out;
 	}
 
-	security_socket_post_create(sock, family, type, protocol, 1);
 	sock->type = type;
+	err = security_socket_post_create(sock, family, type, protocol, 1);
+	if (err)
+		goto out_release;
+
 out:
 	*res = sock;
 	return err;
+out_release:
+	sock_release(sock);
+	sock = NULL;
+	goto out;
 }
 
 /* No kernel lock held - perfect */
@@ -1214,7 +1221,9 @@ static int __sock_create(int family, int type, int protocol, struct socket **res
 	 */
 	module_put(net_families[family]->owner);
 	*res = sock;
-	security_socket_post_create(sock, family, type, protocol, kern);
+	err = security_socket_post_create(sock, family, type, protocol, kern);
+	if (err)
+		goto out_release;
 
 out:
 	net_family_read_unlock();
diff --git a/security/dummy.c b/security/dummy.c
index 1c45f8e4aad..aeee7056550 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -709,10 +709,10 @@ static int dummy_socket_create (int family, int type,
 	return 0;
 }
 
-static void dummy_socket_post_create (struct socket *sock, int family, int type,
-				      int protocol, int kern)
+static int dummy_socket_post_create (struct socket *sock, int family, int type,
+				     int protocol, int kern)
 {
-	return;
+	return 0;
 }
 
 static int dummy_socket_bind (struct socket *sock, struct sockaddr *address,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 33028b3b19c..2a6bbb921e1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -12,6 +12,8 @@
  *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
  *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
  *                          <dgoeddel@trustedcs.com>
+ *  Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
+ *                     Paul Moore, <paul.moore@hp.com>
  *
  *	This program is free software; you can redistribute it and/or modify
  *	it under the terms of the GNU General Public License version 2,
@@ -74,6 +76,7 @@
 #include "objsec.h"
 #include "netif.h"
 #include "xfrm.h"
+#include "selinux_netlabel.h"
 
 #define XATTR_SELINUX_SUFFIX "selinux"
 #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX
@@ -2395,6 +2398,7 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t
 
 static int selinux_file_permission(struct file *file, int mask)
 {
+	int rc;
 	struct inode *inode = file->f_dentry->d_inode;
 
 	if (!mask) {
@@ -2406,8 +2410,12 @@ static int selinux_file_permission(struct file *file, int mask)
 	if ((file->f_flags & O_APPEND) && (mask & MAY_WRITE))
 		mask |= MAY_APPEND;
 
-	return file_has_perm(current, file,
-			     file_mask_to_av(inode->i_mode, mask));
+	rc = file_has_perm(current, file,
+			   file_mask_to_av(inode->i_mode, mask));
+	if (rc)
+		return rc;
+
+	return selinux_netlbl_inode_permission(inode, mask);
 }
 
 static int selinux_file_alloc_security(struct file *file)
@@ -3058,9 +3066,10 @@ out:
 	return err;
 }
 
-static void selinux_socket_post_create(struct socket *sock, int family,
-				       int type, int protocol, int kern)
+static int selinux_socket_post_create(struct socket *sock, int family,
+				      int type, int protocol, int kern)
 {
+	int err = 0;
 	struct inode_security_struct *isec;
 	struct task_security_struct *tsec;
 	struct sk_security_struct *sksec;
@@ -3077,9 +3086,12 @@ static void selinux_socket_post_create(struct socket *sock, int family,
 	if (sock->sk) {
 		sksec = sock->sk->sk_security;
 		sksec->sid = isec->sid;
+		err = selinux_netlbl_socket_post_create(sock,
+							family,
+							isec->sid);
 	}
 
-	return;
+	return err;
 }
 
 /* Range of port numbers used to automatically bind.
@@ -3260,7 +3272,13 @@ static int selinux_socket_accept(struct socket *sock, struct socket *newsock)
 static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg,
  				  int size)
 {
-	return socket_has_perm(current, sock, SOCKET__WRITE);
+	int rc;
+
+	rc = socket_has_perm(current, sock, SOCKET__WRITE);
+	if (rc)
+		return rc;
+
+	return selinux_netlbl_inode_permission(SOCK_INODE(sock), MAY_WRITE);
 }
 
 static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -3468,6 +3486,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
+	err = selinux_netlbl_sock_rcv_skb(sksec, skb, &ad);
+	if (err)
+		goto out;
+
 	err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
 out:	
 	return err;
@@ -3491,8 +3513,9 @@ static int selinux_socket_getpeersec_stream(struct socket *sock, char __user *op
 		peer_sid = ssec->peer_sid;
 	}
 	else if (isec->sclass == SECCLASS_TCP_SOCKET) {
-		peer_sid = selinux_socket_getpeer_stream(sock->sk);
-
+		peer_sid = selinux_netlbl_socket_getpeersec_stream(sock);
+		if (peer_sid == SECSID_NULL)
+			peer_sid = selinux_socket_getpeer_stream(sock->sk);
 		if (peer_sid == SECSID_NULL) {
 			err = -ENOPROTOOPT;
 			goto out;
@@ -3532,8 +3555,11 @@ static int selinux_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *
 
 	if (sock && (sock->sk->sk_family == PF_UNIX))
 		selinux_get_inode_sid(SOCK_INODE(sock), &peer_secid);
-	else if (skb)
-		peer_secid = selinux_socket_getpeer_dgram(skb);
+	else if (skb) {
+		peer_secid = selinux_netlbl_socket_getpeersec_dgram(skb);
+		if (peer_secid == SECSID_NULL)
+			peer_secid = selinux_socket_getpeer_dgram(skb);
+	}
 
 	if (peer_secid == SECSID_NULL)
 		err = -EINVAL;
@@ -3578,6 +3604,8 @@ void selinux_sock_graft(struct sock* sk, struct socket *parent)
 	struct sk_security_struct *sksec = sk->sk_security;
 
 	isec->sid = sksec->sid;
+
+	selinux_netlbl_sock_graft(sk, parent);
 }
 
 int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
@@ -3585,9 +3613,15 @@ int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 {
 	struct sk_security_struct *sksec = sk->sk_security;
 	int err;
-	u32 newsid = 0;
+	u32 newsid;
 	u32 peersid;
 
+	newsid = selinux_netlbl_inet_conn_request(skb, sksec->sid);
+	if (newsid != SECSID_NULL) {
+		req->secid = newsid;
+		return 0;
+	}
+
 	err = selinux_xfrm_decode_session(skb, &peersid, 0);
 	BUG_ON(err);
 
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 79b9e0af19a..0a39bfd1319 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -101,6 +101,14 @@ struct sk_security_struct {
 	struct sock *sk;		/* back pointer to sk object */
 	u32 sid;			/* SID of this object */
 	u32 peer_sid;			/* SID of peer */
+#ifdef CONFIG_NETLABEL
+	u16 sclass;			/* sock security class */
+	enum {				/* NetLabel state */
+		NLBL_UNSET = 0,
+		NLBL_REQUIRE,
+		NLBL_LABELED,
+	} nlbl_state;
+#endif
 };
 
 struct key_security_struct {
diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/selinux_netlabel.h
new file mode 100644
index 00000000000..88c463eef1e
--- /dev/null
+++ b/security/selinux/include/selinux_netlabel.h
@@ -0,0 +1,125 @@
+/*
+ * SELinux interface to the NetLabel subsystem
+ *
+ * Author : Paul Moore <paul.moore@hp.com>
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _SELINUX_NETLABEL_H_
+#define _SELINUX_NETLABEL_H_
+
+#ifdef CONFIG_NETLABEL
+void selinux_netlbl_cache_invalidate(void);
+int selinux_netlbl_socket_post_create(struct socket *sock,
+				      int sock_family,
+				      u32 sid);
+void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock);
+u32 selinux_netlbl_inet_conn_request(struct sk_buff *skb, u32 sock_sid);
+int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
+				struct sk_buff *skb,
+				struct avc_audit_data *ad);
+u32 selinux_netlbl_socket_getpeersec_stream(struct socket *sock);
+u32 selinux_netlbl_socket_getpeersec_dgram(struct sk_buff *skb);
+
+int __selinux_netlbl_inode_permission(struct inode *inode, int mask);
+/**
+ * selinux_netlbl_inode_permission - Verify the socket is NetLabel labeled
+ * @inode: the file descriptor's inode
+ * @mask: the permission mask
+ *
+ * Description:
+ * Looks at a file's inode and if it is marked as a socket protected by
+ * NetLabel then verify that the socket has been labeled, if not try to label
+ * the socket now with the inode's SID.  Returns zero on success, negative
+ * values on failure.
+ *
+ */
+static inline int selinux_netlbl_inode_permission(struct inode *inode,
+						  int mask)
+{
+	int rc = 0;
+	struct inode_security_struct *isec;
+	struct sk_security_struct *sksec;
+
+	if (!S_ISSOCK(inode->i_mode))
+		return 0;
+
+	isec = inode->i_security;
+	sksec = SOCKET_I(inode)->sk->sk_security;
+	down(&isec->sem);
+	if (unlikely(sksec->nlbl_state == NLBL_REQUIRE &&
+		     (mask & (MAY_WRITE | MAY_APPEND))))
+		rc = __selinux_netlbl_inode_permission(inode, mask);
+	up(&isec->sem);
+
+	return rc;
+}
+#else
+static inline void selinux_netlbl_cache_invalidate(void)
+{
+	return;
+}
+
+static inline int selinux_netlbl_socket_post_create(struct socket *sock,
+						    int sock_family,
+						    u32 sid)
+{
+	return 0;
+}
+
+static inline void selinux_netlbl_sock_graft(struct sock *sk,
+					     struct socket *sock)
+{
+	return;
+}
+
+static inline u32 selinux_netlbl_inet_conn_request(struct sk_buff *skb,
+						   u32 sock_sid)
+{
+	return SECSID_NULL;
+}
+
+static inline int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
+					      struct sk_buff *skb,
+					      struct avc_audit_data *ad)
+{
+	return 0;
+}
+
+static inline u32 selinux_netlbl_socket_getpeersec_stream(struct socket *sock)
+{
+	return SECSID_NULL;
+}
+
+static inline u32 selinux_netlbl_socket_getpeersec_dgram(struct sk_buff *skb)
+{
+	return SECSID_NULL;
+}
+
+static inline int selinux_netlbl_inode_permission(struct inode *inode,
+						  int mask)
+{
+	return 0;
+}
+#endif /* CONFIG_NETLABEL */
+
+#endif
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index 47024a6e184..4b915eb60c4 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -3,6 +3,14 @@
  *
  * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
  */
+/*
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added ebitmap_export() and ebitmap_import()
+ *
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ */
+
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
@@ -59,6 +67,142 @@ int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src)
 	return 0;
 }
 
+/**
+ * ebitmap_export - Export an ebitmap to a unsigned char bitmap string
+ * @src: the ebitmap to export
+ * @dst: the resulting bitmap string
+ * @dst_len: length of dst in bytes
+ *
+ * Description:
+ * Allocate a buffer at least src->highbit bits long and export the extensible
+ * bitmap into the buffer.  The bitmap string will be in little endian format,
+ * i.e. LSB first.  The value returned in dst_len may not the true size of the
+ * buffer as the length of the buffer is rounded up to a multiple of MAPTYPE.
+ * The caller must free the buffer when finished. Returns zero on success,
+ * negative values on failure.
+ *
+ */
+int ebitmap_export(const struct ebitmap *src,
+		   unsigned char **dst,
+		   size_t *dst_len)
+{
+	size_t bitmap_len;
+	unsigned char *bitmap;
+	struct ebitmap_node *iter_node;
+	MAPTYPE node_val;
+	size_t bitmap_byte;
+	unsigned char bitmask;
+
+	bitmap_len = src->highbit / 8;
+	if (src->highbit % 7)
+		bitmap_len += 1;
+	if (bitmap_len == 0)
+		return -EINVAL;
+
+	bitmap = kzalloc((bitmap_len & ~(sizeof(MAPTYPE) - 1)) +
+			 sizeof(MAPTYPE),
+			 GFP_ATOMIC);
+	if (bitmap == NULL)
+		return -ENOMEM;
+
+	iter_node = src->node;
+	do {
+		bitmap_byte = iter_node->startbit / 8;
+		bitmask = 0x80;
+		node_val = iter_node->map;
+		do {
+			if (bitmask == 0) {
+				bitmap_byte++;
+				bitmask = 0x80;
+			}
+			if (node_val & (MAPTYPE)0x01)
+				bitmap[bitmap_byte] |= bitmask;
+			node_val >>= 1;
+			bitmask >>= 1;
+		} while (node_val > 0);
+		iter_node = iter_node->next;
+	} while (iter_node);
+
+	*dst = bitmap;
+	*dst_len = bitmap_len;
+	return 0;
+}
+
+/**
+ * ebitmap_import - Import an unsigned char bitmap string into an ebitmap
+ * @src: the bitmap string
+ * @src_len: the bitmap length in bytes
+ * @dst: the empty ebitmap
+ *
+ * Description:
+ * This function takes a little endian bitmap string in src and imports it into
+ * the ebitmap pointed to by dst.  Returns zero on success, negative values on
+ * failure.
+ *
+ */
+int ebitmap_import(const unsigned char *src,
+		   size_t src_len,
+		   struct ebitmap *dst)
+{
+	size_t src_off = 0;
+	struct ebitmap_node *node_new;
+	struct ebitmap_node *node_last = NULL;
+	size_t iter;
+	size_t iter_bit;
+	size_t iter_limit;
+	unsigned char src_byte;
+
+	do {
+		iter_limit = src_len - src_off;
+		if (iter_limit >= sizeof(MAPTYPE)) {
+			if (*(MAPTYPE *)&src[src_off] == 0) {
+				src_off += sizeof(MAPTYPE);
+				continue;
+			}
+			iter_limit = sizeof(MAPTYPE);
+		} else {
+			iter = src_off;
+			src_byte = 0;
+			do {
+				src_byte |= src[iter++];
+			} while (iter < src_len && src_byte == 0);
+			if (src_byte == 0)
+				break;
+		}
+
+		node_new = kzalloc(sizeof(*node_new), GFP_ATOMIC);
+		if (unlikely(node_new == NULL)) {
+			ebitmap_destroy(dst);
+			return -ENOMEM;
+		}
+		node_new->startbit = src_off * 8;
+		iter = 0;
+		do {
+			src_byte = src[src_off++];
+			iter_bit = iter++ * 8;
+			while (src_byte != 0) {
+				if (src_byte & 0x80)
+					node_new->map |= MAPBIT << iter_bit;
+				iter_bit++;
+				src_byte <<= 1;
+			}
+		} while (iter < iter_limit);
+
+		if (node_last != NULL)
+			node_last->next = node_new;
+		else
+			dst->node = node_new;
+		node_last = node_new;
+	} while (src_off < src_len);
+
+	if (likely(node_last != NULL))
+		dst->highbit = node_last->startbit + MAPSIZE;
+	else
+		ebitmap_init(dst);
+
+	return 0;
+}
+
 int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2)
 {
 	struct ebitmap_node *n1, *n2;
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h
index 8bf41055a6c..da2d4651b10 100644
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -69,6 +69,12 @@ static inline int ebitmap_node_get_bit(struct ebitmap_node * n,
 
 int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2);
 int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src);
+int ebitmap_export(const struct ebitmap *src,
+		   unsigned char **dst,
+		   size_t *dst_len);
+int ebitmap_import(const unsigned char *src,
+		   size_t src_len,
+		   struct ebitmap *dst);
 int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2);
 int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
 int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index e15f7e0399b..119bd6078ba 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -10,6 +10,13 @@
  *
  * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  */
+/*
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added support to import/export the MLS label
+ *
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ */
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -565,3 +572,152 @@ int mls_compute_sid(struct context *scontext,
 	return -EINVAL;
 }
 
+/**
+ * mls_export_lvl - Export the MLS sensitivity levels
+ * @context: the security context
+ * @low: the low sensitivity level
+ * @high: the high sensitivity level
+ *
+ * Description:
+ * Given the security context copy the low MLS sensitivity level into lvl_low
+ * and the high sensitivity level in lvl_high.  The MLS levels are only
+ * exported if the pointers are not NULL, if they are NULL then that level is
+ * not exported.
+ *
+ */
+void mls_export_lvl(const struct context *context, u32 *low, u32 *high)
+{
+	if (!selinux_mls_enabled)
+		return;
+
+	if (low != NULL)
+		*low = context->range.level[0].sens - 1;
+	if (high != NULL)
+		*high = context->range.level[1].sens - 1;
+}
+
+/**
+ * mls_import_lvl - Import the MLS sensitivity levels
+ * @context: the security context
+ * @low: the low sensitivity level
+ * @high: the high sensitivity level
+ *
+ * Description:
+ * Given the security context and the two sensitivty levels, set the MLS levels
+ * in the context according the two given as parameters.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+void mls_import_lvl(struct context *context, u32 low, u32 high)
+{
+	if (!selinux_mls_enabled)
+		return;
+
+	context->range.level[0].sens = low + 1;
+	context->range.level[1].sens = high + 1;
+}
+
+/**
+ * mls_export_cat - Export the MLS categories
+ * @context: the security context
+ * @low: the low category
+ * @low_len: length of the cat_low bitmap in bytes
+ * @high: the high category
+ * @high_len: length of the cat_high bitmap in bytes
+ *
+ * Description:
+ * Given the security context export the low MLS category bitmap into cat_low
+ * and the high category bitmap into cat_high.  The MLS categories are only
+ * exported if the pointers are not NULL, if they are NULL then that level is
+ * not exported.  The caller is responsibile for freeing the memory when
+ * finished.  Returns zero on success, negative values on failure.
+ *
+ */
+int mls_export_cat(const struct context *context,
+		   unsigned char **low,
+		   size_t *low_len,
+		   unsigned char **high,
+		   size_t *high_len)
+{
+	int rc = -EPERM;
+
+	if (!selinux_mls_enabled)
+		return 0;
+
+	if (low != NULL) {
+		rc = ebitmap_export(&context->range.level[0].cat,
+				    low,
+				    low_len);
+		if (rc != 0)
+			goto export_cat_failure;
+	}
+	if (high != NULL) {
+		rc = ebitmap_export(&context->range.level[1].cat,
+				    high,
+				    high_len);
+		if (rc != 0)
+			goto export_cat_failure;
+	}
+
+	return 0;
+
+export_cat_failure:
+	if (low != NULL)
+		kfree(*low);
+	if (high != NULL)
+		kfree(*high);
+	return rc;
+}
+
+/**
+ * mls_import_cat - Import the MLS categories
+ * @context: the security context
+ * @low: the low category
+ * @low_len: length of the cat_low bitmap in bytes
+ * @high: the high category
+ * @high_len: length of the cat_high bitmap in bytes
+ *
+ * Description:
+ * Given the security context and the two category bitmap strings import the
+ * categories into the security context.  The MLS categories are only imported
+ * if the pointers are not NULL, if they are NULL they are skipped.  Returns
+ * zero on success, negative values on failure.
+ *
+ */
+int mls_import_cat(struct context *context,
+		   const unsigned char *low,
+		   size_t low_len,
+		   const unsigned char *high,
+		   size_t high_len)
+{
+	int rc = -EPERM;
+
+	if (!selinux_mls_enabled)
+		return 0;
+
+	if (low != NULL) {
+		rc = ebitmap_import(low,
+				    low_len,
+				    &context->range.level[0].cat);
+		if (rc != 0)
+			goto import_cat_failure;
+	}
+	if (high != NULL) {
+		if (high == low)
+			rc = ebitmap_cpy(&context->range.level[1].cat,
+					 &context->range.level[0].cat);
+		else
+			rc = ebitmap_import(high,
+					    high_len,
+					    &context->range.level[1].cat);
+		if (rc != 0)
+			goto import_cat_failure;
+	}
+
+	return 0;
+
+import_cat_failure:
+	ebitmap_destroy(&context->range.level[0].cat);
+	ebitmap_destroy(&context->range.level[1].cat);
+	return rc;
+}
diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h
index 90c5e88987f..df6032c6d49 100644
--- a/security/selinux/ss/mls.h
+++ b/security/selinux/ss/mls.h
@@ -10,6 +10,13 @@
  *
  * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  */
+/*
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added support to import/export the MLS label
+ *
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ */
 
 #ifndef _SS_MLS_H_
 #define _SS_MLS_H_
@@ -62,5 +69,19 @@ int mls_compute_sid(struct context *scontext,
 int mls_setup_user_range(struct context *fromcon, struct user_datum *user,
                          struct context *usercon);
 
+void mls_export_lvl(const struct context *context, u32 *low, u32 *high);
+void mls_import_lvl(struct context *context, u32 low, u32 high);
+
+int mls_export_cat(const struct context *context,
+		   unsigned char **low,
+		   size_t *low_len,
+		   unsigned char **high,
+		   size_t *high_len);
+int mls_import_cat(struct context *context,
+		   const unsigned char *low,
+		   size_t low_len,
+		   const unsigned char *high,
+		   size_t high_len);
+
 #endif	/* _SS_MLS_H */
 
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index b00ec69f0ff..910afa1ffc3 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -13,6 +13,11 @@
  *
  * 	Added conditional policy language extensions
  *
+ * Updated: Hewlett-Packard <paul.moore@hp.com>
+ *
+ *      Added support for NetLabel
+ *
+ * Copyright (C) 2006 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  * Copyright (C) 2003 - 2004 Tresys Technology, LLC
  * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
@@ -29,6 +34,8 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/mutex.h>
+#include <net/sock.h>
+#include <net/netlabel.h>
 
 #include "flask.h"
 #include "avc.h"
@@ -40,6 +47,8 @@
 #include "services.h"
 #include "conditional.h"
 #include "mls.h"
+#include "objsec.h"
+#include "selinux_netlabel.h"
 
 extern void selnl_notify_policyload(u32 seqno);
 unsigned int policydb_loaded_version;
@@ -1241,6 +1250,7 @@ int security_load_policy(void *data, size_t len)
 		selinux_complete_init();
 		avc_ss_reset(seqno);
 		selnl_notify_policyload(seqno);
+		selinux_netlbl_cache_invalidate();
 		return 0;
 	}
 
@@ -1295,6 +1305,7 @@ int security_load_policy(void *data, size_t len)
 
 	avc_ss_reset(seqno);
 	selnl_notify_policyload(seqno);
+	selinux_netlbl_cache_invalidate();
 
 	return 0;
 
@@ -2133,3 +2144,480 @@ void selinux_audit_set_callback(int (*callback)(void))
 {
 	aurule_callback = callback;
 }
+
+#ifdef CONFIG_NETLABEL
+/*
+ * This is the structure we store inside the NetLabel cache block.
+ */
+#define NETLBL_CACHE(x)           ((struct netlbl_cache *)(x))
+#define NETLBL_CACHE_T_NONE       0
+#define NETLBL_CACHE_T_SID        1
+#define NETLBL_CACHE_T_MLS        2
+struct netlbl_cache {
+	u32 type;
+	union {
+		u32 sid;
+		struct mls_range mls_label;
+	} data;
+};
+
+/**
+ * selinux_netlbl_cache_free - Free the NetLabel cached data
+ * @data: the data to free
+ *
+ * Description:
+ * This function is intended to be used as the free() callback inside the
+ * netlbl_lsm_cache structure.
+ *
+ */
+static void selinux_netlbl_cache_free(const void *data)
+{
+	struct netlbl_cache *cache = NETLBL_CACHE(data);
+	switch (cache->type) {
+	case NETLBL_CACHE_T_MLS:
+		ebitmap_destroy(&cache->data.mls_label.level[0].cat);
+		break;
+	}
+	kfree(data);
+}
+
+/**
+ * selinux_netlbl_cache_add - Add an entry to the NetLabel cache
+ * @skb: the packet
+ * @ctx: the SELinux context
+ *
+ * Description:
+ * Attempt to cache the context in @ctx, which was derived from the packet in
+ * @skb, in the NetLabel subsystem cache.
+ *
+ */
+static void selinux_netlbl_cache_add(struct sk_buff *skb, struct context *ctx)
+{
+	struct netlbl_cache *cache = NULL;
+	struct netlbl_lsm_secattr secattr;
+
+	netlbl_secattr_init(&secattr);
+
+	cache = kzalloc(sizeof(*cache),	GFP_ATOMIC);
+	if (cache == NULL)
+		goto netlbl_cache_add_failure;
+	secattr.cache.free = selinux_netlbl_cache_free;
+	secattr.cache.data = (void *)cache;
+
+	cache->type = NETLBL_CACHE_T_MLS;
+	if (ebitmap_cpy(&cache->data.mls_label.level[0].cat,
+			&ctx->range.level[0].cat) != 0)
+		goto netlbl_cache_add_failure;
+	cache->data.mls_label.level[1].cat.highbit =
+		cache->data.mls_label.level[0].cat.highbit;
+	cache->data.mls_label.level[1].cat.node =
+		cache->data.mls_label.level[0].cat.node;
+	cache->data.mls_label.level[0].sens = ctx->range.level[0].sens;
+	cache->data.mls_label.level[1].sens = ctx->range.level[0].sens;
+
+	if (netlbl_cache_add(skb, &secattr) != 0)
+		goto netlbl_cache_add_failure;
+
+	return;
+
+netlbl_cache_add_failure:
+	netlbl_secattr_destroy(&secattr, 1);
+}
+
+/**
+ * selinux_netlbl_cache_invalidate - Invalidate the NetLabel cache
+ *
+ * Description:
+ * Invalidate the NetLabel security attribute mapping cache.
+ *
+ */
+void selinux_netlbl_cache_invalidate(void)
+{
+	netlbl_cache_invalidate();
+}
+
+/**
+ * selinux_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
+ * @skb: the network packet
+ * @secattr: the NetLabel packet security attributes
+ * @base_sid: the SELinux SID to use as a context for MLS only attributes
+ * @sid: the SELinux SID
+ *
+ * Description:
+ * Convert the given NetLabel packet security attributes in @secattr into a
+ * SELinux SID.  If the @secattr field does not contain a full SELinux
+ * SID/context then use the context in @base_sid as the foundation.  If @skb
+ * is not NULL attempt to cache as much data as possibile.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int selinux_netlbl_secattr_to_sid(struct sk_buff *skb,
+					 struct netlbl_lsm_secattr *secattr,
+					 u32 base_sid,
+					 u32 *sid)
+{
+	int rc = -EIDRM;
+	struct context *ctx;
+	struct context ctx_new;
+	struct netlbl_cache *cache;
+
+	POLICY_RDLOCK;
+
+	if (secattr->cache.data) {
+		cache = NETLBL_CACHE(secattr->cache.data);
+		switch (cache->type) {
+		case NETLBL_CACHE_T_SID:
+			*sid = cache->data.sid;
+			rc = 0;
+			break;
+		case NETLBL_CACHE_T_MLS:
+			ctx = sidtab_search(&sidtab, base_sid);
+			if (ctx == NULL)
+				goto netlbl_secattr_to_sid_return;
+
+			ctx_new.user = ctx->user;
+			ctx_new.role = ctx->role;
+			ctx_new.type = ctx->type;
+			ctx_new.range.level[0].sens =
+				cache->data.mls_label.level[0].sens;
+			ctx_new.range.level[0].cat.highbit =
+				cache->data.mls_label.level[0].cat.highbit;
+			ctx_new.range.level[0].cat.node =
+				cache->data.mls_label.level[0].cat.node;
+			ctx_new.range.level[1].sens =
+				cache->data.mls_label.level[1].sens;
+			ctx_new.range.level[1].cat.highbit =
+				cache->data.mls_label.level[1].cat.highbit;
+			ctx_new.range.level[1].cat.node =
+				cache->data.mls_label.level[1].cat.node;
+
+			rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
+			break;
+		default:
+			goto netlbl_secattr_to_sid_return;
+		}
+	} else if (secattr->mls_lvl_vld) {
+		ctx = sidtab_search(&sidtab, base_sid);
+		if (ctx == NULL)
+			goto netlbl_secattr_to_sid_return;
+
+		ctx_new.user = ctx->user;
+		ctx_new.role = ctx->role;
+		ctx_new.type = ctx->type;
+		mls_import_lvl(&ctx_new, secattr->mls_lvl, secattr->mls_lvl);
+		if (secattr->mls_cat) {
+			if (mls_import_cat(&ctx_new,
+					   secattr->mls_cat,
+					   secattr->mls_cat_len,
+					   NULL,
+					   0) != 0)
+				goto netlbl_secattr_to_sid_return;
+			ctx_new.range.level[1].cat.highbit =
+				ctx_new.range.level[0].cat.highbit;
+			ctx_new.range.level[1].cat.node =
+				ctx_new.range.level[0].cat.node;
+		} else {
+			ebitmap_init(&ctx_new.range.level[0].cat);
+			ebitmap_init(&ctx_new.range.level[1].cat);
+		}
+		if (mls_context_isvalid(&policydb, &ctx_new) != 1)
+			goto netlbl_secattr_to_sid_return_cleanup;
+
+		rc = sidtab_context_to_sid(&sidtab, &ctx_new, sid);
+		if (rc != 0)
+			goto netlbl_secattr_to_sid_return_cleanup;
+
+		if (skb != NULL)
+			selinux_netlbl_cache_add(skb, &ctx_new);
+		ebitmap_destroy(&ctx_new.range.level[0].cat);
+	} else {
+		*sid = SECINITSID_UNLABELED;
+		rc = 0;
+	}
+
+netlbl_secattr_to_sid_return:
+	POLICY_RDUNLOCK;
+	return rc;
+netlbl_secattr_to_sid_return_cleanup:
+	ebitmap_destroy(&ctx_new.range.level[0].cat);
+	goto netlbl_secattr_to_sid_return;
+}
+
+/**
+ * selinux_netlbl_skbuff_getsid - Get the sid of a packet using NetLabel
+ * @skb: the packet
+ * @base_sid: the SELinux SID to use as a context for MLS only attributes
+ * @sid: the SID
+ *
+ * Description:
+ * Call the NetLabel mechanism to get the security attributes of the given
+ * packet and use those attributes to determine the correct context/SID to
+ * assign to the packet.  Returns zero on success, negative values on failure.
+ *
+ */
+static int selinux_netlbl_skbuff_getsid(struct sk_buff *skb,
+					u32 base_sid,
+					u32 *sid)
+{
+	int rc;
+	struct netlbl_lsm_secattr secattr;
+
+	netlbl_secattr_init(&secattr);
+	rc = netlbl_skbuff_getattr(skb, &secattr);
+	if (rc == 0)
+		rc = selinux_netlbl_secattr_to_sid(skb,
+						   &secattr,
+						   base_sid,
+						   sid);
+	netlbl_secattr_destroy(&secattr, 0);
+
+	return rc;
+}
+
+/**
+ * selinux_netlbl_socket_setsid - Label a socket using the NetLabel mechanism
+ * @sock: the socket to label
+ * @sid: the SID to use
+ *
+ * Description:
+ * Attempt to label a socket using the NetLabel mechanism using the given
+ * SID.  Returns zero values on success, negative values on failure.
+ *
+ */
+static int selinux_netlbl_socket_setsid(struct socket *sock, u32 sid)
+{
+	int rc = -ENOENT;
+	struct sk_security_struct *sksec = sock->sk->sk_security;
+	struct netlbl_lsm_secattr secattr;
+	struct context *ctx;
+
+	if (!ss_initialized)
+		return 0;
+
+	POLICY_RDLOCK;
+
+	ctx = sidtab_search(&sidtab, sid);
+	if (ctx == NULL)
+		goto netlbl_socket_setsid_return;
+
+	netlbl_secattr_init(&secattr);
+	secattr.domain = kstrdup(policydb.p_type_val_to_name[ctx->type - 1],
+				 GFP_ATOMIC);
+	mls_export_lvl(ctx, &secattr.mls_lvl, NULL);
+	secattr.mls_lvl_vld = 1;
+	mls_export_cat(ctx,
+		       &secattr.mls_cat,
+		       &secattr.mls_cat_len,
+		       NULL,
+		       NULL);
+
+	rc = netlbl_socket_setattr(sock, &secattr);
+	if (rc == 0)
+		sksec->nlbl_state = NLBL_LABELED;
+
+	netlbl_secattr_destroy(&secattr, 0);
+
+netlbl_socket_setsid_return:
+	POLICY_RDUNLOCK;
+	return rc;
+}
+
+/**
+ * selinux_netlbl_socket_post_create - Label a socket using NetLabel
+ * @sock: the socket to label
+ * @sock_family: the socket family
+ * @sid: the SID to use
+ *
+ * Description:
+ * Attempt to label a socket using the NetLabel mechanism using the given
+ * SID.  Returns zero values on success, negative values on failure.
+ *
+ */
+int selinux_netlbl_socket_post_create(struct socket *sock,
+				      int sock_family,
+				      u32 sid)
+{
+	struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
+	struct sk_security_struct *sksec = sock->sk->sk_security;
+
+	if (sock_family != PF_INET)
+		return 0;
+
+	sksec->sclass = isec->sclass;
+	sksec->nlbl_state = NLBL_REQUIRE;
+	return selinux_netlbl_socket_setsid(sock, sid);
+}
+
+/**
+ * selinux_netlbl_sock_graft - Netlabel the new socket
+ * @sk: the new connection
+ * @sock: the new socket
+ *
+ * Description:
+ * The connection represented by @sk is being grafted onto @sock so set the
+ * socket's NetLabel to match the SID of @sk.
+ *
+ */
+void selinux_netlbl_sock_graft(struct sock *sk, struct socket *sock)
+{
+	struct inode_security_struct *isec = SOCK_INODE(sock)->i_security;
+	struct sk_security_struct *sksec = sk->sk_security;
+
+	if (sk->sk_family != PF_INET)
+		return;
+
+	sksec->nlbl_state = NLBL_REQUIRE;
+	sksec->peer_sid = sksec->sid;
+	sksec->sclass = isec->sclass;
+
+	/* Try to set the NetLabel on the socket to save time later, if we fail
+	 * here we will pick up the pieces in later calls to
+	 * selinux_netlbl_inode_permission(). */
+	selinux_netlbl_socket_setsid(sock, sksec->sid);
+}
+
+/**
+ * selinux_netlbl_inet_conn_request - Handle a new connection request
+ * @skb: the packet
+ * @sock_sid: the SID of the parent socket
+ *
+ * Description:
+ * If present, use the security attributes of the packet in @skb and the
+ * parent sock's SID to arrive at a SID for the new child sock.  Returns the
+ * SID of the connection or SECSID_NULL on failure.
+ *
+ */
+u32 selinux_netlbl_inet_conn_request(struct sk_buff *skb, u32 sock_sid)
+{
+	int rc;
+	u32 peer_sid;
+
+	rc = selinux_netlbl_skbuff_getsid(skb, sock_sid, &peer_sid);
+	if (rc != 0)
+		return SECSID_NULL;
+
+	if (peer_sid == SECINITSID_UNLABELED)
+		return SECSID_NULL;
+
+	return peer_sid;
+}
+
+/**
+ * __selinux_netlbl_inode_permission - Label a socket using NetLabel
+ * @inode: the file descriptor's inode
+ * @mask: the permission mask
+ *
+ * Description:
+ * Try to label a socket with the inode's SID using NetLabel.  Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int __selinux_netlbl_inode_permission(struct inode *inode, int mask)
+{
+	int rc;
+	struct socket *sock = SOCKET_I(inode);
+	struct sk_security_struct *sksec = sock->sk->sk_security;
+
+	lock_sock(sock->sk);
+	rc = selinux_netlbl_socket_setsid(sock, sksec->sid);
+	release_sock(sock->sk);
+
+	return rc;
+}
+
+/**
+ * selinux_netlbl_sock_rcv_skb - Do an inbound access check using NetLabel
+ * @sksec: the sock's sk_security_struct
+ * @skb: the packet
+ * @ad: the audit data
+ *
+ * Description:
+ * Fetch the NetLabel security attributes from @skb and perform an access check
+ * against the receiving socket.  Returns zero on success, negative values on
+ * error.
+ *
+ */
+int selinux_netlbl_sock_rcv_skb(struct sk_security_struct *sksec,
+				struct sk_buff *skb,
+				struct avc_audit_data *ad)
+{
+	int rc;
+	u32 netlbl_sid;
+	u32 recv_perm;
+
+	rc = selinux_netlbl_skbuff_getsid(skb, sksec->sid, &netlbl_sid);
+	if (rc != 0)
+		return rc;
+
+	if (netlbl_sid == SECINITSID_UNLABELED)
+		return 0;
+
+	switch (sksec->sclass) {
+	case SECCLASS_UDP_SOCKET:
+		recv_perm = UDP_SOCKET__RECV_MSG;
+		break;
+	case SECCLASS_TCP_SOCKET:
+		recv_perm = TCP_SOCKET__RECV_MSG;
+		break;
+	default:
+		recv_perm = RAWIP_SOCKET__RECV_MSG;
+	}
+
+	rc = avc_has_perm(sksec->sid,
+			  netlbl_sid,
+			  sksec->sclass,
+			  recv_perm,
+			  ad);
+	if (rc == 0)
+		return 0;
+
+	netlbl_skbuff_err(skb, rc);
+	return rc;
+}
+
+/**
+ * selinux_netlbl_socket_peersid - Return the peer SID of a connected socket
+ * @sock: the socket
+ *
+ * Description:
+ * Examine @sock to find the connected peer's SID.  Returns the SID on success
+ * or SECSID_NULL on error.
+ *
+ */
+u32 selinux_netlbl_socket_getpeersec_stream(struct socket *sock)
+{
+	struct sk_security_struct *sksec = sock->sk->sk_security;
+
+	if (sksec->peer_sid == SECINITSID_UNLABELED)
+		return SECSID_NULL;
+
+	return sksec->peer_sid;
+}
+
+/**
+ * selinux_netlbl_socket_getpeersec_dgram - Return the SID of a NetLabel packet
+ * @skb: the packet
+ *
+ * Description:
+ * Examine @skb to find the SID assigned to it by NetLabel.  Returns the SID on
+ * success, SECSID_NULL on error.
+ *
+ */
+u32 selinux_netlbl_socket_getpeersec_dgram(struct sk_buff *skb)
+{
+	int peer_sid;
+	struct sock *sk = skb->sk;
+	struct inode_security_struct *isec;
+
+	if (sk == NULL || sk->sk_socket == NULL)
+		return SECSID_NULL;
+
+	isec = SOCK_INODE(sk->sk_socket)->i_security;
+	if (selinux_netlbl_skbuff_getsid(skb, isec->sid, &peer_sid) != 0)
+		return SECSID_NULL;
+	if (peer_sid == SECINITSID_UNLABELED)
+		return SECSID_NULL;
+
+	return peer_sid;
+}
+#endif /* CONFIG_NETLABEL */
-- 
cgit v1.2.3-70-g09d2


From c71099acce933455123ee505cc75964610a209ad Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 23:20:06 -0700
Subject: [IPV6]: Multiple Routing Tables

Adds the framework to support multiple IPv6 routing tables.
Currently all automatically generated routes are put into the
same table. This could be changed at a later point after
considering the produced locking overhead.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |  39 ++++-
 include/net/ip6_route.h |   3 +-
 net/ipv6/Kconfig        |   6 +
 net/ipv6/addrconf.c     |   6 +-
 net/ipv6/ip6_fib.c      | 144 +++++++++++++++++-
 net/ipv6/route.c        | 380 ++++++++++++++++++++++++++++++++----------------
 6 files changed, 441 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a66e9de16a6..818411519c8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -51,6 +51,8 @@ struct rt6key
 	int		plen;
 };
 
+struct fib6_table;
+
 struct rt6_info
 {
 	union {
@@ -71,6 +73,7 @@ struct rt6_info
 	u32				rt6i_flags;
 	u32				rt6i_metric;
 	atomic_t			rt6i_ref;
+	struct fib6_table		*rt6i_table;
 
 	struct rt6key			rt6i_dst;
 	struct rt6key			rt6i_src;
@@ -143,12 +146,43 @@ struct rt6_statistics {
 
 typedef void			(*f_pnode)(struct fib6_node *fn, void *);
 
-extern struct fib6_node		ip6_routing_table;
+struct fib6_table {
+	struct hlist_node	tb6_hlist;
+	u32			tb6_id;
+	rwlock_t		tb6_lock;
+	struct fib6_node	tb6_root;
+};
+
+#define RT6_TABLE_UNSPEC	RT_TABLE_UNSPEC
+#define RT6_TABLE_MAIN		RT_TABLE_MAIN
+#define RT6_TABLE_LOCAL		RT6_TABLE_MAIN
+#define RT6_TABLE_DFLT		RT6_TABLE_MAIN
+#define RT6_TABLE_INFO		RT6_TABLE_MAIN
+#define RT6_TABLE_PREFIX	RT6_TABLE_MAIN
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+#define FIB6_TABLE_MIN		1
+#define FIB6_TABLE_MAX		RT_TABLE_MAX
+#else
+#define FIB6_TABLE_MIN		RT_TABLE_MAIN
+#define FIB6_TABLE_MAX		FIB6_TABLE_MIN
+#endif
+
+#define RT6_F_STRICT		1
+#define RT6_F_HAS_SADDR		2
+
+typedef struct rt6_info *(*pol_lookup_t)(struct fib6_table *,
+					 struct flowi *, int);
 
 /*
  *	exported functions
  */
 
+extern struct fib6_table *	fib6_get_table(u32 id);
+extern struct fib6_table *	fib6_new_table(u32 id);
+extern struct dst_entry *	fib6_rule_lookup(struct flowi *fl, int flags,
+						 pol_lookup_t lookup);
+
 extern struct fib6_node		*fib6_lookup(struct fib6_node *root,
 					     struct in6_addr *daddr,
 					     struct in6_addr *saddr);
@@ -161,6 +195,9 @@ extern void			fib6_clean_tree(struct fib6_node *root,
 						int (*func)(struct rt6_info *, void *arg),
 						int prune, void *arg);
 
+extern void			fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
+					       int prune, void *arg);
+
 extern int			fib6_walk(struct fib6_walker_t *w);
 extern int			fib6_walk_continue(struct fib6_walker_t *w);
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 96b0e66406e..d49c8c90eb6 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -58,7 +58,8 @@ extern int			ipv6_route_ioctl(unsigned int cmd, void __user *arg);
 extern int			ip6_route_add(struct in6_rtmsg *rtmsg,
 					      struct nlmsghdr *,
 					      void *rtattr,
-					      struct netlink_skb_parms *req);
+					      struct netlink_skb_parms *req,
+					      u32 table_id);
 extern int			ip6_ins_rt(struct rt6_info *,
 					   struct nlmsghdr *,
 					   void *rtattr,
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0ba06c0c5d3..159c63d99c8 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -136,3 +136,9 @@ config IPV6_TUNNEL
 
 	  If unsure, say N.
 
+config IPV6_MULTIPLE_TABLES
+	bool "IPv6: Multiple Routing Tables"
+	depends on IPV6 && EXPERIMENTAL
+	---help---
+	  Support multiple routing tables.
+
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c7852b38e03..318767fcefd 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1525,7 +1525,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 	if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
 		rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_PREFIX);
 }
 
 /* Create "default" multicast route to the interface */
@@ -1542,7 +1542,7 @@ static void addrconf_add_mroute(struct net_device *dev)
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 	rtmsg.rtmsg_flags = RTF_UP;
 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_LOCAL);
 }
 
 static void sit_route_add(struct net_device *dev)
@@ -1559,7 +1559,7 @@ static void sit_route_add(struct net_device *dev)
 	rtmsg.rtmsg_flags	= RTF_UP|RTF_NONEXTHOP;
 	rtmsg.rtmsg_ifindex	= dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_MAIN);
 }
 
 static void addrconf_add_lroute(struct net_device *dev)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 764221220af..fcd7da830ac 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -26,6 +26,7 @@
 #include <linux/netdevice.h>
 #include <linux/in6.h>
 #include <linux/init.h>
+#include <linux/list.h>
 
 #ifdef 	CONFIG_PROC_FS
 #include <linux/proc_fs.h>
@@ -147,6 +148,126 @@ static __inline__ void rt6_release(struct rt6_info *rt)
 		dst_free(&rt->u.dst);
 }
 
+static struct fib6_table fib6_main_tbl = {
+	.tb6_id		= RT6_TABLE_MAIN,
+	.tb6_lock	= RW_LOCK_UNLOCKED,
+	.tb6_root	= {
+		.leaf		= &ip6_null_entry,
+		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
+	},
+};
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+#define FIB_TABLE_HASHSZ 256
+static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+
+static struct fib6_table *fib6_alloc_table(u32 id)
+{
+	struct fib6_table *table;
+
+	table = kzalloc(sizeof(*table), GFP_ATOMIC);
+	if (table != NULL) {
+		table->tb6_id = id;
+		table->tb6_lock = RW_LOCK_UNLOCKED;
+		table->tb6_root.leaf = &ip6_null_entry;
+		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	}
+
+	return table;
+}
+
+static void fib6_link_table(struct fib6_table *tb)
+{
+	unsigned int h;
+
+	h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
+
+	/*
+	 * No protection necessary, this is the only list mutatation
+	 * operation, tables never disappear once they exist.
+	 */
+	hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
+}
+
+struct fib6_table *fib6_new_table(u32 id)
+{
+	struct fib6_table *tb;
+
+	if (id == 0)
+		id = RT6_TABLE_MAIN;
+	tb = fib6_get_table(id);
+	if (tb)
+		return tb;
+
+	tb = fib6_alloc_table(id);
+	if (tb != NULL)
+		fib6_link_table(tb);
+
+	return tb;
+}
+
+struct fib6_table *fib6_get_table(u32 id)
+{
+	struct fib6_table *tb;
+	struct hlist_node *node;
+	unsigned int h;
+
+	if (id == 0)
+		id = RT6_TABLE_MAIN;
+	h = id & (FIB_TABLE_HASHSZ - 1);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) {
+		if (tb->tb6_id == id) {
+			rcu_read_unlock();
+			return tb;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
+				   pol_lookup_t lookup)
+{
+	/*
+	 * TODO: Add rule lookup
+	 */
+	struct fib6_table *table = fib6_get_table(RT6_TABLE_MAIN);
+
+	return (struct dst_entry *) lookup(table, fl, flags);
+}
+
+static void __init fib6_tables_init(void)
+{
+	fib6_link_table(&fib6_main_tbl);
+}
+
+#else
+
+struct fib6_table *fib6_new_table(u32 id)
+{
+	return fib6_get_table(id);
+}
+
+struct fib6_table *fib6_get_table(u32 id)
+{
+	return &fib6_main_tbl;
+}
+
+struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
+				   pol_lookup_t lookup)
+{
+	return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags);
+}
+
+static void __init fib6_tables_init(void)
+{
+}
+
+#endif
+
 
 /*
  *	Routing Table
@@ -1064,6 +1185,22 @@ void fib6_clean_tree(struct fib6_node *root,
 	fib6_walk(&c.w);
 }
 
+void fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
+		    int prune, void *arg)
+{
+	int i;
+	struct fib6_table *table;
+
+	for (i = FIB6_TABLE_MIN; i <= FIB6_TABLE_MAX; i++) {
+		table = fib6_get_table(i);
+		if (table != NULL) {
+			write_lock_bh(&table->tb6_lock);
+			fib6_clean_tree(&table->tb6_root, func, prune, arg);
+			write_unlock_bh(&table->tb6_lock);
+		}
+	}
+}
+
 static int fib6_prune_clone(struct rt6_info *rt, void *arg)
 {
 	if (rt->rt6i_flags & RTF_CACHE) {
@@ -1142,11 +1279,8 @@ void fib6_run_gc(unsigned long dummy)
 	}
 	gc_args.more = 0;
 
-
-	write_lock_bh(&rt6_lock);
 	ndisc_dst_gc(&gc_args.more);
-	fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
-	write_unlock_bh(&rt6_lock);
+	fib6_clean_all(fib6_age, 0, NULL);
 
 	if (gc_args.more)
 		mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval);
@@ -1165,6 +1299,8 @@ void __init fib6_init(void)
 					   NULL, NULL);
 	if (!fib6_node_kmem)
 		panic("cannot create fib6_nodes cache");
+
+	fib6_tables_init();
 }
 
 void fib6_gc_cleanup(void)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ce1f49b595b..73efdadb9ab 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -140,16 +140,6 @@ struct rt6_info ip6_null_entry = {
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
 
-struct fib6_node ip6_routing_table = {
-	.leaf		= &ip6_null_entry,
-	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
-};
-
-/* Protects all the ip6 fib */
-
-DEFINE_RWLOCK(rt6_lock);
-
-
 /* allocate dst with ip6_dst_ops */
 static __inline__ struct rt6_info *ip6_dst_alloc(void)
 {
@@ -188,8 +178,14 @@ static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 		time_after(jiffies, rt->rt6i_expires));
 }
 
+static inline int rt6_need_strict(struct in6_addr *daddr)
+{
+	return (ipv6_addr_type(daddr) &
+		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
+}
+
 /*
- *	Route lookup. Any rt6_lock is implied.
+ *	Route lookup. Any table->tb6_lock is implied.
  */
 
 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
@@ -441,27 +437,66 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 }
 #endif
 
-struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
-			    int oif, int strict)
+#define BACKTRACK() \
+if (rt == &ip6_null_entry && flags & RT6_F_STRICT) { \
+	while ((fn = fn->parent) != NULL) { \
+		if (fn->fn_flags & RTN_TL_ROOT) { \
+			dst_hold(&rt->u.dst); \
+			goto out; \
+		} \
+		if (fn->fn_flags & RTN_RTINFO) \
+			goto restart; \
+	} \
+}
+
+static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
+					     struct flowi *fl, int flags)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt;
 
-	read_lock_bh(&rt6_lock);
-	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
-	rt = rt6_device_match(fn->leaf, oif, strict);
+	read_lock_bh(&table->tb6_lock);
+	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
+restart:
+	rt = fn->leaf;
+	rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
+	BACKTRACK();
 	dst_hold(&rt->u.dst);
-	rt->u.dst.__use++;
-	read_unlock_bh(&rt6_lock);
+out:
+	read_unlock_bh(&table->tb6_lock);
 
 	rt->u.dst.lastuse = jiffies;
-	if (rt->u.dst.error == 0)
-		return rt;
-	dst_release(&rt->u.dst);
+	rt->u.dst.__use++;
+
+	return rt;
+
+}
+
+struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
+			    int oif, int strict)
+{
+	struct flowi fl = {
+		.oif = oif,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *daddr,
+				/* TODO: saddr */
+			},
+		},
+	};
+	struct dst_entry *dst;
+	int flags = strict ? RT6_F_STRICT : 0;
+
+	dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
+	if (dst->error == 0)
+		return (struct rt6_info *) dst;
+
+	dst_release(dst);
+
 	return NULL;
 }
 
-/* ip6_ins_rt is called with FREE rt6_lock.
+/* ip6_ins_rt is called with FREE table->tb6_lock.
    It takes new route entry, the addition fails by any reason the
    route is freed. In any case, if caller does not hold it, it may
    be destroyed.
@@ -471,10 +506,12 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
 		void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
+	struct fib6_table *table;
 
-	write_lock_bh(&rt6_lock);
-	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
-	write_unlock_bh(&rt6_lock);
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);
+	err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
+	write_unlock_bh(&table->tb6_lock);
 
 	return err;
 }
@@ -532,51 +569,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *d
 	return rt;
 }
 
-#define BACKTRACK() \
-if (rt == &ip6_null_entry) { \
-       while ((fn = fn->parent) != NULL) { \
-		if (fn->fn_flags & RTN_ROOT) { \
-			goto out; \
-		} \
-		if (fn->fn_flags & RTN_RTINFO) \
-			goto restart; \
-	} \
-}
-
-
-void ip6_route_input(struct sk_buff *skb)
+struct rt6_info *ip6_pol_route_input(struct fib6_table *table, struct flowi *fl,
+				     int flags)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt, *nrt;
-	int strict;
+	int strict = 0;
 	int attempts = 3;
 	int err;
 	int reachable = RT6_SELECT_F_REACHABLE;
 
-	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
+	if (flags & RT6_F_STRICT)
+		strict = RT6_SELECT_F_IFACE;
 
 relookup:
-	read_lock_bh(&rt6_lock);
+	read_lock_bh(&table->tb6_lock);
 
 restart_2:
-	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
-			 &skb->nh.ipv6h->saddr);
+	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
-	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
+	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
 	BACKTRACK();
 	if (rt == &ip6_null_entry ||
 	    rt->rt6i_flags & RTF_CACHE)
 		goto out;
 
 	dst_hold(&rt->u.dst);
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 
 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
-		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
+		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
 	else {
 #if CLONE_OFFLINK_ROUTE
-		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
+		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
 #else
 		goto out2;
 #endif
@@ -587,7 +613,7 @@ restart:
 
 	dst_hold(&rt->u.dst);
 	if (nrt) {
-		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
+		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
 		if (!err)
 			goto out2;
 	}
@@ -596,7 +622,7 @@ restart:
 		goto out2;
 
 	/*
-	 * Race condition! In the gap, when rt6_lock was
+	 * Race condition! In the gap, when table->tb6_lock was
 	 * released someone could insert this route.  Relookup.
 	 */
 	dst_release(&rt->u.dst);
@@ -608,30 +634,54 @@ out:
 		goto restart_2;
 	}
 	dst_hold(&rt->u.dst);
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->u.dst.lastuse = jiffies;
 	rt->u.dst.__use++;
-	skb->dst = (struct dst_entry *) rt;
-	return;
+
+	return rt;
 }
 
-struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
+void ip6_route_input(struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = skb->nh.ipv6h;
+	struct flowi fl = {
+		.iif = skb->dev->ifindex,
+		.nl_u = {
+			.ip6_u = {
+				.daddr = iph->daddr,
+				.saddr = iph->saddr,
+				.flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
+			},
+		},
+		.proto = iph->nexthdr,
+	};
+	int flags = 0;
+
+	if (rt6_need_strict(&iph->daddr))
+		flags |= RT6_F_STRICT;
+
+	skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
+}
+
+static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
+					     struct flowi *fl, int flags)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt, *nrt;
-	int strict;
+	int strict = 0;
 	int attempts = 3;
 	int err;
 	int reachable = RT6_SELECT_F_REACHABLE;
 
-	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
+	if (flags & RT6_F_STRICT)
+		strict = RT6_SELECT_F_IFACE;
 
 relookup:
-	read_lock_bh(&rt6_lock);
+	read_lock_bh(&table->tb6_lock);
 
 restart_2:
-	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
+	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 
 restart:
 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
@@ -641,7 +691,7 @@ restart:
 		goto out;
 
 	dst_hold(&rt->u.dst);
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 
 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
@@ -667,7 +717,7 @@ restart:
 		goto out2;
 
 	/*
-	 * Race condition! In the gap, when rt6_lock was
+	 * Race condition! In the gap, when table->tb6_lock was
 	 * released someone could insert this route.  Relookup.
 	 */
 	dst_release(&rt->u.dst);
@@ -679,11 +729,21 @@ out:
 		goto restart_2;
 	}
 	dst_hold(&rt->u.dst);
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->u.dst.lastuse = jiffies;
 	rt->u.dst.__use++;
-	return &rt->u.dst;
+	return rt;
+}
+
+struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
+{
+	int flags = 0;
+
+	if (rt6_need_strict(&fl->fl6_dst))
+		flags |= RT6_F_STRICT;
+
+	return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
 }
 
 
@@ -906,7 +966,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
  */
 
 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
-		void *_rtattr, struct netlink_skb_parms *req)
+		  void *_rtattr, struct netlink_skb_parms *req,
+		  u32 table_id)
 {
 	int err;
 	struct rtmsg *r;
@@ -914,6 +975,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 	struct rt6_info *rt = NULL;
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
+	struct fib6_table *table;
 	int addr_type;
 
 	rta = (struct rtattr **) _rtattr;
@@ -937,6 +999,12 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 	if (rtmsg->rtmsg_metric == 0)
 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
 
+	table = fib6_new_table(table_id);
+	if (table == NULL) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
 	rt = ip6_dst_alloc();
 
 	if (rt == NULL) {
@@ -1093,6 +1161,7 @@ install_route:
 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
+	rt->rt6i_table = table;
 	return ip6_ins_rt(rt, nlh, _rtattr, req);
 
 out:
@@ -1108,26 +1177,35 @@ out:
 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
+	struct fib6_table *table;
 
-	write_lock_bh(&rt6_lock);
+	table = rt->rt6i_table;
+	write_lock_bh(&table->tb6_lock);
 
 	err = fib6_del(rt, nlh, _rtattr, req);
 	dst_release(&rt->u.dst);
 
-	write_unlock_bh(&rt6_lock);
+	write_unlock_bh(&table->tb6_lock);
 
 	return err;
 }
 
-static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
+static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
+			 void *_rtattr, struct netlink_skb_parms *req,
+			 u32 table_id)
 {
+	struct fib6_table *table;
 	struct fib6_node *fn;
 	struct rt6_info *rt;
 	int err = -ESRCH;
 
-	read_lock_bh(&rt6_lock);
+	table = fib6_get_table(table_id);
+	if (table == NULL)
+		return err;
+
+	read_lock_bh(&table->tb6_lock);
 
-	fn = fib6_locate(&ip6_routing_table,
+	fn = fib6_locate(&table->tb6_root,
 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
 	
@@ -1144,12 +1222,12 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
 				continue;
 			dst_hold(&rt->u.dst);
-			read_unlock_bh(&rt6_lock);
+			read_unlock_bh(&table->tb6_lock);
 
 			return ip6_del_rt(rt, nlh, _rtattr, req);
 		}
 	}
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 
 	return err;
 }
@@ -1161,10 +1239,15 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
 		  struct neighbour *neigh, u8 *lladdr, int on_link)
 {
 	struct rt6_info *rt, *nrt = NULL;
-	int strict;
 	struct fib6_node *fn;
+	struct fib6_table *table;
 	struct netevent_redirect netevent;
 
+	/* TODO: Very lazy, might need to check all tables */
+	table = fib6_get_table(RT6_TABLE_MAIN);
+	if (table == NULL)
+		return;
+
 	/*
 	 * Get the "current" route for this destination and
 	 * check if the redirect has come from approriate router.
@@ -1175,10 +1258,9 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
 	 * is a bit fuzzy and one might need to check all possible
 	 * routes.
 	 */
-	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
 
-	read_lock_bh(&rt6_lock);
-	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
+	read_lock_bh(&table->tb6_lock);
+	fn = fib6_lookup(&table->tb6_root, dest, NULL);
 restart:
 	for (rt = fn->leaf; rt; rt = rt->u.next) {
 		/*
@@ -1201,7 +1283,7 @@ restart:
 	}
 	if (rt)
 		dst_hold(&rt->u.dst);
-	else if (strict) {
+	else if (rt6_need_strict(dest)) {
 		while ((fn = fn->parent) != NULL) {
 			if (fn->fn_flags & RTN_ROOT)
 				break;
@@ -1209,7 +1291,7 @@ restart:
 				goto restart;
 		}
 	}
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 
 	if (!rt) {
 		if (net_ratelimit())
@@ -1384,6 +1466,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
 #ifdef CONFIG_IPV6_SUBTREES
 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 #endif
+		rt->rt6i_table = ort->rt6i_table;
 	}
 	return rt;
 }
@@ -1394,9 +1477,14 @@ static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixle
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt = NULL;
+	struct fib6_table *table;
+
+	table = fib6_get_table(RT6_TABLE_INFO);
+	if (table == NULL)
+		return NULL;
 
-	write_lock_bh(&rt6_lock);
-	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
+	write_lock_bh(&table->tb6_lock);
+	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
 	if (!fn)
 		goto out;
 
@@ -1411,7 +1499,7 @@ static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixle
 		break;
 	}
 out:
-	write_unlock_bh(&rt6_lock);
+	write_unlock_bh(&table->tb6_lock);
 	return rt;
 }
 
@@ -1433,7 +1521,7 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle
 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
 	rtmsg.rtmsg_ifindex = ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
 
 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
 }
@@ -1442,12 +1530,14 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle
 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
 {	
 	struct rt6_info *rt;
-	struct fib6_node *fn;
+	struct fib6_table *table;
 
-	fn = &ip6_routing_table;
+	table = fib6_get_table(RT6_TABLE_DFLT);
+	if (table == NULL)
+		return NULL;
 
-	write_lock_bh(&rt6_lock);
-	for (rt = fn->leaf; rt; rt=rt->u.next) {
+	write_lock_bh(&table->tb6_lock);
+	for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
 		if (dev == rt->rt6i_dev &&
 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
@@ -1455,7 +1545,7 @@ struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *d
 	}
 	if (rt)
 		dst_hold(&rt->u.dst);
-	write_unlock_bh(&rt6_lock);
+	write_unlock_bh(&table->tb6_lock);
 	return rt;
 }
 
@@ -1474,28 +1564,31 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
 	return rt6_get_dflt_router(gwaddr, dev);
 }
 
 void rt6_purge_dflt_routers(void)
 {
 	struct rt6_info *rt;
+	struct fib6_table *table;
+
+	/* NOTE: Keep consistent with rt6_get_dflt_router */
+	table = fib6_get_table(RT6_TABLE_DFLT);
+	if (table == NULL)
+		return;
 
 restart:
-	read_lock_bh(&rt6_lock);
-	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
+	read_lock_bh(&table->tb6_lock);
+	for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
 			dst_hold(&rt->u.dst);
-
-			read_unlock_bh(&rt6_lock);
-
+			read_unlock_bh(&table->tb6_lock);
 			ip6_del_rt(rt, NULL, NULL, NULL);
-
 			goto restart;
 		}
 	}
-	read_unlock_bh(&rt6_lock);
+	read_unlock_bh(&table->tb6_lock);
 }
 
 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
@@ -1516,10 +1609,12 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
 		rtnl_lock();
 		switch (cmd) {
 		case SIOCADDRT:
-			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
+			err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
+					    RT6_TABLE_MAIN);
 			break;
 		case SIOCDELRT:
-			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
+			err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
+					    RT6_TABLE_MAIN);
 			break;
 		default:
 			err = -EINVAL;
@@ -1593,6 +1688,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 
 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
 	rt->rt6i_dst.plen = 128;
+	rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
 
 	atomic_set(&rt->u.dst.__refcnt, 1);
 
@@ -1611,9 +1707,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *arg)
 
 void rt6_ifdown(struct net_device *dev)
 {
-	write_lock_bh(&rt6_lock);
-	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
-	write_unlock_bh(&rt6_lock);
+	fib6_clean_all(fib6_ifdown, 0, dev);
 }
 
 struct rt6_mtu_change_arg
@@ -1663,13 +1757,12 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 
 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
 {
-	struct rt6_mtu_change_arg arg;
+	struct rt6_mtu_change_arg arg = {
+		.dev = dev,
+		.mtu = mtu,
+	};
 
-	arg.dev = dev;
-	arg.mtu = mtu;
-	read_lock_bh(&rt6_lock);
-	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
-	read_unlock_bh(&rt6_lock);
+	fib6_clean_all(rt6_mtu_change_route, 0, &arg);
 }
 
 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
@@ -1719,7 +1812,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
+	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
 }
 
 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1729,7 +1822,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
+	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
 }
 
 struct rt6_rtnl_dump_arg
@@ -1761,6 +1854,10 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
 	rtm->rtm_src_len = rt->rt6i_src.plen;
 	rtm->rtm_tos = 0;
+	if (rt->rt6i_table)
+		rtm->rtm_table = rt->rt6i_table->tb6_id;
+	else
+		rtm->rtm_table = RT6_TABLE_UNSPEC;
 	rtm->rtm_table = RT_TABLE_MAIN;
 	if (rt->rt6i_flags&RTF_REJECT)
 		rtm->rtm_type = RTN_UNREACHABLE;
@@ -1868,7 +1965,6 @@ static void fib6_dump_end(struct netlink_callback *cb)
 
 	if (w) {
 		cb->args[0] = 0;
-		fib6_walker_unlink(w);
 		kfree(w);
 	}
 	cb->done = (void*)cb->args[1];
@@ -1883,13 +1979,20 @@ static int fib6_dump_done(struct netlink_callback *cb)
 
 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib6_table *table;
 	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker_t *w;
-	int res;
+	int i, res = 0;
 
 	arg.skb = skb;
 	arg.cb = cb;
 
+	/*
+	 * cb->args[0] = pointer to walker structure
+	 * cb->args[1] = saved cb->done() pointer
+	 * cb->args[2] = current table being dumped
+	 */
+
 	w = (void*)cb->args[0];
 	if (w == NULL) {
 		/* New dump:
@@ -1905,24 +2008,48 @@ int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
 		if (w == NULL)
 			return -ENOMEM;
-		RT6_TRACE("dump<%p", w);
-		w->root = &ip6_routing_table;
 		w->func = fib6_dump_node;
 		w->args = &arg;
 		cb->args[0] = (long)w;
-		read_lock_bh(&rt6_lock);
-		res = fib6_walk(w);
-		read_unlock_bh(&rt6_lock);
+		cb->args[2] = FIB6_TABLE_MIN;
 	} else {
 		w->args = &arg;
-		read_lock_bh(&rt6_lock);
-		res = fib6_walk_continue(w);
-		read_unlock_bh(&rt6_lock);
+		i = cb->args[2];
+		if (i > FIB6_TABLE_MAX)
+			goto end;
+
+		table = fib6_get_table(i);
+		if (table != NULL) {
+			read_lock_bh(&table->tb6_lock);
+			w->root = &table->tb6_root;
+			res = fib6_walk_continue(w);
+			read_unlock_bh(&table->tb6_lock);
+			if (res != 0) {
+				if (res < 0)
+					fib6_walker_unlink(w);
+				goto end;
+			}
+		}
+
+		fib6_walker_unlink(w);
+		cb->args[2] = ++i;
 	}
-#if RT6_DEBUG >= 3
-	if (res <= 0 && skb->len == 0)
-		RT6_TRACE("%p>dump end\n", w);
-#endif
+
+	for (i = cb->args[2]; i <= FIB6_TABLE_MAX; i++) {
+		table = fib6_get_table(i);
+		if (table == NULL)
+			continue;
+
+		read_lock_bh(&table->tb6_lock);
+		w->root = &table->tb6_root;
+		res = fib6_walk(w);
+		read_unlock_bh(&table->tb6_lock);
+		if (res)
+			break;
+	}
+end:
+	cb->args[2] = i;
+
 	res = res < 0 ? res : skb->len;
 	/* res < 0 is an error. (really, impossible)
 	   res == 0 means that dump is complete, but skb still can contain data.
@@ -2102,16 +2229,13 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg)
 
 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
 {
-	struct rt6_proc_arg arg;
-	arg.buffer = buffer;
-	arg.offset = offset;
-	arg.length = length;
-	arg.skip = 0;
-	arg.len = 0;
+	struct rt6_proc_arg arg = {
+		.buffer = buffer,
+		.offset = offset,
+		.length = length,
+	};
 
-	read_lock_bh(&rt6_lock);
-	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
-	read_unlock_bh(&rt6_lock);
+	fib6_clean_all(rt6_info_route, 0, &arg);
 
 	*start = buffer;
 	if (offset)
-- 
cgit v1.2.3-70-g09d2


From 14c0b97ddfc2944982d078b8e33b088840068976 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 03:38:38 -0700
Subject: [NET]: Protocol Independant Policy Routing Rules Framework

Derived from net/ipv/fib_rules.c

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fib_rules.h |  60 +++++++
 include/net/fib_rules.h   |  90 ++++++++++
 net/Kconfig               |   3 +
 net/core/Makefile         |   1 +
 net/core/fib_rules.c      | 416 ++++++++++++++++++++++++++++++++++++++++++++++
 net/core/rtnetlink.c      |   9 +-
 6 files changed, 577 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/fib_rules.h
 create mode 100644 include/net/fib_rules.h
 create mode 100644 net/core/fib_rules.c

(limited to 'include')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
new file mode 100644
index 00000000000..5e503f0ca6e
--- /dev/null
+++ b/include/linux/fib_rules.h
@@ -0,0 +1,60 @@
+#ifndef __LINUX_FIB_RULES_H
+#define __LINUX_FIB_RULES_H
+
+#include <linux/types.h>
+#include <linux/rtnetlink.h>
+
+/* rule is permanent, and cannot be deleted */
+#define FIB_RULE_PERMANENT	1
+
+struct fib_rule_hdr
+{
+	__u8		family;
+	__u8		dst_len;
+	__u8		src_len;
+	__u8		tos;
+
+	__u8		table;
+	__u8		res1;	/* reserved */
+	__u8		res2;	/* reserved */
+	__u8		action;
+
+	__u32		flags;
+};
+
+enum
+{
+	FRA_UNSPEC,
+	FRA_DST,	/* destination address */
+	FRA_SRC,	/* source address */
+	FRA_IFNAME,	/* interface name */
+	FRA_UNUSED1,
+	FRA_UNUSED2,
+	FRA_PRIORITY,	/* priority/preference */
+	FRA_UNUSED3,
+	FRA_UNUSED4,
+	FRA_UNUSED5,
+	FRA_FWMARK,	/* netfilter mark (IPv4) */
+	FRA_FLOW,	/* flow/class id */
+	__FRA_MAX
+};
+
+#define FRA_MAX (__FRA_MAX - 1)
+
+enum
+{
+	FR_ACT_UNSPEC,
+	FR_ACT_TO_TBL,		/* Pass to fixed table */
+	FR_ACT_RES1,
+	FR_ACT_RES2,
+	FR_ACT_RES3,
+	FR_ACT_RES4,
+	FR_ACT_BLACKHOLE,	/* Drop without notification */
+	FR_ACT_UNREACHABLE,	/* Drop with ENETUNREACH */
+	FR_ACT_PROHIBIT,	/* Drop with EACCES */
+	__FR_ACT_MAX,
+};
+
+#define FR_ACT_MAX (__FR_ACT_MAX - 1)
+
+#endif
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
new file mode 100644
index 00000000000..61375d9e53f
--- /dev/null
+++ b/include/net/fib_rules.h
@@ -0,0 +1,90 @@
+#ifndef __NET_FIB_RULES_H
+#define __NET_FIB_RULES_H
+
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/fib_rules.h>
+#include <net/flow.h>
+#include <net/netlink.h>
+
+struct fib_rule
+{
+	struct list_head	list;
+	atomic_t		refcnt;
+	int			ifindex;
+	char			ifname[IFNAMSIZ];
+	u32			pref;
+	u32			flags;
+	u32			table;
+	u8			action;
+	struct rcu_head		rcu;
+};
+
+struct fib_lookup_arg
+{
+	void			*lookup_ptr;
+	void			*result;
+	struct fib_rule		*rule;
+};
+
+struct fib_rules_ops
+{
+	int			family;
+	struct list_head	list;
+	int			rule_size;
+
+	int			(*action)(struct fib_rule *,
+					  struct flowi *, int,
+					  struct fib_lookup_arg *);
+	int			(*match)(struct fib_rule *,
+					 struct flowi *, int);
+	int			(*configure)(struct fib_rule *,
+					     struct sk_buff *,
+					     struct nlmsghdr *,
+					     struct fib_rule_hdr *,
+					     struct nlattr **);
+	int			(*compare)(struct fib_rule *,
+					   struct fib_rule_hdr *,
+					   struct nlattr **);
+	int			(*fill)(struct fib_rule *, struct sk_buff *,
+					struct nlmsghdr *,
+					struct fib_rule_hdr *);
+	u32			(*default_pref)(void);
+
+	int			nlgroup;
+	struct nla_policy	*policy;
+	struct list_head	*rules_list;
+	struct module		*owner;
+};
+
+static inline void fib_rule_get(struct fib_rule *rule)
+{
+	atomic_inc(&rule->refcnt);
+}
+
+static inline void fib_rule_put_rcu(struct rcu_head *head)
+{
+	struct fib_rule *rule = container_of(head, struct fib_rule, rcu);
+	kfree(rule);
+}
+
+static inline void fib_rule_put(struct fib_rule *rule)
+{
+	if (atomic_dec_and_test(&rule->refcnt))
+		call_rcu(&rule->rcu, fib_rule_put_rcu);
+}
+
+extern int			fib_rules_register(struct fib_rules_ops *);
+extern int			fib_rules_unregister(struct fib_rules_ops *);
+
+extern int			fib_rules_lookup(struct fib_rules_ops *,
+						 struct flowi *, int flags,
+						 struct fib_lookup_arg *);
+
+extern int			fib_nl_newrule(struct sk_buff *,
+					       struct nlmsghdr *, void *);
+extern int			fib_nl_delrule(struct sk_buff *,
+					       struct nlmsghdr *, void *);
+extern int			fib_rules_dump(struct sk_buff *,
+					       struct netlink_callback *, int);
+#endif
diff --git a/net/Kconfig b/net/Kconfig
index eb855b7fa64..6528a935622 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -251,6 +251,9 @@ config WIRELESS_EXT
 
 source "net/netlabel/Kconfig"
 
+config FIB_RULES
+	bool
+
 endif   # if NET
 endmenu # Networking
 
diff --git a/net/core/Makefile b/net/core/Makefile
index 2645ba428d4..119568077da 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_WIRELESS_EXT) += wireless.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_NET_DMA) += user_dma.o
+obj-$(CONFIG_FIB_RULES) += fib_rules.o
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
new file mode 100644
index 00000000000..6cdad24038e
--- /dev/null
+++ b/net/core/fib_rules.c
@@ -0,0 +1,416 @@
+/*
+ * net/core/fib_rules.c		Generic Routing Rules
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <net/fib_rules.h>
+
+static LIST_HEAD(rules_ops);
+static DEFINE_SPINLOCK(rules_mod_lock);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+			       struct fib_rules_ops *ops);
+
+static struct fib_rules_ops *lookup_rules_ops(int family)
+{
+	struct fib_rules_ops *ops;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &rules_ops, list) {
+		if (ops->family == family) {
+			if (!try_module_get(ops->owner))
+				ops = NULL;
+			rcu_read_unlock();
+			return ops;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void rules_ops_put(struct fib_rules_ops *ops)
+{
+	if (ops)
+		module_put(ops->owner);
+}
+
+int fib_rules_register(struct fib_rules_ops *ops)
+{
+	int err = -EEXIST;
+	struct fib_rules_ops *o;
+
+	if (ops->rule_size < sizeof(struct fib_rule))
+		return -EINVAL;
+
+	if (ops->match == NULL || ops->configure == NULL ||
+	    ops->compare == NULL || ops->fill == NULL ||
+	    ops->action == NULL)
+		return -EINVAL;
+
+	spin_lock(&rules_mod_lock);
+	list_for_each_entry(o, &rules_ops, list)
+		if (ops->family == o->family)
+			goto errout;
+
+	list_add_tail_rcu(&ops->list, &rules_ops);
+	err = 0;
+errout:
+	spin_unlock(&rules_mod_lock);
+
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_register);
+
+static void cleanup_ops(struct fib_rules_ops *ops)
+{
+	struct fib_rule *rule, *tmp;
+
+	list_for_each_entry_safe(rule, tmp, ops->rules_list, list) {
+		list_del_rcu(&rule->list);
+		fib_rule_put(rule);
+	}
+}
+
+int fib_rules_unregister(struct fib_rules_ops *ops)
+{
+	int err = 0;
+	struct fib_rules_ops *o;
+
+	spin_lock(&rules_mod_lock);
+	list_for_each_entry(o, &rules_ops, list) {
+		if (o == ops) {
+			list_del_rcu(&o->list);
+			cleanup_ops(ops);
+			goto out;
+		}
+	}
+
+	err = -ENOENT;
+out:
+	spin_unlock(&rules_mod_lock);
+
+	synchronize_rcu();
+
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_unregister);
+
+int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
+		     int flags, struct fib_lookup_arg *arg)
+{
+	struct fib_rule *rule;
+	int err;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(rule, ops->rules_list, list) {
+		if (rule->ifindex && (rule->ifindex != fl->iif))
+			continue;
+
+		if (!ops->match(rule, fl, flags))
+			continue;
+
+		err = ops->action(rule, fl, flags, arg);
+		if (err != -EAGAIN) {
+			fib_rule_get(rule);
+			arg->rule = rule;
+			goto out;
+		}
+	}
+
+	err = -ENETUNREACH;
+out:
+	rcu_read_unlock();
+
+	return err;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_lookup);
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule, *r, *last = NULL;
+	struct nlattr *tb[FRA_MAX+1];
+	int err = -EINVAL;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	ops = lookup_rules_ops(frh->family);
+	if (ops == NULL) {
+		err = EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+	if (err < 0)
+		goto errout;
+
+	if (tb[FRA_IFNAME] && nla_len(tb[FRA_IFNAME]) > IFNAMSIZ)
+		goto errout;
+
+	rule = kzalloc(ops->rule_size, GFP_KERNEL);
+	if (rule == NULL) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	if (tb[FRA_PRIORITY])
+		rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+
+	if (tb[FRA_IFNAME]) {
+		struct net_device *dev;
+
+		rule->ifindex = -1;
+		if (nla_strlcpy(rule->ifname, tb[FRA_IFNAME],
+				IFNAMSIZ) >= IFNAMSIZ)
+			goto errout_free;
+
+		dev = __dev_get_by_name(rule->ifname);
+		if (dev)
+			rule->ifindex = dev->ifindex;
+	}
+
+	rule->action = frh->action;
+	rule->flags = frh->flags;
+	rule->table = frh->table;
+
+	if (!rule->pref && ops->default_pref)
+		rule->pref = ops->default_pref();
+
+	err = ops->configure(rule, skb, nlh, frh, tb);
+	if (err < 0)
+		goto errout_free;
+
+	list_for_each_entry(r, ops->rules_list, list) {
+		if (r->pref > rule->pref)
+			break;
+		last = r;
+	}
+
+	fib_rule_get(rule);
+
+	if (last)
+		list_add_rcu(&rule->list, &last->list);
+	else
+		list_add_rcu(&rule->list, ops->rules_list);
+
+	notify_rule_change(RTM_NEWRULE, rule, ops);
+	rules_ops_put(ops);
+	return 0;
+
+errout_free:
+	kfree(rule);
+errout:
+	rules_ops_put(ops);
+	return err;
+}
+
+int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule;
+	struct nlattr *tb[FRA_MAX+1];
+	int err = -EINVAL;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	ops = lookup_rules_ops(frh->family);
+	if (ops == NULL) {
+		err = EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
+	if (err < 0)
+		goto errout;
+
+	list_for_each_entry(rule, ops->rules_list, list) {
+		if (frh->action && (frh->action != rule->action))
+			continue;
+
+		if (frh->table && (frh->table != rule->table))
+			continue;
+
+		if (tb[FRA_PRIORITY] &&
+		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
+			continue;
+
+		if (tb[FRA_IFNAME] &&
+		    nla_strcmp(tb[FRA_IFNAME], rule->ifname))
+			continue;
+
+		if (!ops->compare(rule, frh, tb))
+			continue;
+
+		if (rule->flags & FIB_RULE_PERMANENT) {
+			err = -EPERM;
+			goto errout;
+		}
+
+		list_del_rcu(&rule->list);
+		synchronize_rcu();
+		notify_rule_change(RTM_DELRULE, rule, ops);
+		fib_rule_put(rule);
+		rules_ops_put(ops);
+		return 0;
+	}
+
+	err = -ENOENT;
+errout:
+	rules_ops_put(ops);
+	return err;
+}
+
+static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
+			    u32 pid, u32 seq, int type, int flags,
+			    struct fib_rules_ops *ops)
+{
+	struct nlmsghdr *nlh;
+	struct fib_rule_hdr *frh;
+
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
+	if (nlh == NULL)
+		return -1;
+
+	frh = nlmsg_data(nlh);
+	frh->table = rule->table;
+	frh->res1 = 0;
+	frh->res2 = 0;
+	frh->action = rule->action;
+	frh->flags = rule->flags;
+
+	if (rule->ifname[0])
+		NLA_PUT_STRING(skb, FRA_IFNAME, rule->ifname);
+
+	if (rule->pref)
+		NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref);
+
+	if (ops->fill(rule, skb, nlh, frh) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	return nlmsg_cancel(skb, nlh);
+}
+
+int fib_rules_dump(struct sk_buff *skb, struct netlink_callback *cb, int family)
+{
+	int idx = 0;
+	struct fib_rule *rule;
+	struct fib_rules_ops *ops;
+
+	ops = lookup_rules_ops(family);
+	if (ops == NULL)
+		return -EAFNOSUPPORT;
+
+	rcu_read_lock();
+	list_for_each_entry(rule, ops->rules_list, list) {
+		if (idx < cb->args[0])
+			goto skip;
+
+		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid,
+				     cb->nlh->nlmsg_seq, RTM_NEWRULE,
+				     NLM_F_MULTI, ops) < 0)
+			break;
+skip:
+		idx++;
+	}
+	rcu_read_unlock();
+	cb->args[0] = idx;
+	rules_ops_put(ops);
+
+	return skb->len;
+}
+
+EXPORT_SYMBOL_GPL(fib_rules_dump);
+
+static void notify_rule_change(int event, struct fib_rule *rule,
+			       struct fib_rules_ops *ops)
+{
+	int size = nlmsg_total_size(sizeof(struct fib_rule_hdr) + 128);
+	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
+
+	if (skb == NULL)
+		netlink_set_err(rtnl, 0, ops->nlgroup, ENOBUFS);
+	else if (fib_nl_fill_rule(skb, rule, 0, 0, event, 0, ops) < 0) {
+		kfree_skb(skb);
+		netlink_set_err(rtnl, 0, ops->nlgroup, EINVAL);
+	} else
+		netlink_broadcast(rtnl, skb, 0, ops->nlgroup, GFP_KERNEL);
+}
+
+static void attach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *rule;
+
+	list_for_each_entry(rule, rules, list) {
+		if (rule->ifindex == -1 &&
+		    strcmp(dev->name, rule->ifname) == 0)
+			rule->ifindex = dev->ifindex;
+	}
+}
+
+static void detach_rules(struct list_head *rules, struct net_device *dev)
+{
+	struct fib_rule *rule;
+
+	list_for_each_entry(rule, rules, list)
+		if (rule->ifindex == dev->ifindex)
+			rule->ifindex = -1;
+}
+
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct fib_rules_ops *ops;
+
+	ASSERT_RTNL();
+	rcu_read_lock();
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		list_for_each_entry(ops, &rules_ops, list)
+			attach_rules(ops->rules_list, dev);
+		break;
+
+	case NETDEV_UNREGISTER:
+		list_for_each_entry(ops, &rules_ops, list)
+			detach_rules(ops->rules_list, dev);
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_rules_notifier = {
+	.notifier_call = fib_rules_event,
+};
+
+static int __init fib_rules_init(void)
+{
+	return register_netdevice_notifier(&fib_rules_notifier);
+}
+
+subsys_initcall(fib_rules_init);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 30cc1ba6ed5..aa7cff2257b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -49,6 +49,7 @@
 #include <net/udp.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
+#include <net/fib_rules.h>
 #include <net/netlink.h>
 #ifdef CONFIG_NET_WIRELESS_RTNETLINK
 #include <linux/wireless.h>
@@ -103,7 +104,7 @@ static const int rtm_min[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWADDR)]      = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
 	[RTM_FAM(RTM_NEWROUTE)]     = NLMSG_LENGTH(sizeof(struct rtmsg)),
 	[RTM_FAM(RTM_NEWNEIGH)]     = NLMSG_LENGTH(sizeof(struct ndmsg)),
-	[RTM_FAM(RTM_NEWRULE)]      = NLMSG_LENGTH(sizeof(struct rtmsg)),
+	[RTM_FAM(RTM_NEWRULE)]      = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
 	[RTM_FAM(RTM_NEWQDISC)]     = NLMSG_LENGTH(sizeof(struct tcmsg)),
 	[RTM_FAM(RTM_NEWTCLASS)]    = NLMSG_LENGTH(sizeof(struct tcmsg)),
 	[RTM_FAM(RTM_NEWTFILTER)]   = NLMSG_LENGTH(sizeof(struct tcmsg)),
@@ -120,7 +121,7 @@ static const int rta_max[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWADDR)]      = IFA_MAX,
 	[RTM_FAM(RTM_NEWROUTE)]     = RTA_MAX,
 	[RTM_FAM(RTM_NEWNEIGH)]     = NDA_MAX,
-	[RTM_FAM(RTM_NEWRULE)]      = RTA_MAX,
+	[RTM_FAM(RTM_NEWRULE)]      = FRA_MAX,
 	[RTM_FAM(RTM_NEWQDISC)]     = TCA_MAX,
 	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX,
 	[RTM_FAM(RTM_NEWTFILTER)]   = TCA_MAX,
@@ -757,6 +758,10 @@ static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
 	[RTM_NEWNEIGH    - RTM_BASE] = { .doit   = neigh_add		 },
 	[RTM_DELNEIGH    - RTM_BASE] = { .doit   = neigh_delete		 },
 	[RTM_GETNEIGH    - RTM_BASE] = { .dumpit = neigh_dump_info	 },
+#ifdef CONFIG_FIB_RULES
+	[RTM_NEWRULE     - RTM_BASE] = { .doit   = fib_nl_newrule	 },
+	[RTM_DELRULE     - RTM_BASE] = { .doit   = fib_nl_delrule	 },
+#endif
 	[RTM_GETRULE     - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
 	[RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info	 },
 	[RTM_SETNEIGHTBL - RTM_BASE] = { .doit   = neightbl_set		 },
-- 
cgit v1.2.3-70-g09d2


From 101367c2f8c464ea96643192673aa18d88e6336d Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 03:39:02 -0700
Subject: [IPV6]: Policy Routing Rules

Adds support for policy routing rules including a new
local table for routes with a local destination.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |   2 +
 include/net/ip6_fib.h     |   9 +-
 include/net/ip6_route.h   |   5 +
 net/ipv6/Kconfig          |   1 +
 net/ipv6/Makefile         |   1 +
 net/ipv6/addrconf.c       |   1 +
 net/ipv6/fib6_rules.c     | 251 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c        |  21 ++--
 net/ipv6/route.c          |  50 +++++++++
 9 files changed, 329 insertions(+), 12 deletions(-)
 create mode 100644 net/ipv6/fib6_rules.c

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index facd9ee37b7..bf353538ae9 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -889,6 +889,8 @@ enum rtnetlink_groups {
 	RTNLGRP_NOP4,
 	RTNLGRP_IPV6_PREFIX,
 #define RTNLGRP_IPV6_PREFIX	RTNLGRP_IPV6_PREFIX
+	RTNLGRP_IPV6_RULE,
+#define RTNLGRP_IPV6_RULE	RTNLGRP_IPV6_RULE
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 818411519c8..7b47e8d5a76 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -155,7 +155,6 @@ struct fib6_table {
 
 #define RT6_TABLE_UNSPEC	RT_TABLE_UNSPEC
 #define RT6_TABLE_MAIN		RT_TABLE_MAIN
-#define RT6_TABLE_LOCAL		RT6_TABLE_MAIN
 #define RT6_TABLE_DFLT		RT6_TABLE_MAIN
 #define RT6_TABLE_INFO		RT6_TABLE_MAIN
 #define RT6_TABLE_PREFIX	RT6_TABLE_MAIN
@@ -163,9 +162,11 @@ struct fib6_table {
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 #define FIB6_TABLE_MIN		1
 #define FIB6_TABLE_MAX		RT_TABLE_MAX
+#define RT6_TABLE_LOCAL		RT_TABLE_LOCAL
 #else
 #define FIB6_TABLE_MIN		RT_TABLE_MAIN
 #define FIB6_TABLE_MAX		FIB6_TABLE_MIN
+#define RT6_TABLE_LOCAL		RT6_TABLE_MAIN
 #endif
 
 #define RT6_F_STRICT		1
@@ -221,5 +222,11 @@ extern void			fib6_run_gc(unsigned long dummy);
 extern void			fib6_gc_cleanup(void);
 
 extern void			fib6_init(void);
+
+extern void			fib6_rules_init(void);
+extern void			fib6_rules_cleanup(void);
+extern int			fib6_rules_dump(struct sk_buff *,
+						struct netlink_callback *);
+
 #endif
 #endif
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d49c8c90eb6..9bfa3cc6ced 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -41,6 +41,11 @@ struct pol_chain {
 
 extern struct rt6_info	ip6_null_entry;
 
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+extern struct rt6_info	ip6_prohibit_entry;
+extern struct rt6_info	ip6_blk_hole_entry;
+#endif
+
 extern int ip6_rt_gc_interval;
 
 extern void			ip6_route_input(struct sk_buff *skb);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 159c63d99c8..36a6c2b7988 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -139,6 +139,7 @@ config IPV6_TUNNEL
 config IPV6_MULTIPLE_TABLES
 	bool "IPv6: Multiple Routing Tables"
 	depends on IPV6 && EXPERIMENTAL
+	select FIB_RULES
 	---help---
 	  Support multiple routing tables.
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 386e0a62694..9eebf609127 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -13,6 +13,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \
 ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
 ipv6-$(CONFIG_NETFILTER) += netfilter.o
+ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 318767fcefd..ed766eebc02 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3528,6 +3528,7 @@ static struct rtnetlink_link inet6_rtnetlink_table[RTM_NR_MSGTYPES] = {
 	[RTM_DELROUTE - RTM_BASE] = { .doit	= inet6_rtm_delroute, },
 	[RTM_GETROUTE - RTM_BASE] = { .doit	= inet6_rtm_getroute,
 				      .dumpit	= inet6_dump_fib, },
+	[RTM_GETRULE  - RTM_BASE] = { .dumpit   = fib6_rules_dump,   },
 };
 
 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
new file mode 100644
index 00000000000..c3c8195744e
--- /dev/null
+++ b/net/ipv6/fib6_rules.c
@@ -0,0 +1,251 @@
+/*
+ * net/ipv6/fib6_rules.c	IPv6 Routing Policy Rules
+ *
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License as
+ *	published by the Free Software Foundation, version 2.
+ *
+ * Authors
+ *	Thomas Graf		<tgraf@suug.ch>
+ *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
+ */
+
+#include <linux/config.h>
+#include <linux/netdevice.h>
+
+#include <net/fib_rules.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/netlink.h>
+
+struct fib6_rule
+{
+	struct fib_rule		common;
+	struct rt6key		src;
+	struct rt6key		dst;
+	u8			tclass;
+};
+
+static struct fib_rules_ops fib6_rules_ops;
+
+static struct fib6_rule main_rule = {
+	.common = {
+		.refcnt =	ATOMIC_INIT(2),
+		.pref =		0x7FFE,
+		.action =	FR_ACT_TO_TBL,
+		.table =	RT6_TABLE_MAIN,
+	},
+};
+
+static struct fib6_rule local_rule = {
+	.common = {
+		.refcnt =	ATOMIC_INIT(2),
+		.pref =		0,
+		.action =	FR_ACT_TO_TBL,
+		.table =	RT6_TABLE_LOCAL,
+		.flags =	FIB_RULE_PERMANENT,
+	},
+};
+
+static LIST_HEAD(fib6_rules);
+
+struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
+				   pol_lookup_t lookup)
+{
+	struct fib_lookup_arg arg = {
+		.lookup_ptr = lookup,
+	};
+
+	fib_rules_lookup(&fib6_rules_ops, fl, flags, &arg);
+	if (arg.rule)
+		fib_rule_put(arg.rule);
+
+	return (struct dst_entry *) arg.result;
+}
+
+int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+		     int flags, struct fib_lookup_arg *arg)
+{
+	struct rt6_info *rt = NULL;
+	struct fib6_table *table;
+	pol_lookup_t lookup = arg->lookup_ptr;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+	case FR_ACT_UNREACHABLE:
+		rt = &ip6_null_entry;
+		goto discard_pkt;
+	default:
+	case FR_ACT_BLACKHOLE:
+		rt = &ip6_blk_hole_entry;
+		goto discard_pkt;
+	case FR_ACT_PROHIBIT:
+		rt = &ip6_prohibit_entry;
+		goto discard_pkt;
+	}
+
+	table = fib6_get_table(rule->table);
+	if (table)
+		rt = lookup(table, flp, flags);
+
+	if (rt != &ip6_null_entry)
+		goto out;
+
+	dst_release(&rt->u.dst);
+discard_pkt:
+	dst_hold(&rt->u.dst);
+out:
+	arg->result = rt;
+	return rt == NULL ? -EAGAIN : 0;
+}
+
+
+static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct fib6_rule *r = (struct fib6_rule *) rule;
+
+	if (!ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen))
+		return 0;
+
+	if ((flags & RT6_F_HAS_SADDR) &&
+	    !ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, r->src.plen))
+		return 0;
+
+	return 1;
+}
+
+static struct nla_policy fib6_rule_policy[RTA_MAX+1] __read_mostly = {
+	[FRA_IFNAME]	= { .type = NLA_STRING },
+	[FRA_PRIORITY]	= { .type = NLA_U32 },
+	[FRA_SRC]	= { .minlen = sizeof(struct in6_addr) },
+	[FRA_DST]	= { .minlen = sizeof(struct in6_addr) },
+};
+
+static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
+{
+	int err = -EINVAL;
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	if (frh->src_len > 128 || frh->dst_len > 128 ||
+	    (frh->tos & ~IPV6_FLOWINFO_MASK))
+		goto errout;
+
+	if (rule->action == FR_ACT_TO_TBL) {
+		if (rule->table == RT6_TABLE_UNSPEC)
+			goto errout;
+
+		if (fib6_new_table(rule->table) == NULL) {
+			err = -ENOBUFS;
+			goto errout;
+		}
+	}
+
+	if (tb[FRA_SRC])
+		nla_memcpy(&rule6->src.addr, tb[FRA_SRC],
+			   sizeof(struct in6_addr));
+
+	if (tb[FRA_DST])
+		nla_memcpy(&rule6->dst.addr, tb[FRA_DST],
+			   sizeof(struct in6_addr));
+
+	rule6->src.plen = frh->src_len;
+	rule6->dst.plen = frh->dst_len;
+	rule6->tclass = frh->tos;
+
+	err = 0;
+errout:
+	return err;
+}
+
+static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	if (frh->src_len && (rule6->src.plen != frh->src_len))
+		return 0;
+
+	if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
+		return 0;
+
+	if (frh->tos && (rule6->tclass != frh->tos))
+		return 0;
+
+	if (tb[FRA_SRC] &&
+	    nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
+		return 0;
+
+	if (tb[FRA_DST] &&
+	    nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
+		return 0;
+
+	return 1;
+}
+
+static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+{
+	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+	frh->family = AF_INET6;
+	frh->dst_len = rule6->dst.plen;
+	frh->src_len = rule6->src.plen;
+	frh->tos = rule6->tclass;
+
+	if (rule6->dst.plen)
+		NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr),
+			&rule6->dst.addr);
+
+	if (rule6->src.plen)
+		NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr),
+			&rule6->src.addr);
+
+	return 0;
+
+nla_put_failure:
+	return -ENOBUFS;
+}
+
+int fib6_rules_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return fib_rules_dump(skb, cb, AF_INET6);
+}
+
+static u32 fib6_rule_default_pref(void)
+{
+	return 0x3FFF;
+}
+
+static struct fib_rules_ops fib6_rules_ops = {
+	.family			= AF_INET6,
+	.rule_size		= sizeof(struct fib6_rule),
+	.action			= fib6_rule_action,
+	.match			= fib6_rule_match,
+	.configure		= fib6_rule_configure,
+	.compare		= fib6_rule_compare,
+	.fill			= fib6_rule_fill,
+	.default_pref		= fib6_rule_default_pref,
+	.nlgroup		= RTNLGRP_IPV6_RULE,
+	.policy			= fib6_rule_policy,
+	.rules_list		= &fib6_rules,
+	.owner			= THIS_MODULE,
+};
+
+void __init fib6_rules_init(void)
+{
+	list_add_tail(&local_rule.common.list, &fib6_rules);
+	list_add_tail(&main_rule.common.list, &fib6_rules);
+
+	fib_rules_register(&fib6_rules_ops);
+}
+
+void fib6_rules_cleanup(void)
+{
+	fib_rules_unregister(&fib6_rules_ops);
+}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index fcd7da830ac..ce226c14bef 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -159,6 +159,15 @@ static struct fib6_table fib6_main_tbl = {
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 
+static struct fib6_table fib6_local_tbl = {
+	.tb6_id		= RT6_TABLE_LOCAL,
+	.tb6_lock	= RW_LOCK_UNLOCKED,
+	.tb6_root 	= {
+		.leaf		= &ip6_null_entry,
+		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
+	},
+};
+
 #define FIB_TABLE_HASHSZ 256
 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
 
@@ -228,20 +237,10 @@ struct fib6_table *fib6_get_table(u32 id)
 	return NULL;
 }
 
-struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
-				   pol_lookup_t lookup)
-{
-	/*
-	 * TODO: Add rule lookup
-	 */
-	struct fib6_table *table = fib6_get_table(RT6_TABLE_MAIN);
-
-	return (struct dst_entry *) lookup(table, fl, flags);
-}
-
 static void __init fib6_tables_init(void)
 {
 	fib6_link_table(&fib6_main_tbl);
+	fib6_link_table(&fib6_local_tbl);
 }
 
 #else
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 73efdadb9ab..438977e2085 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -140,6 +140,50 @@ struct rt6_info ip6_null_entry = {
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
 
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+
+struct rt6_info ip6_prohibit_entry = {
+	.u = {
+		.dst = {
+			.__refcnt	= ATOMIC_INIT(1),
+			.__use		= 1,
+			.dev		= &loopback_dev,
+			.obsolete	= -1,
+			.error		= -EACCES,
+			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
+			.input		= ip6_pkt_discard,
+			.output		= ip6_pkt_discard_out,
+			.ops		= &ip6_dst_ops,
+			.path		= (struct dst_entry*)&ip6_prohibit_entry,
+		}
+	},
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_metric	= ~(u32) 0,
+	.rt6i_ref	= ATOMIC_INIT(1),
+};
+
+struct rt6_info ip6_blk_hole_entry = {
+	.u = {
+		.dst = {
+			.__refcnt	= ATOMIC_INIT(1),
+			.__use		= 1,
+			.dev		= &loopback_dev,
+			.obsolete	= -1,
+			.error		= -EINVAL,
+			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
+			.input		= ip6_pkt_discard,
+			.output		= ip6_pkt_discard_out,
+			.ops		= &ip6_dst_ops,
+			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
+		}
+	},
+	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_metric	= ~(u32) 0,
+	.rt6i_ref	= ATOMIC_INIT(1),
+};
+
+#endif
+
 /* allocate dst with ip6_dst_ops */
 static __inline__ struct rt6_info *ip6_dst_alloc(void)
 {
@@ -2408,10 +2452,16 @@ void __init ip6_route_init(void)
 #ifdef CONFIG_XFRM
 	xfrm6_init();
 #endif
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	fib6_rules_init();
+#endif
 }
 
 void ip6_route_cleanup(void)
 {
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	fib6_rules_cleanup();
+#endif
 #ifdef CONFIG_PROC_FS
 	proc_net_remove("ipv6_route");
 	proc_net_remove("rt6_stats");
-- 
cgit v1.2.3-70-g09d2


From e1ef4bf23b1ced0bf78a1c98289f746486e5c912 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 03:39:22 -0700
Subject: [IPV4]: Use Protocol Independant Policy Routing Rules Framework

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  14 +-
 net/ipv4/Kconfig        |   1 +
 net/ipv4/devinet.c      |   4 +-
 net/ipv4/fib_frontend.c |   2 +-
 net/ipv4/fib_rules.c    | 605 +++++++++++++++++++-----------------------------
 5 files changed, 249 insertions(+), 377 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a095d1dec7a..14c82e611c9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -18,6 +18,7 @@
 
 #include <net/flow.h>
 #include <linux/seq_file.h>
+#include <net/fib_rules.h>
 
 /* WARNING: The ordering of these elements must match ordering
  *          of RTA_* rtnetlink attribute numbers.
@@ -203,9 +204,8 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result
 #define ip_fib_main_table (fib_tables[RT_TABLE_MAIN])
 
 extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
-extern int fib_lookup(const struct flowi *flp, struct fib_result *res);
+extern int fib_lookup(struct flowi *flp, struct fib_result *res);
 extern struct fib_table *__fib_new_table(int id);
-extern void fib_rule_put(struct fib_rule *r);
 
 static inline struct fib_table *fib_get_table(int id)
 {
@@ -251,15 +251,15 @@ extern u32  __fib_res_prefsrc(struct fib_result *res);
 extern struct fib_table *fib_hash_init(int id);
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-/* Exported by fib_rules.c */
+extern int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern void __init fib4_rules_init(void);
+extern void __exit fib4_rules_cleanup(void);
 
-extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
-extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
-extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb);
 #ifdef CONFIG_NET_CLS_ROUTE
 extern u32 fib_rules_tclass(struct fib_result *res);
 #endif
-extern void fib_rules_init(void);
+
 #endif
 
 static inline void fib_combine_itag(u32 *itag, struct fib_result *res)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 3b5d504a74b..1650b64415a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -88,6 +88,7 @@ config IP_FIB_HASH
 config IP_MULTIPLE_TABLES
 	bool "IP: policy routing"
 	depends on IP_ADVANCED_ROUTER
+	select FIB_RULES
 	---help---
 	  Normally, a router decides what to do with a received packet based
 	  solely on the packet's final destination address. If you say Y here,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index a6cc31d911e..9f3ffbec329 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1151,9 +1151,7 @@ static struct rtnetlink_link inet_rtnetlink_table[RTM_NR_MSGTYPES] = {
 	[RTM_GETROUTE - RTM_BASE] = { .doit	= inet_rtm_getroute,
 				      .dumpit	= inet_dump_fib,	},
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-	[RTM_NEWRULE  - RTM_BASE] = { .doit	= inet_rtm_newrule,	},
-	[RTM_DELRULE  - RTM_BASE] = { .doit	= inet_rtm_delrule,	},
-	[RTM_GETRULE  - RTM_BASE] = { .dumpit	= inet_dump_rules,	},
+	[RTM_GETRULE  - RTM_BASE] = { .dumpit	= fib4_rules_dump,	},
 #endif
 };
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ba2a70745a6..fe4a53d4d10 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -656,7 +656,7 @@ void __init ip_fib_init(void)
 	ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
 	ip_fib_main_table  = fib_hash_init(RT_TABLE_MAIN);
 #else
-	fib_rules_init();
+	fib4_rules_init();
 #endif
 
 	register_netdevice_notifier(&fib_netdev_notifier);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 79b04718bdf..23ec6ae1a0f 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -5,9 +5,8 @@
  *
  *		IPv4 Forwarding Information Base: policy rules.
  *
- * Version:	$Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
- *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ * 		Thomas Graf <tgraf@suug.ch>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -19,129 +18,154 @@
  *		Marc Boucher	:	routing by fwmark
  */
 
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
 #include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
 #include <linux/netlink.h>
+#include <linux/inetdevice.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/rcupdate.h>
-
 #include <net/ip.h>
-#include <net/protocol.h>
 #include <net/route.h>
 #include <net/tcp.h>
-#include <net/sock.h>
 #include <net/ip_fib.h>
+#include <net/fib_rules.h>
 
-#define FRprintk(a...)
+static struct fib_rules_ops fib4_rules_ops;
 
-struct fib_rule
+struct fib4_rule
 {
-	struct hlist_node hlist;
-	atomic_t	r_clntref;
-	u32		r_preference;
-	unsigned char	r_table;
-	unsigned char	r_action;
-	unsigned char	r_dst_len;
-	unsigned char	r_src_len;
-	u32		r_src;
-	u32		r_srcmask;
-	u32		r_dst;
-	u32		r_dstmask;
-	u32		r_srcmap;
-	u8		r_flags;
-	u8		r_tos;
+	struct fib_rule		common;
+	u8			dst_len;
+	u8			src_len;
+	u8			tos;
+	u32			src;
+	u32			srcmask;
+	u32			dst;
+	u32			dstmask;
 #ifdef CONFIG_IP_ROUTE_FWMARK
-	u32		r_fwmark;
+	u32			fwmark;
 #endif
-	int		r_ifindex;
 #ifdef CONFIG_NET_CLS_ROUTE
-	__u32		r_tclassid;
+	u32			tclassid;
 #endif
-	char		r_ifname[IFNAMSIZ];
-	int		r_dead;
-	struct		rcu_head rcu;
 };
 
-static struct fib_rule default_rule = {
-	.r_clntref =	ATOMIC_INIT(2),
-	.r_preference =	0x7FFF,
-	.r_table =	RT_TABLE_DEFAULT,
-	.r_action =	RTN_UNICAST,
+static struct fib4_rule default_rule = {
+	.common = {
+		.refcnt =	ATOMIC_INIT(2),
+		.pref =		0x7FFF,
+		.table =	RT_TABLE_DEFAULT,
+		.action =	FR_ACT_TO_TBL,
+	},
 };
 
-static struct fib_rule main_rule = {
-	.r_clntref =	ATOMIC_INIT(2),
-	.r_preference =	0x7FFE,
-	.r_table =	RT_TABLE_MAIN,
-	.r_action =	RTN_UNICAST,
+static struct fib4_rule main_rule = {
+	.common = {
+		.refcnt =	ATOMIC_INIT(2),
+		.pref =		0x7FFE,
+		.table =	RT_TABLE_MAIN,
+		.action =	FR_ACT_TO_TBL,
+	},
 };
 
-static struct fib_rule local_rule = {
-	.r_clntref =	ATOMIC_INIT(2),
-	.r_table =	RT_TABLE_LOCAL,
-	.r_action =	RTN_UNICAST,
+static struct fib4_rule local_rule = {
+	.common = {
+		.refcnt =	ATOMIC_INIT(2),
+		.table =	RT_TABLE_LOCAL,
+		.action =	FR_ACT_TO_TBL,
+		.flags =	FIB_RULE_PERMANENT,
+	},
 };
 
-static struct hlist_head fib_rules;
+static LIST_HEAD(fib4_rules);
+
+#ifdef CONFIG_NET_CLS_ROUTE
+u32 fib_rules_tclass(struct fib_result *res)
+{
+	return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
+}
+#endif
 
-/* writer func called from netlink -- rtnl_sem hold*/
+int fib_lookup(struct flowi *flp, struct fib_result *res)
+{
+	struct fib_lookup_arg arg = {
+		.result = res,
+	};
+	int err;
 
-static void rtmsg_rule(int, struct fib_rule *);
+	err = fib_rules_lookup(&fib4_rules_ops, flp, 0, &arg);
+	res->r = arg.rule;
 
-int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+	return err;
+}
+
+int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags,
+		     struct fib_lookup_arg *arg)
 {
-	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
-	struct fib_rule *r;
-	struct hlist_node *node;
-	int err = -ESRCH;
-
-	hlist_for_each_entry(r, node, &fib_rules, hlist) {
-		if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
-		    rtm->rtm_src_len == r->r_src_len &&
-		    rtm->rtm_dst_len == r->r_dst_len &&
-		    (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
-		    rtm->rtm_tos == r->r_tos &&
-#ifdef CONFIG_IP_ROUTE_FWMARK
-		    (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
-#endif
-		    (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
-		    (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
-		    (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
-		    (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
-			err = -EPERM;
-			if (r == &local_rule)
-				break;
-
-			hlist_del_rcu(&r->hlist);
-			r->r_dead = 1;
-			rtmsg_rule(RTM_DELRULE, r);
-			fib_rule_put(r);
-			err = 0;
-			break;
-		}
+	int err = -EAGAIN;
+	struct fib_table *tbl;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+
+	case FR_ACT_UNREACHABLE:
+		err = -ENETUNREACH;
+		goto errout;
+
+	case FR_ACT_PROHIBIT:
+		err = -EACCES;
+		goto errout;
+
+	case FR_ACT_BLACKHOLE:
+	default:
+		err = -EINVAL;
+		goto errout;
 	}
+
+	if ((tbl = fib_get_table(rule->table)) == NULL)
+		goto errout;
+
+	err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
+	if (err > 0)
+		err = -EAGAIN;
+errout:
 	return err;
 }
 
-/* Allocate new unique table id */
+
+void fib_select_default(const struct flowi *flp, struct fib_result *res)
+{
+	if (res->r && res->r->action == FR_ACT_TO_TBL &&
+	    FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+		struct fib_table *tb;
+		if ((tb = fib_get_table(res->r->table)) != NULL)
+			tb->tb_select_default(tb, flp, res);
+	}
+}
+
+static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct fib4_rule *r = (struct fib4_rule *) rule;
+	u32 daddr = fl->fl4_dst;
+	u32 saddr = fl->fl4_src;
+
+	if (((saddr ^ r->src) & r->srcmask) ||
+	    ((daddr ^ r->dst) & r->dstmask))
+		return 0;
+
+	if (r->tos && (r->tos != fl->fl4_tos))
+		return 0;
+
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	if (r->fwmark && (r->fwmark != fl->fl4_fwmark))
+		return 0;
+#endif
+
+	return 1;
+}
 
 static struct fib_table *fib_empty_table(void)
 {
@@ -153,329 +177,178 @@ static struct fib_table *fib_empty_table(void)
 	return NULL;
 }
 
-static inline void fib_rule_put_rcu(struct rcu_head *head)
-{
-	struct fib_rule *r = container_of(head, struct fib_rule, rcu);
-	kfree(r);
-}
+static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = {
+	[FRA_IFNAME]	= { .type = NLA_STRING },
+	[FRA_PRIORITY]	= { .type = NLA_U32 },
+	[FRA_SRC]	= { .type = NLA_U32 },
+	[FRA_DST]	= { .type = NLA_U32 },
+	[FRA_FWMARK]	= { .type = NLA_U32 },
+	[FRA_FLOW]	= { .type = NLA_U32 },
+};
 
-void fib_rule_put(struct fib_rule *r)
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
 {
-	if (atomic_dec_and_test(&r->r_clntref)) {
-		if (r->r_dead)
-			call_rcu(&r->rcu, fib_rule_put_rcu);
-		else
-			printk("Freeing alive rule %p\n", r);
-	}
-}
+	int err = -EINVAL;
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-/* writer func called from netlink -- rtnl_sem hold*/
+	if (frh->src_len > 32 || frh->dst_len > 32 ||
+	    (frh->tos & ~IPTOS_TOS_MASK))
+		goto errout;
 
-int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
-{
-	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
-	struct fib_rule *r, *new_r, *last = NULL;
-	struct hlist_node *node = NULL;
-	unsigned char table_id;
-
-	if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
-	    (rtm->rtm_tos & ~IPTOS_TOS_MASK))
-		return -EINVAL;
-
-	if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
-		return -EINVAL;
-
-	table_id = rtm->rtm_table;
-	if (table_id == RT_TABLE_UNSPEC) {
-		struct fib_table *table;
-		if (rtm->rtm_type == RTN_UNICAST) {
-			if ((table = fib_empty_table()) == NULL)
-				return -ENOBUFS;
-			table_id = table->tb_id;
-		}
-	}
+	if (rule->table == RT_TABLE_UNSPEC) {
+		if (rule->action == FR_ACT_TO_TBL) {
+			struct fib_table *table;
 
-	new_r = kzalloc(sizeof(*new_r), GFP_KERNEL);
-	if (!new_r)
-		return -ENOMEM;
-
-	if (rta[RTA_SRC-1])
-		memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
-	if (rta[RTA_DST-1])
-		memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
-	if (rta[RTA_GATEWAY-1])
-		memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
-	new_r->r_src_len = rtm->rtm_src_len;
-	new_r->r_dst_len = rtm->rtm_dst_len;
-	new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
-	new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
-	new_r->r_tos = rtm->rtm_tos;
-#ifdef CONFIG_IP_ROUTE_FWMARK
-	if (rta[RTA_PROTOINFO-1])
-		memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
-#endif
-	new_r->r_action = rtm->rtm_type;
-	new_r->r_flags = rtm->rtm_flags;
-	if (rta[RTA_PRIORITY-1])
-		memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
-	new_r->r_table = table_id;
-	if (rta[RTA_IIF-1]) {
-		struct net_device *dev;
-		rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
-		new_r->r_ifindex = -1;
-		dev = __dev_get_by_name(new_r->r_ifname);
-		if (dev)
-			new_r->r_ifindex = dev->ifindex;
-	}
-#ifdef CONFIG_NET_CLS_ROUTE
-	if (rta[RTA_FLOW-1])
-		memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
-#endif
-	r = container_of(fib_rules.first, struct fib_rule, hlist);
+			table = fib_empty_table();
+			if (table == NULL) {
+				err = -ENOBUFS;
+				goto errout;
+			}
 
-	if (!new_r->r_preference) {
-		if (r && r->hlist.next != NULL) {
-			r = container_of(r->hlist.next, struct fib_rule, hlist);
-			if (r->r_preference)
-				new_r->r_preference = r->r_preference - 1;
+			rule->table = table->tb_id;
 		}
 	}
 
-	hlist_for_each_entry(r, node, &fib_rules, hlist) {
-		if (r->r_preference > new_r->r_preference)
-			break;
-		last = r;
-	}
-	atomic_inc(&new_r->r_clntref);
+	if (tb[FRA_SRC])
+		rule4->src = nla_get_u32(tb[FRA_SRC]);
 
-	if (last)
-		hlist_add_after_rcu(&last->hlist, &new_r->hlist);
-	else
-		hlist_add_before_rcu(&new_r->hlist, &r->hlist);
+	if (tb[FRA_DST])
+		rule4->dst = nla_get_u32(tb[FRA_DST]);
 
-	rtmsg_rule(RTM_NEWRULE, new_r);
-	return 0;
-}
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	if (tb[FRA_FWMARK])
+		rule4->fwmark = nla_get_u32(tb[FRA_FWMARK]);
+#endif
 
 #ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
-{
-	if (res->r)
-		return res->r->r_tclassid;
-	return 0;
-}
+	if (tb[FRA_FLOW])
+		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
 #endif
 
-/* callers should hold rtnl semaphore */
-
-static void fib_rules_detach(struct net_device *dev)
-{
-	struct hlist_node *node;
-	struct fib_rule *r;
-
-	hlist_for_each_entry(r, node, &fib_rules, hlist) {
-		if (r->r_ifindex == dev->ifindex)
-			r->r_ifindex = -1;
+	rule4->src_len = frh->src_len;
+	rule4->srcmask = inet_make_mask(rule4->src_len);
+	rule4->dst_len = frh->dst_len;
+	rule4->dstmask = inet_make_mask(rule4->dst_len);
+	rule4->tos = frh->tos;
 
-	}
-}
-
-/* callers should hold rtnl semaphore */
-
-static void fib_rules_attach(struct net_device *dev)
-{
-	struct hlist_node *node;
-	struct fib_rule *r;
-
-	hlist_for_each_entry(r, node, &fib_rules, hlist) {
-		if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0)
-			r->r_ifindex = dev->ifindex;
-	}
+	err = 0;
+errout:
+	return err;
 }
 
-int fib_lookup(const struct flowi *flp, struct fib_result *res)
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
 {
-	int err;
-	struct fib_rule *r, *policy;
-	struct fib_table *tb;
-	struct hlist_node *node;
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	u32 daddr = flp->fl4_dst;
-	u32 saddr = flp->fl4_src;
+	if (frh->src_len && (rule4->src_len != frh->src_len))
+		return 0;
 
-FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
-	NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
+	if (frh->dst_len && (rule4->dst_len != frh->dst_len))
+		return 0;
 
-	rcu_read_lock();
+	if (frh->tos && (rule4->tos != frh->tos))
+		return 0;
 
-	hlist_for_each_entry_rcu(r, node, &fib_rules, hlist) {
-		if (((saddr^r->r_src) & r->r_srcmask) ||
-		    ((daddr^r->r_dst) & r->r_dstmask) ||
-		    (r->r_tos && r->r_tos != flp->fl4_tos) ||
 #ifdef CONFIG_IP_ROUTE_FWMARK
-		    (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
+	if (tb[FRA_FWMARK] && (rule4->fwmark != nla_get_u32(tb[FRA_FWMARK])))
+		return 0;
 #endif
-		    (r->r_ifindex && r->r_ifindex != flp->iif))
-			continue;
-
-FRprintk("tb %d r %d ", r->r_table, r->r_action);
-		switch (r->r_action) {
-		case RTN_UNICAST:
-			policy = r;
-			break;
-		case RTN_UNREACHABLE:
-			rcu_read_unlock();
-			return -ENETUNREACH;
-		default:
-		case RTN_BLACKHOLE:
-			rcu_read_unlock();
-			return -EINVAL;
-		case RTN_PROHIBIT:
-			rcu_read_unlock();
-			return -EACCES;
-		}
 
-		if ((tb = fib_get_table(r->r_table)) == NULL)
-			continue;
-		err = tb->tb_lookup(tb, flp, res);
-		if (err == 0) {
-			res->r = policy;
-			if (policy)
-				atomic_inc(&policy->r_clntref);
-			rcu_read_unlock();
-			return 0;
-		}
-		if (err < 0 && err != -EAGAIN) {
-			rcu_read_unlock();
-			return err;
-		}
-	}
-FRprintk("FAILURE\n");
-	rcu_read_unlock();
-	return -ENETUNREACH;
-}
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
+		return 0;
+#endif
 
-void fib_select_default(const struct flowi *flp, struct fib_result *res)
-{
-	if (res->r && res->r->r_action == RTN_UNICAST &&
-	    FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
-		struct fib_table *tb;
-		if ((tb = fib_get_table(res->r->r_table)) != NULL)
-			tb->tb_select_default(tb, flp, res);
-	}
-}
+	if (tb[FRA_SRC] && (rule4->src != nla_get_u32(tb[FRA_SRC])))
+		return 0;
 
-static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct net_device *dev = ptr;
+	if (tb[FRA_DST] && (rule4->dst != nla_get_u32(tb[FRA_DST])))
+		return 0;
 
-	if (event == NETDEV_UNREGISTER)
-		fib_rules_detach(dev);
-	else if (event == NETDEV_REGISTER)
-		fib_rules_attach(dev);
-	return NOTIFY_DONE;
+	return 1;
 }
 
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			  struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
+{
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-static struct notifier_block fib_rules_notifier = {
-	.notifier_call =fib_rules_event,
-};
+	frh->family = AF_INET;
+	frh->dst_len = rule4->dst_len;
+	frh->src_len = rule4->src_len;
+	frh->tos = rule4->tos;
 
-static __inline__ int inet_fill_rule(struct sk_buff *skb,
-				     struct fib_rule *r,
-				     u32 pid, u32 seq, int event,
-				     unsigned int flags)
-{
-	struct rtmsg *rtm;
-	struct nlmsghdr  *nlh;
-	unsigned char	 *b = skb->tail;
-
-	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
-	rtm = NLMSG_DATA(nlh);
-	rtm->rtm_family = AF_INET;
-	rtm->rtm_dst_len = r->r_dst_len;
-	rtm->rtm_src_len = r->r_src_len;
-	rtm->rtm_tos = r->r_tos;
 #ifdef CONFIG_IP_ROUTE_FWMARK
-	if (r->r_fwmark)
-		RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+	if (rule4->fwmark)
+		NLA_PUT_U32(skb, FRA_FWMARK, rule4->fwmark);
 #endif
-	rtm->rtm_table = r->r_table;
-	rtm->rtm_protocol = 0;
-	rtm->rtm_scope = 0;
-	rtm->rtm_type = r->r_action;
-	rtm->rtm_flags = r->r_flags;
-
-	if (r->r_dst_len)
-		RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
-	if (r->r_src_len)
-		RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
-	if (r->r_ifname[0])
-		RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
-	if (r->r_preference)
-		RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
-	if (r->r_srcmap)
-		RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
+
+	if (rule4->dst_len)
+		NLA_PUT_U32(skb, FRA_DST, rule4->dst);
+
+	if (rule4->src_len)
+		NLA_PUT_U32(skb, FRA_SRC, rule4->src);
+
 #ifdef CONFIG_NET_CLS_ROUTE
-	if (r->r_tclassid)
-		RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
+	if (rule4->tclassid)
+		NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
 #endif
-	nlh->nlmsg_len = skb->tail - b;
-	return skb->len;
+	return 0;
 
-nlmsg_failure:
-rtattr_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
+nla_put_failure:
+	return -ENOBUFS;
 }
 
-/* callers should hold rtnl semaphore */
+int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return fib_rules_dump(skb, cb, AF_INET);
+}
 
-static void rtmsg_rule(int event, struct fib_rule *r)
+static u32 fib4_rule_default_pref(void)
 {
-	int size = NLMSG_SPACE(sizeof(struct rtmsg) + 128);
-	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
-
-	if (!skb)
-		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_RULE, ENOBUFS);
-	else if (inet_fill_rule(skb, r, 0, 0, event, 0) < 0) {
-		kfree_skb(skb);
-		netlink_set_err(rtnl, 0, RTNLGRP_IPV4_RULE, EINVAL);
-	} else {
-		netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_RULE, GFP_KERNEL);
+	struct list_head *pos;
+	struct fib_rule *rule;
+
+	if (!list_empty(&fib4_rules)) {
+		pos = fib4_rules.next;
+		if (pos->next != &fib4_rules) {
+			rule = list_entry(pos->next, struct fib_rule, list);
+			if (rule->pref)
+				return rule->pref - 1;
+		}
 	}
+
+	return 0;
 }
 
-int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
+static struct fib_rules_ops fib4_rules_ops = {
+	.family		= AF_INET,
+	.rule_size	= sizeof(struct fib4_rule),
+	.action		= fib4_rule_action,
+	.match		= fib4_rule_match,
+	.configure	= fib4_rule_configure,
+	.compare	= fib4_rule_compare,
+	.fill		= fib4_rule_fill,
+	.default_pref	= fib4_rule_default_pref,
+	.nlgroup	= RTNLGRP_IPV4_RULE,
+	.policy		= fib4_rule_policy,
+	.rules_list	= &fib4_rules,
+	.owner		= THIS_MODULE,
+};
+
+void __init fib4_rules_init(void)
 {
-	int idx = 0;
-	int s_idx = cb->args[0];
-	struct fib_rule *r;
-	struct hlist_node *node;
-
-	rcu_read_lock();
-	hlist_for_each_entry(r, node, &fib_rules, hlist) {
-		if (idx < s_idx)
-			goto next;
-		if (inet_fill_rule(skb, r, NETLINK_CB(cb->skb).pid,
-				   cb->nlh->nlmsg_seq,
-				   RTM_NEWRULE, NLM_F_MULTI) < 0)
-			break;
-next:
-		idx++;
-	}
-	rcu_read_unlock();
-	cb->args[0] = idx;
+	list_add_tail(&local_rule.common.list, &fib4_rules);
+	list_add_tail(&main_rule.common.list, &fib4_rules);
+	list_add_tail(&default_rule.common.list, &fib4_rules);
 
-	return skb->len;
+	fib_rules_register(&fib4_rules_ops);
 }
 
-void __init fib_rules_init(void)
+void __exit fib4_rules_cleanup(void)
 {
-	INIT_HLIST_HEAD(&fib_rules);
-	hlist_add_head(&local_rule.hlist, &fib_rules);
-	hlist_add_after(&local_rule.hlist, &main_rule.hlist);
-	hlist_add_after(&main_rule.hlist, &default_rule.hlist);
-	register_netdevice_notifier(&fib_rules_notifier);
+	fib_rules_unregister(&fib4_rules_ops);
 }
-- 
cgit v1.2.3-70-g09d2


From fe4944e59c357f945f81bc67edb7ed1392e875ad Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 23:03:05 -0700
Subject: [NETLINK]: Extend netlink messaging interface

Adds:
 nlmsg_get_pos()                 return current position in message
 nlmsg_trim()                    trim part of message
 nla_reserve_nohdr(skb, len)     reserve room for an attribute w/o hdr
 nla_put_nohdr(skb, len, data)   add attribute w/o hdr
 nla_find_nested()               find attribute in nested attributes

Fixes nlmsg_new() to take allocation flags and consider size.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h   | 74 +++++++++++++++++++++++++++++++++++++++++-------
 kernel/taskstats.c      |  2 +-
 net/netlink/attr.c      | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/netlink/genetlink.c |  2 +-
 4 files changed, 141 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 640c26a90cf..3a5e40b1e04 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -35,6 +35,8 @@
  *   nlmsg_put()			add a netlink message to an skb
  *   nlmsg_put_answer()			callback based nlmsg_put()
  *   nlmsg_end()			finanlize netlink message
+ *   nlmsg_get_pos()			return current position in message
+ *   nlmsg_trim()			trim part of message
  *   nlmsg_cancel()			cancel message construction
  *   nlmsg_free()			free a netlink message
  *
@@ -80,8 +82,10 @@
  *   struct nlattr			netlink attribtue header
  *
  * Attribute Construction:
- *   nla_reserve(skb, type, len)	reserve skb tailroom for an attribute
+ *   nla_reserve(skb, type, len)	reserve room for an attribute
+ *   nla_reserve_nohdr(skb, len)	reserve room for an attribute w/o hdr
  *   nla_put(skb, type, len, data)	add attribute to skb
+ *   nla_put_nohdr(skb, len, data)	add attribute w/o hdr
  *
  * Attribute Construction for Basic Types:
  *   nla_put_u8(skb, type, value)	add u8 attribute to skb
@@ -139,6 +143,7 @@
  *   nla_next(nla, remaining)		get next netlink attribute
  *   nla_validate()			validate a stream of attributes
  *   nla_find()				find attribute in stream of attributes
+ *   nla_find_nested()			find attribute in nested attributes
  *   nla_parse()			parse and validate stream of attrs
  *   nla_parse_nested()			parse nested attribuets
  *   nla_for_each_attr()		loop over all attributes
@@ -203,12 +208,18 @@ extern int		nla_memcmp(const struct nlattr *nla, const void *data,
 extern int		nla_strcmp(const struct nlattr *nla, const char *str);
 extern struct nlattr *	__nla_reserve(struct sk_buff *skb, int attrtype,
 				      int attrlen);
+extern void *		__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
 extern struct nlattr *	nla_reserve(struct sk_buff *skb, int attrtype,
 				    int attrlen);
+extern void *		nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
 extern void		__nla_put(struct sk_buff *skb, int attrtype,
 				  int attrlen, const void *data);
+extern void		__nla_put_nohdr(struct sk_buff *skb, int attrlen,
+					const void *data);
 extern int		nla_put(struct sk_buff *skb, int attrtype,
 				int attrlen, const void *data);
+extern int		nla_put_nohdr(struct sk_buff *skb, int attrlen,
+				      const void *data);
 
 /**************************************************************************
  * Netlink Messages
@@ -453,12 +464,13 @@ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
 /**
  * nlmsg_new - Allocate a new netlink message
  * @size: maximum size of message
+ * @flags: the type of memory to allocate.
  *
  * Use NLMSG_GOODSIZE if size isn't know and you need a good default size.
  */
-static inline struct sk_buff *nlmsg_new(int size)
+static inline struct sk_buff *nlmsg_new(int size, gfp_t flags)
 {
-	return alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	return alloc_skb(size, flags);
 }
 
 /**
@@ -479,6 +491,32 @@ static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return skb->len;
 }
 
+/**
+ * nlmsg_get_pos - return current position in netlink message
+ * @skb: socket buffer the message is stored in
+ *
+ * Returns a pointer to the current tail of the message.
+ */
+static inline void *nlmsg_get_pos(struct sk_buff *skb)
+{
+	return skb->tail;
+}
+
+/**
+ * nlmsg_trim - Trim message to a mark
+ * @skb: socket buffer the message is stored in
+ * @mark: mark to trim to
+ *
+ * Trims the message to the provided mark. Returns -1.
+ */
+static inline int nlmsg_trim(struct sk_buff *skb, void *mark)
+{
+	if (mark)
+		skb_trim(skb, (unsigned char *) mark - skb->data);
+
+	return -1;
+}
+
 /**
  * nlmsg_cancel - Cancel construction of a netlink message
  * @skb: socket buffer the message is stored in
@@ -489,9 +527,7 @@ static inline int nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
  */
 static inline int nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	skb_trim(skb, (unsigned char *) nlh - skb->data);
-
-	return -1;
+	return nlmsg_trim(skb, nlh);
 }
 
 /**
@@ -630,6 +666,18 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 	return (struct nlattr *) ((char *) nla + totlen);
 }
 
+/**
+ * nla_find_nested - find attribute in a set of nested attributes
+ * @nla: attribute containing the nested attributes
+ * @attrtype: type of attribute to look for
+ *
+ * Returns the first attribute which matches the specified type.
+ */
+static inline struct nlattr *nla_find_nested(struct nlattr *nla, int attrtype)
+{
+	return nla_find(nla_data(nla), nla_len(nla), attrtype);
+}
+
 /**
  * nla_parse_nested - parse nested attributes
  * @tb: destination array with maxtype+1 elements
@@ -862,10 +910,7 @@ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
  */
 static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
 {
-	if (start)
-		skb_trim(skb, (unsigned char *) start - skb->data);
-
-	return -1;
+	return nlmsg_trim(skb, start);
 }
 
 /**
@@ -880,4 +925,13 @@ static inline int nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
+/**
+ * nla_for_each_nested - iterate over nested attributes
+ * @pos: loop counter, set to current attribute
+ * @nla: attribute containing the nested attributes
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_nested(pos, nla, rem) \
+	nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)
+
 #endif
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e7818765733..2ed4040d0dc 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	/*
 	 * If new attributes are added, please revisit this allocation
 	 */
-	skb = nlmsg_new(size);
+	skb = nlmsg_new(size, GFP_KERNEL);
 	if (!skb)
 		return -ENOMEM;
 
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
index dddbd15135a..136e529e578 100644
--- a/net/netlink/attr.c
+++ b/net/netlink/attr.c
@@ -254,6 +254,26 @@ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 	return nla;
 }
 
+/**
+ * __nla_reserve_nohdr - reserve room for attribute without header
+ * @skb: socket buffer to reserve room on
+ * @attrlen: length of attribute payload
+ *
+ * Reserves room for attribute payload without a header.
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the payload.
+ */
+void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
+{
+	void *start;
+
+	start = skb_put(skb, NLA_ALIGN(attrlen));
+	memset(start, 0, NLA_ALIGN(attrlen));
+
+	return start;
+}
+
 /**
  * nla_reserve - reserve room for attribute on the skb
  * @skb: socket buffer to reserve room on
@@ -274,6 +294,24 @@ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 	return __nla_reserve(skb, attrtype, attrlen);
 }
 
+/**
+ * nla_reserve - reserve room for attribute without header
+ * @skb: socket buffer to reserve room on
+ * @len: length of attribute payload
+ *
+ * Reserves room for attribute payload without a header.
+ *
+ * Returns NULL if the tailroom of the skb is insufficient to store
+ * the attribute payload.
+ */
+void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
+{
+	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
+		return NULL;
+
+	return __nla_reserve_nohdr(skb, attrlen);
+}
+
 /**
  * __nla_put - Add a netlink attribute to a socket buffer
  * @skb: socket buffer to add attribute to
@@ -293,6 +331,22 @@ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
 	memcpy(nla_data(nla), data, attrlen);
 }
 
+/**
+ * __nla_put_nohdr - Add a netlink attribute without header
+ * @skb: socket buffer to add attribute to
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * The caller is responsible to ensure that the skb provides enough
+ * tailroom for the attribute payload.
+ */
+void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
+{
+	void *start;
+
+	start = __nla_reserve_nohdr(skb, attrlen);
+	memcpy(start, data, attrlen);
+}
 
 /**
  * nla_put - Add a netlink attribute to a socket buffer
@@ -313,15 +367,36 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
 	return 0;
 }
 
+/**
+ * nla_put_nohdr - Add a netlink attribute without header
+ * @skb: socket buffer to add attribute to
+ * @attrlen: length of attribute payload
+ * @data: head of attribute payload
+ *
+ * Returns -1 if the tailroom of the skb is insufficient to store
+ * the attribute payload.
+ */
+int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
+{
+	if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen)))
+		return -1;
+
+	__nla_put_nohdr(skb, attrlen, data);
+	return 0;
+}
 
 EXPORT_SYMBOL(nla_validate);
 EXPORT_SYMBOL(nla_parse);
 EXPORT_SYMBOL(nla_find);
 EXPORT_SYMBOL(nla_strlcpy);
 EXPORT_SYMBOL(__nla_reserve);
+EXPORT_SYMBOL(__nla_reserve_nohdr);
 EXPORT_SYMBOL(nla_reserve);
+EXPORT_SYMBOL(nla_reserve_nohdr);
 EXPORT_SYMBOL(__nla_put);
+EXPORT_SYMBOL(__nla_put_nohdr);
 EXPORT_SYMBOL(nla_put);
+EXPORT_SYMBOL(nla_put_nohdr);
 EXPORT_SYMBOL(nla_memcpy);
 EXPORT_SYMBOL(nla_memcmp);
 EXPORT_SYMBOL(nla_strcmp);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index a298f77cc3e..75bb47a898d 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -440,7 +440,7 @@ static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid,
 	struct sk_buff *skb;
 	int err;
 
-	skb = nlmsg_new(NLMSG_GOODSIZE);
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (skb == NULL)
 		return ERR_PTR(-ENOBUFS);
 
-- 
cgit v1.2.3-70-g09d2


From 1823730fbc89fadde72a7bb3b7bdf03cc7b8835c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 23:04:54 -0700
Subject: [IPv4]: Move interface address bits to linux/if_addr.h

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_addr.h   | 53 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtnetlink.h | 56 -----------------------------------------------
 net/core/rtnetlink.c      |  1 +
 net/decnet/dn_dev.c       |  1 +
 net/ipv4/devinet.c        |  1 +
 net/ipv4/fib_frontend.c   |  1 +
 net/ipv6/addrconf.c       |  1 +
 net/ipv6/ndisc.c          |  1 +
 8 files changed, 59 insertions(+), 56 deletions(-)
 create mode 100644 include/linux/if_addr.h

(limited to 'include')

diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
new file mode 100644
index 00000000000..e1590454db5
--- /dev/null
+++ b/include/linux/if_addr.h
@@ -0,0 +1,53 @@
+#ifndef __LINUX_IF_ADDR_H
+#define __LINUX_IF_ADDR_H
+
+#include <linux/netlink.h>
+
+struct ifaddrmsg
+{
+	__u8		ifa_family;
+	__u8		ifa_prefixlen;	/* The prefix length		*/
+	__u8		ifa_flags;	/* Flags			*/
+	__u8		ifa_scope;	/* Address scope		*/
+	__u32		ifa_index;	/* Link index			*/
+};
+
+/*
+ * Important comment:
+ * IFA_ADDRESS is prefix address, rather than local interface address.
+ * It makes no difference for normally configured broadcast interfaces,
+ * but for point-to-point IFA_ADDRESS is DESTINATION address,
+ * local address is supplied in IFA_LOCAL attribute.
+ */
+enum
+{
+	IFA_UNSPEC,
+	IFA_ADDRESS,
+	IFA_LOCAL,
+	IFA_LABEL,
+	IFA_BROADCAST,
+	IFA_ANYCAST,
+	IFA_CACHEINFO,
+	IFA_MULTICAST,
+	__IFA_MAX,
+};
+
+#define IFA_MAX (__IFA_MAX - 1)
+
+/* ifa_flags */
+#define IFA_F_SECONDARY		0x01
+#define IFA_F_TEMPORARY		IFA_F_SECONDARY
+
+#define IFA_F_DEPRECATED	0x20
+#define IFA_F_TENTATIVE		0x40
+#define IFA_F_PERMANENT		0x80
+
+struct ifa_cacheinfo
+{
+	__u32	ifa_prefered;
+	__u32	ifa_valid;
+	__u32	cstamp; /* created timestamp, hundredths of seconds */
+	__u32	tstamp; /* updated timestamp, hundredths of seconds */
+};
+
+#endif
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index bf353538ae9..890c4d4038b 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -384,62 +384,6 @@ struct rta_session
 };
 
 
-/*********************************************************
- *		Interface address.
- ****/
-
-struct ifaddrmsg
-{
-	unsigned char	ifa_family;
-	unsigned char	ifa_prefixlen;	/* The prefix length		*/
-	unsigned char	ifa_flags;	/* Flags			*/
-	unsigned char	ifa_scope;	/* See above			*/
-	int		ifa_index;	/* Link index			*/
-};
-
-enum
-{
-	IFA_UNSPEC,
-	IFA_ADDRESS,
-	IFA_LOCAL,
-	IFA_LABEL,
-	IFA_BROADCAST,
-	IFA_ANYCAST,
-	IFA_CACHEINFO,
-	IFA_MULTICAST,
-	__IFA_MAX
-};
-
-#define IFA_MAX (__IFA_MAX - 1)
-
-/* ifa_flags */
-
-#define IFA_F_SECONDARY		0x01
-#define IFA_F_TEMPORARY		IFA_F_SECONDARY
-
-#define IFA_F_DEPRECATED	0x20
-#define IFA_F_TENTATIVE		0x40
-#define IFA_F_PERMANENT		0x80
-
-struct ifa_cacheinfo
-{
-	__u32	ifa_prefered;
-	__u32	ifa_valid;
-	__u32	cstamp; /* created timestamp, hundredths of seconds */
-	__u32	tstamp; /* updated timestamp, hundredths of seconds */
-};
-
-
-#define IFA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifaddrmsg))))
-#define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg))
-
-/*
-   Important comment:
-   IFA_ADDRESS is prefix address, rather than local interface address.
-   It makes no difference for normally configured broadcast interfaces,
-   but for point-to-point IFA_ADDRESS is DESTINATION address,
-   local address is supplied in IFA_LOCAL attribute.
- */
 
 /**************************************************************
  *		Neighbour discovery.
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index aa7cff2257b..35712031e2c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/security.h>
 #include <linux/mutex.h>
+#include <linux/if_addr.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 476455fbdb0..632c5a90b58 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -34,6 +34,7 @@
 #include <linux/seq_file.h>
 #include <linux/timer.h>
 #include <linux/string.h>
+#include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/skbuff.h>
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 80bf5b2ea2e..398e7b9ca66 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -43,6 +43,7 @@
 #include <linux/in.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/if_addr.h>
 #include <linux/if_ether.h>
 #include <linux/inet.h>
 #include <linux/netdevice.h>
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fe4a53d4d10..a83f1aa8034 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
 #include <linux/inet.h>
 #include <linux/inetdevice.h>
 #include <linux/netdevice.h>
+#include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ed766eebc02..c2a4db843e5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -48,6 +48,7 @@
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <linux/netdevice.h>
+#include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/if_arcnet.h>
 #include <linux/if_infiniband.h>
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 67cfc3813c3..5743e8bffef 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -62,6 +62,7 @@
 #include <linux/sysctl.h>
 #endif
 
+#include <linux/if_addr.h>
 #include <linux/if_arp.h>
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
-- 
cgit v1.2.3-70-g09d2


From 0844565fb8a9418f5a860aa480c1aef70319c9a2 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 4 Aug 2006 23:05:56 -0700
Subject: [NET]: Move netlink interface bits to linux/if.h

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if.h        | 129 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtnetlink.h | 133 +---------------------------------------------
 2 files changed, 130 insertions(+), 132 deletions(-)

(limited to 'include')

diff --git a/include/linux/if.h b/include/linux/if.h
index 374e20ad8b0..cd080d76532 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -212,5 +212,134 @@ struct ifconf
 #define	ifc_buf	ifc_ifcu.ifcu_buf		/* buffer address	*/
 #define	ifc_req	ifc_ifcu.ifcu_req		/* array of structures	*/
 
+/* The struct should be in sync with struct net_device_stats */
+struct rtnl_link_stats
+{
+	__u32	rx_packets;		/* total packets received	*/
+	__u32	tx_packets;		/* total packets transmitted	*/
+	__u32	rx_bytes;		/* total bytes received 	*/
+	__u32	tx_bytes;		/* total bytes transmitted	*/
+	__u32	rx_errors;		/* bad packets received		*/
+	__u32	tx_errors;		/* packet transmit problems	*/
+	__u32	rx_dropped;		/* no space in linux buffers	*/
+	__u32	tx_dropped;		/* no space available in linux	*/
+	__u32	multicast;		/* multicast packets received	*/
+	__u32	collisions;
+
+	/* detailed rx_errors: */
+	__u32	rx_length_errors;
+	__u32	rx_over_errors;		/* receiver ring buff overflow	*/
+	__u32	rx_crc_errors;		/* recved pkt with crc error	*/
+	__u32	rx_frame_errors;	/* recv'd frame alignment error */
+	__u32	rx_fifo_errors;		/* recv'r fifo overrun		*/
+	__u32	rx_missed_errors;	/* receiver missed packet	*/
+
+	/* detailed tx_errors */
+	__u32	tx_aborted_errors;
+	__u32	tx_carrier_errors;
+	__u32	tx_fifo_errors;
+	__u32	tx_heartbeat_errors;
+	__u32	tx_window_errors;
+
+	/* for cslip etc */
+	__u32	rx_compressed;
+	__u32	tx_compressed;
+};
+
+/* The struct should be in sync with struct ifmap */
+struct rtnl_link_ifmap
+{
+	__u64	mem_start;
+	__u64	mem_end;
+	__u64	base_addr;
+	__u16	irq;
+	__u8	dma;
+	__u8	port;
+};
+
+enum
+{
+	IFLA_UNSPEC,
+	IFLA_ADDRESS,
+	IFLA_BROADCAST,
+	IFLA_IFNAME,
+	IFLA_MTU,
+	IFLA_LINK,
+	IFLA_QDISC,
+	IFLA_STATS,
+	IFLA_COST,
+#define IFLA_COST IFLA_COST
+	IFLA_PRIORITY,
+#define IFLA_PRIORITY IFLA_PRIORITY
+	IFLA_MASTER,
+#define IFLA_MASTER IFLA_MASTER
+	IFLA_WIRELESS,		/* Wireless Extension event - see wireless.h */
+#define IFLA_WIRELESS IFLA_WIRELESS
+	IFLA_PROTINFO,		/* Protocol specific information for a link */
+#define IFLA_PROTINFO IFLA_PROTINFO
+	IFLA_TXQLEN,
+#define IFLA_TXQLEN IFLA_TXQLEN
+	IFLA_MAP,
+#define IFLA_MAP IFLA_MAP
+	IFLA_WEIGHT,
+#define IFLA_WEIGHT IFLA_WEIGHT
+	IFLA_OPERSTATE,
+	IFLA_LINKMODE,
+	__IFLA_MAX
+};
+
+
+#define IFLA_MAX (__IFLA_MAX - 1)
+
+/* ifi_flags.
+
+   IFF_* flags.
+
+   The only change is:
+   IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are
+   more not changeable by user. They describe link media
+   characteristics and set by device driver.
+
+   Comments:
+   - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid
+   - If neither of these three flags are set;
+     the interface is NBMA.
+
+   - IFF_MULTICAST does not mean anything special:
+   multicasts can be used on all not-NBMA links.
+   IFF_MULTICAST means that this media uses special encapsulation
+   for multicast frames. Apparently, all IFF_POINTOPOINT and
+   IFF_BROADCAST devices are able to use multicasts too.
+ */
+
+/* IFLA_LINK.
+   For usual devices it is equal ifi_index.
+   If it is a "virtual interface" (f.e. tunnel), ifi_link
+   can point to real physical interface (f.e. for bandwidth calculations),
+   or maybe 0, what means, that real media is unknown (usual
+   for IPIP tunnels, when route to endpoint is allowed to change)
+ */
+
+/* Subtype attributes for IFLA_PROTINFO */
+enum
+{
+	IFLA_INET6_UNSPEC,
+	IFLA_INET6_FLAGS,	/* link flags			*/
+	IFLA_INET6_CONF,	/* sysctl parameters		*/
+	IFLA_INET6_STATS,	/* statistics			*/
+	IFLA_INET6_MCAST,	/* MC things. What of them?	*/
+	IFLA_INET6_CACHEINFO,	/* time values and max reasm size */
+	__IFLA_INET6_MAX
+};
+
+#define IFLA_INET6_MAX	(__IFLA_INET6_MAX - 1)
+
+struct ifla_cacheinfo
+{
+	__u32	max_reasm_len;
+	__u32	tstamp;		/* ipv6InterfaceTable updated timestamp */
+	__u32	reachable_time;
+	__u32	retrans_time;
+};
 
 #endif /* _LINUX_IF_H */
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 890c4d4038b..84f3eb426da 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -2,6 +2,7 @@
 #define __LINUX_RTNETLINK_H
 
 #include <linux/netlink.h>
+#include <linux/if.h>
 
 /****
  *		Routing/neighbour discovery messages.
@@ -607,138 +608,6 @@ struct prefix_cacheinfo
 	__u32	valid_time;
 };
 
-/* The struct should be in sync with struct net_device_stats */
-struct rtnl_link_stats
-{
-	__u32	rx_packets;		/* total packets received	*/
-	__u32	tx_packets;		/* total packets transmitted	*/
-	__u32	rx_bytes;		/* total bytes received 	*/
-	__u32	tx_bytes;		/* total bytes transmitted	*/
-	__u32	rx_errors;		/* bad packets received		*/
-	__u32	tx_errors;		/* packet transmit problems	*/
-	__u32	rx_dropped;		/* no space in linux buffers	*/
-	__u32	tx_dropped;		/* no space available in linux	*/
-	__u32	multicast;		/* multicast packets received	*/
-	__u32	collisions;
-
-	/* detailed rx_errors: */
-	__u32	rx_length_errors;
-	__u32	rx_over_errors;		/* receiver ring buff overflow	*/
-	__u32	rx_crc_errors;		/* recved pkt with crc error	*/
-	__u32	rx_frame_errors;	/* recv'd frame alignment error */
-	__u32	rx_fifo_errors;		/* recv'r fifo overrun		*/
-	__u32	rx_missed_errors;	/* receiver missed packet	*/
-
-	/* detailed tx_errors */
-	__u32	tx_aborted_errors;
-	__u32	tx_carrier_errors;
-	__u32	tx_fifo_errors;
-	__u32	tx_heartbeat_errors;
-	__u32	tx_window_errors;
-	
-	/* for cslip etc */
-	__u32	rx_compressed;
-	__u32	tx_compressed;
-};
-
-/* The struct should be in sync with struct ifmap */
-struct rtnl_link_ifmap
-{
-	__u64	mem_start;
-	__u64	mem_end;
-	__u64	base_addr;
-	__u16	irq;
-	__u8	dma;
-	__u8	port;
-};
-
-enum
-{
-	IFLA_UNSPEC,
-	IFLA_ADDRESS,
-	IFLA_BROADCAST,
-	IFLA_IFNAME,
-	IFLA_MTU,
-	IFLA_LINK,
-	IFLA_QDISC,
-	IFLA_STATS,
-	IFLA_COST,
-#define IFLA_COST IFLA_COST
-	IFLA_PRIORITY,
-#define IFLA_PRIORITY IFLA_PRIORITY
-	IFLA_MASTER,
-#define IFLA_MASTER IFLA_MASTER
-	IFLA_WIRELESS,		/* Wireless Extension event - see wireless.h */
-#define IFLA_WIRELESS IFLA_WIRELESS
-	IFLA_PROTINFO,		/* Protocol specific information for a link */
-#define IFLA_PROTINFO IFLA_PROTINFO
-	IFLA_TXQLEN,
-#define IFLA_TXQLEN IFLA_TXQLEN
-	IFLA_MAP,
-#define IFLA_MAP IFLA_MAP
-	IFLA_WEIGHT,
-#define IFLA_WEIGHT IFLA_WEIGHT
-	IFLA_OPERSTATE,
-	IFLA_LINKMODE,
-	__IFLA_MAX
-};
-
-
-#define IFLA_MAX (__IFLA_MAX - 1)
-
-#define IFLA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg))))
-#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg))
-
-/* ifi_flags.
-
-   IFF_* flags.
-
-   The only change is:
-   IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are
-   more not changeable by user. They describe link media
-   characteristics and set by device driver.
-
-   Comments:
-   - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid
-   - If neither of these three flags are set;
-     the interface is NBMA.
-
-   - IFF_MULTICAST does not mean anything special:
-   multicasts can be used on all not-NBMA links.
-   IFF_MULTICAST means that this media uses special encapsulation
-   for multicast frames. Apparently, all IFF_POINTOPOINT and
-   IFF_BROADCAST devices are able to use multicasts too.
- */
-
-/* IFLA_LINK.
-   For usual devices it is equal ifi_index.
-   If it is a "virtual interface" (f.e. tunnel), ifi_link
-   can point to real physical interface (f.e. for bandwidth calculations),
-   or maybe 0, what means, that real media is unknown (usual
-   for IPIP tunnels, when route to endpoint is allowed to change)
- */
-
-/* Subtype attributes for IFLA_PROTINFO */
-enum
-{
-	IFLA_INET6_UNSPEC,
-	IFLA_INET6_FLAGS,	/* link flags			*/
-	IFLA_INET6_CONF,	/* sysctl parameters		*/
-	IFLA_INET6_STATS,	/* statistics			*/
-	IFLA_INET6_MCAST,	/* MC things. What of them?	*/
-	IFLA_INET6_CACHEINFO,	/* time values and max reasm size */
-	__IFLA_INET6_MAX
-};
-
-#define IFLA_INET6_MAX	(__IFLA_INET6_MAX - 1)
-
-struct ifla_cacheinfo
-{
-	__u32	max_reasm_len;
-	__u32	tstamp;		/* ipv6InterfaceTable updated timestamp */
-	__u32	reachable_time;
-	__u32	retrans_time;
-};
 
 /*****************************************************************
  *		Traffic control messages.
-- 
cgit v1.2.3-70-g09d2


From 84fa7933a33f806bbbaae6775e87459b1ec584c0 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 29 Aug 2006 16:44:56 -0700
Subject: [NET]: Replace CHECKSUM_HW by CHECKSUM_PARTIAL/CHECKSUM_COMPLETE

Replace CHECKSUM_HW by CHECKSUM_PARTIAL (for outgoing packets, whose
checksum still needs to be completed) and CHECKSUM_COMPLETE (for
incoming packets, device supplied full checksum).

Patch originally from Herbert Xu, updated by myself for 2.6.18-rc3.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/he.c                            |  2 +-
 drivers/net/3c59x.c                         |  2 +-
 drivers/net/8139cp.c                        |  6 +++---
 drivers/net/acenic.c                        |  8 ++++----
 drivers/net/bnx2.c                          |  2 +-
 drivers/net/cassini.c                       |  4 ++--
 drivers/net/chelsio/sge.c                   | 10 +++++-----
 drivers/net/dl2k.c                          |  2 +-
 drivers/net/e1000/e1000_main.c              |  8 ++++----
 drivers/net/forcedeth.c                     |  3 ++-
 drivers/net/gianfar.c                       |  2 +-
 drivers/net/hamachi.c                       |  2 +-
 drivers/net/ibm_emac/ibm_emac_core.c        |  2 +-
 drivers/net/ioc3-eth.c                      |  2 +-
 drivers/net/ixgb/ixgb_main.c                |  2 +-
 drivers/net/mv643xx_eth.c                   |  2 +-
 drivers/net/myri10ge/myri10ge.c             |  8 ++++----
 drivers/net/ns83820.c                       |  2 +-
 drivers/net/r8169.c                         |  2 +-
 drivers/net/s2io.c                          |  2 +-
 drivers/net/sk98lin/skge.c                  |  6 +++---
 drivers/net/skge.c                          |  4 ++--
 drivers/net/sky2.c                          |  6 +++---
 drivers/net/starfire.c                      |  6 +++---
 drivers/net/sungem.c                        |  4 ++--
 drivers/net/sunhme.c                        |  6 +++---
 drivers/net/tg3.c                           |  6 +++---
 drivers/net/typhoon.c                       |  2 +-
 drivers/net/via-rhine.c                     |  2 +-
 drivers/net/via-velocity.c                  |  2 +-
 include/linux/netdevice.h                   |  4 ++--
 include/linux/skbuff.h                      | 17 +++++++++--------
 net/core/datagram.c                         |  4 ++--
 net/core/dev.c                              | 12 ++++++------
 net/core/netpoll.c                          |  2 +-
 net/core/skbuff.c                           | 14 +++++++-------
 net/ipv4/icmp.c                             |  2 +-
 net/ipv4/igmp.c                             |  2 +-
 net/ipv4/ip_fragment.c                      |  2 +-
 net/ipv4/ip_gre.c                           |  4 ++--
 net/ipv4/ip_output.c                        |  6 +++---
 net/ipv4/ipvs/ip_vs_proto_tcp.c             |  8 ++++----
 net/ipv4/ipvs/ip_vs_proto_udp.c             |  8 ++++----
 net/ipv4/netfilter.c                        |  2 +-
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c |  3 +--
 net/ipv4/netfilter/ip_conntrack_proto_udp.c |  3 +--
 net/ipv4/netfilter/ip_nat_standalone.c      |  5 +++--
 net/ipv4/netfilter/ip_queue.c               |  6 +++---
 net/ipv4/netfilter/ipt_ECN.c                |  9 +++++----
 net/ipv4/netfilter/ipt_TCPMSS.c             |  5 +++--
 net/ipv4/tcp.c                              |  8 ++++----
 net/ipv4/tcp_ipv4.c                         |  6 +++---
 net/ipv4/tcp_output.c                       | 18 ++++++++----------
 net/ipv4/udp.c                              |  6 +++---
 net/ipv4/xfrm4_output.c                     |  4 ++--
 net/ipv6/exthdrs.c                          |  2 +-
 net/ipv6/icmp.c                             |  2 +-
 net/ipv6/ip6_output.c                       |  2 +-
 net/ipv6/netfilter.c                        |  2 +-
 net/ipv6/netfilter/ip6_queue.c              |  6 +++---
 net/ipv6/netfilter/nf_conntrack_reasm.c     |  6 +++---
 net/ipv6/raw.c                              |  2 +-
 net/ipv6/reassembly.c                       |  6 +++---
 net/ipv6/tcp_ipv6.c                         |  6 +++---
 net/ipv6/udp.c                              |  2 +-
 net/ipv6/xfrm6_output.c                     |  4 ++--
 net/netfilter/nf_conntrack_proto_tcp.c      |  3 +--
 net/netfilter/nf_conntrack_proto_udp.c      |  3 +--
 net/netfilter/nfnetlink_queue.c             |  6 +++---
 net/packet/af_packet.c                      |  2 +-
 net/sched/sch_netem.c                       |  4 ++--
 net/sunrpc/socklib.c                        |  2 +-
 72 files changed, 168 insertions(+), 169 deletions(-)

(limited to 'include')

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index ffcb9fd31c3..41e052fecd7 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1912,7 +1912,7 @@ he_service_rbrq(struct he_dev *he_dev, int group)
 				skb->tail = skb->data + skb->len;
 #ifdef USE_CHECKSUM_HW
 				if (vcc->vpi == 0 && vcc->vci >= ATM_NOT_RSV_VCI) {
-					skb->ip_summed = CHECKSUM_HW;
+					skb->ip_summed = CHECKSUM_COMPLETE;
 					skb->csum = TCP_CKSUM(skb->data,
 							he_vcc->pdu_len);
 				}
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index 80e8ca013e4..29dede2eaa8 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -2077,7 +2077,7 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	vp->tx_ring[entry].next = 0;
 #if DO_ZEROCOPY
-	if (skb->ip_summed != CHECKSUM_HW)
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
 			vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded);
 	else
 			vp->tx_ring[entry].status = cpu_to_le32(skb->len | TxIntrUploaded | AddTCPChksum | AddUDPChksum);
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index 1428bb7715a..a48b211c489 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -813,7 +813,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev)
 
 		if (mss)
 			flags |= LargeSend | ((mss & MSSMask) << MSSShift);
-		else if (skb->ip_summed == CHECKSUM_HW) {
+		else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			const struct iphdr *ip = skb->nh.iph;
 			if (ip->protocol == IPPROTO_TCP)
 				flags |= IPCS | TCPCS;
@@ -867,7 +867,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev)
 			if (mss)
 				ctrl |= LargeSend |
 					((mss & MSSMask) << MSSShift);
-			else if (skb->ip_summed == CHECKSUM_HW) {
+			else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 				if (ip->protocol == IPPROTO_TCP)
 					ctrl |= IPCS | TCPCS;
 				else if (ip->protocol == IPPROTO_UDP)
@@ -898,7 +898,7 @@ static int cp_start_xmit (struct sk_buff *skb, struct net_device *dev)
 		txd->addr = cpu_to_le64(first_mapping);
 		wmb();
 
-		if (skb->ip_summed == CHECKSUM_HW) {
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			if (ip->protocol == IPPROTO_TCP)
 				txd->opts1 = cpu_to_le32(first_eor | first_len |
 							 FirstFrag | DescOwn |
diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c
index 1c01e9b3d07..826548644d7 100644
--- a/drivers/net/acenic.c
+++ b/drivers/net/acenic.c
@@ -2040,7 +2040,7 @@ static void ace_rx_int(struct net_device *dev, u32 rxretprd, u32 rxretcsm)
 		 */
 		if (bd_flags & BD_FLG_TCP_UDP_SUM) {
 			skb->csum = htons(csum);
-			skb->ip_summed = CHECKSUM_HW;
+			skb->ip_summed = CHECKSUM_COMPLETE;
 		} else {
 			skb->ip_summed = CHECKSUM_NONE;
 		}
@@ -2511,7 +2511,7 @@ restart:
 
 		mapping = ace_map_tx_skb(ap, skb, skb, idx);
 		flagsize = (skb->len << 16) | (BD_FLG_END);
-		if (skb->ip_summed == CHECKSUM_HW)
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
 			flagsize |= BD_FLG_TCP_UDP_SUM;
 #if ACENIC_DO_VLAN
 		if (vlan_tx_tag_present(skb)) {
@@ -2534,7 +2534,7 @@ restart:
 
 		mapping = ace_map_tx_skb(ap, skb, NULL, idx);
 		flagsize = (skb_headlen(skb) << 16);
-		if (skb->ip_summed == CHECKSUM_HW)
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
 			flagsize |= BD_FLG_TCP_UDP_SUM;
 #if ACENIC_DO_VLAN
 		if (vlan_tx_tag_present(skb)) {
@@ -2560,7 +2560,7 @@ restart:
 					       PCI_DMA_TODEVICE);
 
 			flagsize = (frag->size << 16);
-			if (skb->ip_summed == CHECKSUM_HW)
+			if (skb->ip_summed == CHECKSUM_PARTIAL)
 				flagsize |= BD_FLG_TCP_UDP_SUM;
 			idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap);
 
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 652eb05a6c2..7857b463012 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -4423,7 +4423,7 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	ring_prod = TX_RING_IDX(prod);
 
 	vlan_tag_flags = 0;
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		vlan_tag_flags |= TX_BD_FLAGS_TCP_UDP_CKSUM;
 	}
 
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index a31544ccb3c..558fdb8ad2d 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -2167,7 +2167,7 @@ end_copy_pkt:
 			cas_page_unmap(addr);
 	}
 	skb->csum = ntohs(i ^ 0xffff);
-	skb->ip_summed = CHECKSUM_HW;
+	skb->ip_summed = CHECKSUM_COMPLETE;
 	skb->protocol = eth_type_trans(skb, cp->dev);
 	return len;
 }
@@ -2821,7 +2821,7 @@ static inline int cas_xmit_tx_ringN(struct cas *cp, int ring,
 	}
 
 	ctrl = 0;
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		u64 csum_start_off, csum_stuff_off;
 
 		csum_start_off = (u64) (skb->h.raw - skb->data);
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
index 61b3754f50f..ddd0bdb498f 100644
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1470,9 +1470,9 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 
 		if (!(adapter->flags & UDP_CSUM_CAPABLE) &&
-		    skb->ip_summed == CHECKSUM_HW &&
+		    skb->ip_summed == CHECKSUM_PARTIAL &&
 		    skb->nh.iph->protocol == IPPROTO_UDP)
-			if (unlikely(skb_checksum_help(skb, 0))) {
+			if (unlikely(skb_checksum_help(skb))) {
 				dev_kfree_skb_any(skb);
 				return NETDEV_TX_OK;
 			}
@@ -1495,11 +1495,11 @@ int t1_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		cpl = (struct cpl_tx_pkt *)__skb_push(skb, sizeof(*cpl));
 		cpl->opcode = CPL_TX_PKT;
 		cpl->ip_csum_dis = 1;    /* SW calculates IP csum */
-		cpl->l4_csum_dis = skb->ip_summed == CHECKSUM_HW ? 0 : 1;
+		cpl->l4_csum_dis = skb->ip_summed == CHECKSUM_PARTIAL ? 0 : 1;
 		/* the length field isn't used so don't bother setting it */
 
-		st->tx_cso += (skb->ip_summed == CHECKSUM_HW);
-		sge->stats.tx_do_cksum += (skb->ip_summed == CHECKSUM_HW);
+		st->tx_cso += (skb->ip_summed == CHECKSUM_PARTIAL);
+		sge->stats.tx_do_cksum += (skb->ip_summed == CHECKSUM_PARTIAL);
 		sge->stats.tx_reg_pkts++;
 	}
 	cpl->iff = dev->if_port;
diff --git a/drivers/net/dl2k.c b/drivers/net/dl2k.c
index 402961e68c8..b74e6765476 100644
--- a/drivers/net/dl2k.c
+++ b/drivers/net/dl2k.c
@@ -611,7 +611,7 @@ start_xmit (struct sk_buff *skb, struct net_device *dev)
 	txdesc = &np->tx_ring[entry];
 
 #if 0
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		txdesc->status |=
 		    cpu_to_le64 (TCPChecksumEnable | UDPChecksumEnable |
 				 IPChecksumEnable);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 98ef9f85482..2ab9f96f5da 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -2600,7 +2600,7 @@ e1000_tx_csum(struct e1000_adapter *adapter, struct e1000_tx_ring *tx_ring,
 	unsigned int i;
 	uint8_t css;
 
-	if (likely(skb->ip_summed == CHECKSUM_HW)) {
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
 		css = skb->h.raw - skb->data;
 
 		i = tx_ring->next_to_use;
@@ -2927,11 +2927,11 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 	/* reserve a descriptor for the offload context */
-	if ((mss) || (skb->ip_summed == CHECKSUM_HW))
+	if ((mss) || (skb->ip_summed == CHECKSUM_PARTIAL))
 		count++;
 	count++;
 #else
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		count++;
 #endif
 
@@ -3608,7 +3608,7 @@ e1000_rx_checksum(struct e1000_adapter *adapter,
 		 */
 		csum = ntohl(csum ^ 0xFFFF);
 		skb->csum = csum;
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_COMPLETE;
 	}
 	adapter->hw_csum_good++;
 }
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 11b8f1b43dd..32cacf115f7 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1503,7 +1503,8 @@ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
 	else
 #endif
-	tx_flags_extra = (skb->ip_summed == CHECKSUM_HW ? (NV_TX2_CHECKSUM_L3|NV_TX2_CHECKSUM_L4) : 0);
+	tx_flags_extra = skb->ip_summed == CHECKSUM_PARTIAL ?
+			 NV_TX2_CHECKSUM_L3 | NV_TX2_CHECKSUM_L4 : 0;
 
 	/* vlan tag */
 	if (np->vlangrp && vlan_tx_tag_present(skb)) {
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index ebbbd6ca620..ba960913c03 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -947,7 +947,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/* Set up checksumming */
 	if (likely((dev->features & NETIF_F_IP_CSUM)
-			&& (CHECKSUM_HW == skb->ip_summed))) {
+			&& (CHECKSUM_PARTIAL == skb->ip_summed))) {
 		fcb = gfar_add_fcb(skb, txbdp);
 		status |= TXBD_TOE;
 		gfar_tx_checksum(skb, fcb);
diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c
index 409c6aab041..763373ae966 100644
--- a/drivers/net/hamachi.c
+++ b/drivers/net/hamachi.c
@@ -1648,7 +1648,7 @@ static int hamachi_rx(struct net_device *dev)
 						* could do the pseudo myself and return
 						* CHECKSUM_UNNECESSARY
 						*/
-						skb->ip_summed = CHECKSUM_HW;
+						skb->ip_summed = CHECKSUM_COMPLETE;
 					}
 				}	
 			}
diff --git a/drivers/net/ibm_emac/ibm_emac_core.c b/drivers/net/ibm_emac/ibm_emac_core.c
index 82468e2dc79..57e214d85e9 100644
--- a/drivers/net/ibm_emac/ibm_emac_core.c
+++ b/drivers/net/ibm_emac/ibm_emac_core.c
@@ -1036,7 +1036,7 @@ static inline u16 emac_tx_csum(struct ocp_enet_private *dev,
 			       struct sk_buff *skb)
 {
 #if defined(CONFIG_IBM_EMAC_TAH)
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		++dev->stats.tx_packets_csum;
 		return EMAC_TX_CTRL_TAH_CSUM;
 	}
diff --git a/drivers/net/ioc3-eth.c b/drivers/net/ioc3-eth.c
index 68d8af7df08..65f897ddb92 100644
--- a/drivers/net/ioc3-eth.c
+++ b/drivers/net/ioc3-eth.c
@@ -1387,7 +1387,7 @@ static int ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * MAC header which should not be summed and the TCP/UDP pseudo headers
 	 * manually.
 	 */
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		int proto = ntohs(skb->nh.iph->protocol);
 		unsigned int csoff;
 		struct iphdr *ih = skb->nh.iph;
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
index 7bbd447289b..9405b44f321 100644
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1232,7 +1232,7 @@ ixgb_tx_csum(struct ixgb_adapter *adapter, struct sk_buff *skb)
 	unsigned int i;
 	uint8_t css, cso;
 
-	if(likely(skb->ip_summed == CHECKSUM_HW)) {
+	if(likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
 		css = skb->h.raw - skb->data;
 		cso = (skb->h.raw + skb->csum) - skb->data;
 
diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c
index eeab1df5bef..38df58fdb35 100644
--- a/drivers/net/mv643xx_eth.c
+++ b/drivers/net/mv643xx_eth.c
@@ -1147,7 +1147,7 @@ static void eth_tx_submit_descs_for_skb(struct mv643xx_private *mp,
 	desc->byte_cnt = length;
 	desc->buf_ptr = dma_map_single(NULL, skb->data, length, DMA_TO_DEVICE);
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		BUG_ON(skb->protocol != ETH_P_IP);
 
 		cmd_sts |= ETH_GEN_TCP_UDP_CHECKSUM |
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 9bdd43ab357..9f16681d0e7 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -930,7 +930,7 @@ static inline void myri10ge_vlan_ip_csum(struct sk_buff *skb, u16 hw_csum)
 	    (vh->h_vlan_encapsulated_proto == htons(ETH_P_IP) ||
 	     vh->h_vlan_encapsulated_proto == htons(ETH_P_IPV6))) {
 		skb->csum = hw_csum;
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_COMPLETE;
 	}
 }
 
@@ -973,7 +973,7 @@ myri10ge_rx_done(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
 		if ((skb->protocol == ntohs(ETH_P_IP)) ||
 		    (skb->protocol == ntohs(ETH_P_IPV6))) {
 			skb->csum = ntohs((u16) csum);
-			skb->ip_summed = CHECKSUM_HW;
+			skb->ip_summed = CHECKSUM_COMPLETE;
 		} else
 			myri10ge_vlan_ip_csum(skb, ntohs((u16) csum));
 	}
@@ -1897,13 +1897,13 @@ again:
 	pseudo_hdr_offset = 0;
 	odd_flag = 0;
 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
-	if (likely(skb->ip_summed == CHECKSUM_HW)) {
+	if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
 		cksum_offset = (skb->h.raw - skb->data);
 		pseudo_hdr_offset = (skb->h.raw + skb->csum) - skb->data;
 		/* If the headers are excessively large, then we must
 		 * fall back to a software checksum */
 		if (unlikely(cksum_offset > 255 || pseudo_hdr_offset > 127)) {
-			if (skb_checksum_help(skb, 0))
+			if (skb_checksum_help(skb))
 				goto drop;
 			cksum_offset = 0;
 			pseudo_hdr_offset = 0;
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index 0e76859c90a..5143f5dbb2e 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -1153,7 +1153,7 @@ again:
 	if (!nr_frags)
 		frag = NULL;
 	extsts = 0;
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		extsts |= EXTSTS_IPPKT;
 		if (IPPROTO_TCP == skb->nh.iph->protocol)
 			extsts |= EXTSTS_TCPPKT;
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 4c2f575faad..d9b960aa9b0 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2169,7 +2169,7 @@ static inline u32 rtl8169_tso_csum(struct sk_buff *skb, struct net_device *dev)
 		if (mss)
 			return LargeSend | ((mss & MSSMask) << MSSShift);
 	}
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		const struct iphdr *ip = skb->nh.iph;
 
 		if (ip->protocol == IPPROTO_TCP)
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
index e72e0e09906..5b3713f622d 100644
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -3893,7 +3893,7 @@ static int s2io_xmit(struct sk_buff *skb, struct net_device *dev)
 		txdp->Control_1 |= TXD_TCP_LSO_MSS(s2io_tcp_mss(skb));
 	}
 #endif
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		txdp->Control_2 |=
 		    (TXD_TX_CKO_IPV4_EN | TXD_TX_CKO_TCP_EN |
 		     TXD_TX_CKO_UDP_EN);
diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c
index ee62845d3ac..eb3b35180c2 100644
--- a/drivers/net/sk98lin/skge.c
+++ b/drivers/net/sk98lin/skge.c
@@ -1559,7 +1559,7 @@ struct sk_buff	*pMessage)	/* pointer to send-message              */
 	pTxd->VDataHigh = (SK_U32) (PhysAddr >> 32);
 	pTxd->pMBuf     = pMessage;
 
-	if (pMessage->ip_summed == CHECKSUM_HW) {
+	if (pMessage->ip_summed == CHECKSUM_PARTIAL) {
 		u16 hdrlen = pMessage->h.raw - pMessage->data;
 		u16 offset = hdrlen + pMessage->csum;
 
@@ -1678,7 +1678,7 @@ struct sk_buff	*pMessage)	/* pointer to send-message              */
 	/* 
 	** Does the HW need to evaluate checksum for TCP or UDP packets? 
 	*/
-	if (pMessage->ip_summed == CHECKSUM_HW) {
+	if (pMessage->ip_summed == CHECKSUM_PARTIAL) {
 		u16 hdrlen = pMessage->h.raw - pMessage->data;
 		u16 offset = hdrlen + pMessage->csum;
 
@@ -2158,7 +2158,7 @@ rx_start:
 
 #ifdef USE_SK_RX_CHECKSUM
 		pMsg->csum = pRxd->TcpSums & 0xffff;
-		pMsg->ip_summed = CHECKSUM_HW;
+		pMsg->ip_summed = CHECKSUM_COMPLETE;
 #else
 		pMsg->ip_summed = CHECKSUM_NONE;
 #endif
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index ad878dfddef..b3d6fa3d6df 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -2338,7 +2338,7 @@ static int skge_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 	td->dma_lo = map;
 	td->dma_hi = map >> 32;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		int offset = skb->h.raw - skb->data;
 
 		/* This seems backwards, but it is what the sk98lin
@@ -2642,7 +2642,7 @@ static inline struct sk_buff *skge_rx_get(struct skge_port *skge,
 	skb->dev = skge->netdev;
 	if (skge->rx_csum) {
 		skb->csum = csum;
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_COMPLETE;
 	}
 
 	skb->protocol = eth_type_trans(skb, skge->netdev);
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 933e87f1cc6..8e92566b587 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1163,7 +1163,7 @@ static unsigned tx_le_req(const struct sk_buff *skb)
 	if (skb_is_gso(skb))
 		++count;
 
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		++count;
 
 	return count;
@@ -1272,7 +1272,7 @@ static int sky2_xmit_frame(struct sk_buff *skb, struct net_device *dev)
 #endif
 
 	/* Handle TCP checksum offload */
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		u16 hdr = skb->h.raw - skb->data;
 		u16 offset = hdr + skb->csum;
 
@@ -2000,7 +2000,7 @@ static int sky2_status_intr(struct sky2_hw *hw, int to_do)
 #endif
 		case OP_RXCHKS:
 			skb = sky2->rx_ring[sky2->rx_next].skb;
-			skb->ip_summed = CHECKSUM_HW;
+			skb->ip_summed = CHECKSUM_COMPLETE;
 			skb->csum = le16_to_cpu(status);
 			break;
 
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c
index c0a62b00ffc..2607aa51d8e 100644
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -1230,7 +1230,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev)
 	}
 
 #if defined(ZEROCOPY) && defined(HAS_BROKEN_FIRMWARE)
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		if (skb_padto(skb, (skb->len + PADDING_MASK) & ~PADDING_MASK))
 			return NETDEV_TX_OK;
 	}
@@ -1252,7 +1252,7 @@ static int start_tx(struct sk_buff *skb, struct net_device *dev)
 				status |= TxDescIntr;
 				np->reap_tx = 0;
 			}
-			if (skb->ip_summed == CHECKSUM_HW) {
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
 				status |= TxCalTCP;
 				np->stats.tx_compressed++;
 			}
@@ -1499,7 +1499,7 @@ static int __netdev_rx(struct net_device *dev, int *quota)
 		 * Until then, the printk stays. :-) -Ion
 		 */
 		else if (le16_to_cpu(desc->status2) & 0x0040) {
-			skb->ip_summed = CHECKSUM_HW;
+			skb->ip_summed = CHECKSUM_COMPLETE;
 			skb->csum = le16_to_cpu(desc->csum);
 			printk(KERN_DEBUG "%s: checksum_hw, status2 = %#x\n", dev->name, le16_to_cpu(desc->status2));
 		}
diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index d7b1d1882ca..b388651b783 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -855,7 +855,7 @@ static int gem_rx(struct gem *gp, int work_to_do)
 		}
 
 		skb->csum = ntohs((status & RXDCTRL_TCPCSUM) ^ 0xffff);
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_COMPLETE;
 		skb->protocol = eth_type_trans(skb, gp->dev);
 
 		netif_receive_skb(skb);
@@ -1026,7 +1026,7 @@ static int gem_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned long flags;
 
 	ctrl = 0;
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		u64 csum_start_off, csum_stuff_off;
 
 		csum_start_off = (u64) (skb->h.raw - skb->data);
diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c
index c6f5bc3c042..17981da2273 100644
--- a/drivers/net/sunhme.c
+++ b/drivers/net/sunhme.c
@@ -1207,7 +1207,7 @@ static void happy_meal_transceiver_check(struct happy_meal *hp, void __iomem *tr
  * flags, thus:
  *
  * 	skb->csum = rxd->rx_flags & 0xffff;
- * 	skb->ip_summed = CHECKSUM_HW;
+ * 	skb->ip_summed = CHECKSUM_COMPLETE;
  *
  * before sending off the skb to the protocols, and we are good as gold.
  */
@@ -2074,7 +2074,7 @@ static void happy_meal_rx(struct happy_meal *hp, struct net_device *dev)
 
 		/* This card is _fucking_ hot... */
 		skb->csum = ntohs(csum ^ 0xffff);
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_COMPLETE;
 
 		RXD(("len=%d csum=%4x]", len, csum));
 		skb->protocol = eth_type_trans(skb, dev);
@@ -2268,7 +2268,7 @@ static int happy_meal_start_xmit(struct sk_buff *skb, struct net_device *dev)
  	u32 tx_flags;
 
 	tx_flags = TXFLAG_OWN;
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		u32 csum_start_off, csum_stuff_off;
 
 		csum_start_off = (u32) (skb->h.raw - skb->data);
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index eafabb253f0..6f5d3a38c58 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3851,11 +3851,11 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		skb->h.th->check = 0;
 
 	}
-	else if (skb->ip_summed == CHECKSUM_HW)
+	else if (skb->ip_summed == CHECKSUM_PARTIAL)
 		base_flags |= TXD_FLAG_TCPUDP_CSUM;
 #else
 	mss = 0;
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		base_flags |= TXD_FLAG_TCPUDP_CSUM;
 #endif
 #if TG3_VLAN_TAG_USED
@@ -3981,7 +3981,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev)
 
 	entry = tp->tx_prod;
 	base_flags = 0;
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		base_flags |= TXD_FLAG_TCPUDP_CSUM;
 #if TG3_TSO_SUPPORT != 0
 	mss = 0;
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index 4103c37172f..c6e601dc6bb 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -830,7 +830,7 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev)
 	first_txd->addrHi = (u64)((unsigned long) skb) >> 32;
 	first_txd->processFlags = 0;
 
-	if(skb->ip_summed == CHECKSUM_HW) {
+	if(skb->ip_summed == CHECKSUM_PARTIAL) {
 		/* The 3XP will figure out if this is UDP/TCP */
 		first_txd->processFlags |= TYPHOON_TX_PF_TCP_CHKSUM;
 		first_txd->processFlags |= TYPHOON_TX_PF_UDP_CHKSUM;
diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c
index ae971080e2e..66547159bfd 100644
--- a/drivers/net/via-rhine.c
+++ b/drivers/net/via-rhine.c
@@ -1230,7 +1230,7 @@ static int rhine_start_tx(struct sk_buff *skb, struct net_device *dev)
 	rp->tx_skbuff[entry] = skb;
 
 	if ((rp->quirks & rqRhineI) &&
-	    (((unsigned long)skb->data & 3) || skb_shinfo(skb)->nr_frags != 0 || skb->ip_summed == CHECKSUM_HW)) {
+	    (((unsigned long)skb->data & 3) || skb_shinfo(skb)->nr_frags != 0 || skb->ip_summed == CHECKSUM_PARTIAL)) {
 		/* Must use alignment buffer. */
 		if (skb->len > PKT_BUF_SZ) {
 			/* packet too long, drop it */
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index aa9cd92f46b..f1e0c746a38 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -2002,7 +2002,7 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev)
 	 *	Handle hardware checksum
 	 */
 	if ((vptr->flags & VELOCITY_FLAGS_TX_CSUM)
-				 && (skb->ip_summed == CHECKSUM_HW)) {
+				 && (skb->ip_summed == CHECKSUM_PARTIAL)) {
 		struct iphdr *ip = skb->nh.iph;
 		if (ip->protocol == IPPROTO_TCP)
 			td_ptr->tdesc1.TCR |= TCR0_TCPCK;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 50a4719512e..4f2c2b6beb5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -976,7 +976,7 @@ extern void		dev_mcast_init(void);
 extern int		netdev_max_backlog;
 extern int		weight_p;
 extern int		netdev_set_master(struct net_device *dev, struct net_device *master);
-extern int skb_checksum_help(struct sk_buff *skb, int inward);
+extern int skb_checksum_help(struct sk_buff *skb);
 extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features);
 #ifdef CONFIG_BUG
 extern void netdev_rx_csum_fault(struct net_device *dev);
@@ -1012,7 +1012,7 @@ static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
 	return skb_is_gso(skb) &&
 	       (!skb_gso_ok(skb, dev->features) ||
-		unlikely(skb->ip_summed != CHECKSUM_HW));
+		unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
 }
 
 /* On bonding slaves other than the currently active slave, suppress
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 755e9cddac4..85577a4ffa6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -34,8 +34,9 @@
 #define HAVE_ALIGNABLE_SKB	/* Ditto 8)		   */
 
 #define CHECKSUM_NONE 0
-#define CHECKSUM_HW 1
+#define CHECKSUM_PARTIAL 1
 #define CHECKSUM_UNNECESSARY 2
+#define CHECKSUM_COMPLETE 3
 
 #define SKB_DATA_ALIGN(X)	(((X) + (SMP_CACHE_BYTES - 1)) & \
 				 ~(SMP_CACHE_BYTES - 1))
@@ -56,17 +57,17 @@
  *	      Apparently with secret goal to sell you new device, when you
  *	      will add new protocol to your host. F.e. IPv6. 8)
  *
- *	HW: the most generic way. Device supplied checksum of _all_
+ *	COMPLETE: the most generic way. Device supplied checksum of _all_
  *	    the packet as seen by netif_rx in skb->csum.
  *	    NOTE: Even if device supports only some protocols, but
- *	    is able to produce some skb->csum, it MUST use HW,
+ *	    is able to produce some skb->csum, it MUST use COMPLETE,
  *	    not UNNECESSARY.
  *
  * B. Checksumming on output.
  *
  *	NONE: skb is checksummed by protocol or csum is not required.
  *
- *	HW: device is required to csum packet as seen by hard_start_xmit
+ *	PARTIAL: device is required to csum packet as seen by hard_start_xmit
  *	from skb->h.raw to the end and to record the checksum
  *	at skb->h.raw+skb->csum.
  *
@@ -1261,14 +1262,14 @@ static inline int skb_linearize_cow(struct sk_buff *skb)
  *	@len: length of data pulled
  *
  *	After doing a pull on a received packet, you need to call this to
- *	update the CHECKSUM_HW checksum, or set ip_summed to CHECKSUM_NONE
- *	so that it can be recomputed from scratch.
+ *	update the CHECKSUM_COMPLETE checksum, or set ip_summed to
+ *	CHECKSUM_NONE so that it can be recomputed from scratch.
  */
 
 static inline void skb_postpull_rcsum(struct sk_buff *skb,
 				      const void *start, unsigned int len)
 {
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0));
 }
 
@@ -1287,7 +1288,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 {
 	if (likely(len >= skb->len))
 		return 0;
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		skb->ip_summed = CHECKSUM_NONE;
 	return __pskb_trim(skb, len);
 }
diff --git a/net/core/datagram.c b/net/core/datagram.c
index aecddcc3040..f558c61aecc 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -417,7 +417,7 @@ unsigned int __skb_checksum_complete(struct sk_buff *skb)
 
 	sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
 	if (likely(!sum)) {
-		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
 			netdev_rx_csum_fault(skb->dev);
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
@@ -462,7 +462,7 @@ int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 			goto fault;
 		if ((unsigned short)csum_fold(csum))
 			goto csum_error;
-		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
 			netdev_rx_csum_fault(skb->dev);
 		iov->iov_len -= chunk;
 		iov->iov_base += chunk;
diff --git a/net/core/dev.c b/net/core/dev.c
index d4a1ec3bded..fc82f6f6e1c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1166,12 +1166,12 @@ EXPORT_SYMBOL(netif_device_attach);
  * Invalidate hardware checksum when packet is to be mangled, and
  * complete checksum manually on outgoing path.
  */
-int skb_checksum_help(struct sk_buff *skb, int inward)
+int skb_checksum_help(struct sk_buff *skb)
 {
 	unsigned int csum;
 	int ret = 0, offset = skb->h.raw - skb->data;
 
-	if (inward)
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		goto out_set_summed;
 
 	if (unlikely(skb_shinfo(skb)->gso_size)) {
@@ -1223,7 +1223,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
 	skb->mac_len = skb->nh.raw - skb->data;
 	__skb_pull(skb, skb->mac_len);
 
-	if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
+	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 		if (skb_header_cloned(skb) &&
 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
 			return ERR_PTR(err);
@@ -1232,7 +1232,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
-			if (unlikely(skb->ip_summed != CHECKSUM_HW)) {
+			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
 				err = ptype->gso_send_check(skb);
 				segs = ERR_PTR(err);
 				if (err || skb_gso_ok(skb, features))
@@ -1444,11 +1444,11 @@ int dev_queue_xmit(struct sk_buff *skb)
 	/* If packet is not checksummed and device does not support
 	 * checksumming for this protocol, complete checksumming here.
 	 */
-	if (skb->ip_summed == CHECKSUM_HW &&
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 	    (!(dev->features & NETIF_F_GEN_CSUM) &&
 	     (!(dev->features & NETIF_F_IP_CSUM) ||
 	      skb->protocol != htons(ETH_P_IP))))
-	      	if (skb_checksum_help(skb, 0))
+	      	if (skb_checksum_help(skb))
 	      		goto out_kfree_skb;
 
 gso:
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 471da451cd4..ead5920c26d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -110,7 +110,7 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
 
 	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
 
-	if (skb->ip_summed == CHECKSUM_HW &&
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
 	    !(u16)csum_fold(csum_add(psum, skb->csum)))
 		return 0;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c54f3664bce..8a476f1956e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1397,7 +1397,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
 	unsigned int csum;
 	long csstart;
 
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		csstart = skb->h.raw - skb->data;
 	else
 		csstart = skb_headlen(skb);
@@ -1411,7 +1411,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
 		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
 					      skb->len - csstart, 0);
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		long csstuff = csstart + skb->csum;
 
 		*((unsigned short *)(to + csstuff)) = csum_fold(csum);
@@ -1898,10 +1898,10 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
  *	@len: length of data pulled
  *
  *	This function performs an skb_pull on the packet and updates
- *	update the CHECKSUM_HW checksum.  It should be used on receive
- *	path processing instead of skb_pull unless you know that the
- *	checksum difference is zero (e.g., a valid IP header) or you
- *	are setting ip_summed to CHECKSUM_NONE.
+ *	update the CHECKSUM_COMPLETE checksum.  It should be used on
+ *	receive path processing instead of skb_pull unless you know
+ *	that the checksum difference is zero (e.g., a valid IP header)
+ *	or you are setting ip_summed to CHECKSUM_NONE.
  */
 unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 {
@@ -1994,7 +1994,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 		frag = skb_shinfo(nskb)->frags;
 		k = 0;
 
-		nskb->ip_summed = CHECKSUM_HW;
+		nskb->ip_summed = CHECKSUM_PARTIAL;
 		nskb->csum = skb->csum;
 		memcpy(skb_put(nskb, hsize), skb->data + offset, hsize);
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 6ad797c1416..6d223e5c674 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -930,7 +930,7 @@ int icmp_rcv(struct sk_buff *skb)
 	ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
 
 	switch (skb->ip_summed) {
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (!(u16)csum_fold(skb->csum))
 			break;
 		/* fall through */
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 8e8117c19e4..7003e763d97 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -931,7 +931,7 @@ int igmp_rcv(struct sk_buff *skb)
 		goto drop;
 
 	switch (skb->ip_summed) {
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (!(u16)csum_fold(skb->csum))
 			break;
 		/* fall through */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b84b53a4752..8d7f107c2ee 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -665,7 +665,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
 			head->ip_summed = CHECKSUM_NONE;
-		else if (head->ip_summed == CHECKSUM_HW)
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 		atomic_sub(fp->truesize, &ip_frag_mem);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0f9b3a31997..e66f6ff2e19 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -576,7 +576,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 
 		if (flags&GRE_CSUM) {
 			switch (skb->ip_summed) {
-			case CHECKSUM_HW:
+			case CHECKSUM_COMPLETE:
 				csum = (u16)csum_fold(skb->csum);
 				if (!csum)
 					break;
@@ -584,7 +584,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 			case CHECKSUM_NONE:
 				skb->csum = 0;
 				csum = __skb_checksum_complete(skb);
-				skb->ip_summed = CHECKSUM_HW;
+				skb->ip_summed = CHECKSUM_COMPLETE;
 			}
 			offset += 4;
 		}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 308bdeac345..1b9b6742ef7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -680,7 +680,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
 {
 	struct iovec *iov = from;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
 			return -EFAULT;
 	} else {
@@ -736,7 +736,7 @@ static inline int ip_ufo_append_data(struct sock *sk,
 		/* initialize protocol header pointer */
 		skb->h.raw = skb->data + fragheaderlen;
 
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum = 0;
 		sk->sk_sndmsg_off = 0;
 	}
@@ -844,7 +844,7 @@ int ip_append_data(struct sock *sk,
 	    length + fragheaderlen <= mtu &&
 	    rt->u.dst.dev->features & NETIF_F_ALL_CSUM &&
 	    !exthdrlen)
-		csummode = CHECKSUM_HW;
+		csummode = CHECKSUM_PARTIAL;
 
 	inet->cork.length += length;
 	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
index bc28b1160a3..820e8318d10 100644
--- a/net/ipv4/ipvs/ip_vs_proto_tcp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -151,7 +151,7 @@ tcp_snat_handler(struct sk_buff **pskb,
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
 				     cp->dport, cp->vport);
-		if ((*pskb)->ip_summed == CHECKSUM_HW)
+		if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
 			(*pskb)->ip_summed = CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
@@ -204,7 +204,7 @@ tcp_dnat_handler(struct sk_buff **pskb,
 		/* Only port and addr are changed, do fast csum update */
 		tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
 				     cp->vport, cp->dport);
-		if ((*pskb)->ip_summed == CHECKSUM_HW)
+		if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
 			(*pskb)->ip_summed = CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
@@ -229,7 +229,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
 	switch (skb->ip_summed) {
 	case CHECKSUM_NONE:
 		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
 				      skb->len - tcphoff,
 				      skb->nh.iph->protocol, skb->csum)) {
@@ -239,7 +239,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
 		}
 		break;
 	default:
-		/* CHECKSUM_UNNECESSARY */
+		/* No need to checksum. */
 		break;
 	}
 
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
index 89d9175d8f2..90c8166c0ec 100644
--- a/net/ipv4/ipvs/ip_vs_proto_udp.c
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -161,7 +161,7 @@ udp_snat_handler(struct sk_buff **pskb,
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
 				     cp->dport, cp->vport);
-		if ((*pskb)->ip_summed == CHECKSUM_HW)
+		if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
 			(*pskb)->ip_summed = CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
@@ -216,7 +216,7 @@ udp_dnat_handler(struct sk_buff **pskb,
 		/* Only port and addr are changed, do fast csum update */
 		udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
 				     cp->vport, cp->dport);
-		if ((*pskb)->ip_summed == CHECKSUM_HW)
+		if ((*pskb)->ip_summed == CHECKSUM_COMPLETE)
 			(*pskb)->ip_summed = CHECKSUM_NONE;
 	} else {
 		/* full checksum calculation */
@@ -250,7 +250,7 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
 		case CHECKSUM_NONE:
 			skb->csum = skb_checksum(skb, udphoff,
 						 skb->len - udphoff, 0);
-		case CHECKSUM_HW:
+		case CHECKSUM_COMPLETE:
 			if (csum_tcpudp_magic(skb->nh.iph->saddr,
 					      skb->nh.iph->daddr,
 					      skb->len - udphoff,
@@ -262,7 +262,7 @@ udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
 			}
 			break;
 		default:
-			/* CHECKSUM_UNNECESSARY */
+			/* No need to checksum. */
 			break;
 		}
 	}
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 6a9e34b794b..f88347de21a 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -168,7 +168,7 @@ unsigned int nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 	unsigned int csum = 0;
 
 	switch (skb->ip_summed) {
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (hook != NF_IP_PRE_ROUTING && hook != NF_IP_LOCAL_IN)
 			break;
 		if ((protocol == 0 && !(u16)csum_fold(skb->csum)) ||
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index fb920e76ec1..9de81ff645d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -865,8 +865,7 @@ static int tcp_error(struct sk_buff *skb,
   
 	/* Checksum invalid? Ignore.
 	 * We skip checking packets on the outgoing path
-	 * because the semantic of CHECKSUM_HW is different there 
-	 * and moreover root might send raw packets.
+	 * because it is assumed to be correct.
 	 */
 	/* FIXME: Source route IP option packets --RR */
 	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 9b2c16b4d2f..e58e52f1455 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -117,8 +117,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 
 	/* Checksum invalid? Ignore.
 	 * We skip checking packets on the outgoing path
-	 * because the semantic of CHECKSUM_HW is different there 
-	 * and moreover root might send raw packets.
+	 * because the checksum is assumed to be correct.
 	 * FIXME: Source route IP option packets --RR */
 	if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING &&
 	    nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) {
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 17de077a790..f4f00c816d8 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -111,8 +111,9 @@ ip_nat_fn(unsigned int hooknum,
 		       & htons(IP_MF|IP_OFFSET)));
 
 	/* If we had a hardware checksum before, it's now invalid */
-	if ((*pskb)->ip_summed == CHECKSUM_HW)
-		if (skb_checksum_help(*pskb, (out == NULL)))
+	if ((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
+	    (*pskb)->ip_summed == CHECKSUM_COMPLETE)
+		if (skb_checksum_help(*pskb))
 			return NF_DROP;
 
 	ct = ip_conntrack_get(*pskb, &ctinfo);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 198ac36db86..276a964ee6c 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -208,9 +208,9 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 		break;
 	
 	case IPQ_COPY_PACKET:
-		if (entry->skb->ip_summed == CHECKSUM_HW &&
-		    (*errp = skb_checksum_help(entry->skb,
-		                               entry->info->outdev == NULL))) {
+		if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
+		     entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
+		    (*errp = skb_checksum_help(entry->skb))) {
 			read_unlock_bh(&queue_lock);
 			return NULL;
 		}
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 4adf5c9d34f..4ec43f98fe4 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -49,7 +49,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 
 /* Return 0 if there was an error. */
 static inline int
-set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
+set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 {
 	struct tcphdr _tcph, *tcph;
 	u_int16_t diffs[2];
@@ -70,8 +70,9 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
 		return 0;
 	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
 
-	if ((*pskb)->ip_summed == CHECKSUM_HW &&
-	    skb_checksum_help(*pskb, inward))
+	if (((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
+	     (*pskb)->ip_summed == CHECKSUM_COMPLETE) &&
+	    skb_checksum_help(*pskb))
 		return 0;
 
 	diffs[0] = ((u_int16_t *)tcph)[6];
@@ -106,7 +107,7 @@ target(struct sk_buff **pskb,
 
 	if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
 	    && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
-		if (!set_ect_tcp(pskb, einfo, (out == NULL)))
+		if (!set_ect_tcp(pskb, einfo))
 			return NF_DROP;
 
 	return IPT_CONTINUE;
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index ef2fe5b3f0d..c998dc0fcd1 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -62,8 +62,9 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
-	if ((*pskb)->ip_summed == CHECKSUM_HW &&
-	    skb_checksum_help(*pskb, out == NULL))
+	if (((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
+	     (*pskb)->ip_summed == CHECKSUM_COMPLETE) &&
+	    skb_checksum_help(*pskb))
 		return NF_DROP;
 
 	iph = (*pskb)->nh.iph;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 934396bb137..b0124e69ab3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -568,7 +568,7 @@ new_segment:
 		skb->truesize += copy;
 		sk->sk_wmem_queued += copy;
 		sk->sk_forward_alloc -= copy;
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_PARTIAL;
 		tp->write_seq += copy;
 		TCP_SKB_CB(skb)->end_seq += copy;
 		skb_shinfo(skb)->gso_segs = 0;
@@ -723,7 +723,7 @@ new_segment:
 				 * Check whether we can use HW checksum.
 				 */
 				if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
-					skb->ip_summed = CHECKSUM_HW;
+					skb->ip_summed = CHECKSUM_PARTIAL;
 
 				skb_entail(sk, tp, skb);
 				copy = size_goal;
@@ -2205,7 +2205,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
 		th->fin = th->psh = 0;
 
 		th->check = ~csum_fold(th->check + delta);
-		if (skb->ip_summed != CHECKSUM_HW)
+		if (skb->ip_summed != CHECKSUM_PARTIAL)
 			th->check = csum_fold(csum_partial(skb->h.raw, thlen,
 							   skb->csum));
 
@@ -2219,7 +2219,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
 
 	delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len);
 	th->check = ~csum_fold(th->check + delta);
-	if (skb->ip_summed != CHECKSUM_HW)
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
 		th->check = csum_fold(csum_partial(skb->h.raw, thlen,
 						   skb->csum));
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 43f6740244f..b2aa512a30e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -484,7 +484,7 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 	struct inet_sock *inet = inet_sk(sk);
 	struct tcphdr *th = skb->h.th;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
 		skb->csum = offsetof(struct tcphdr, check);
 	} else {
@@ -509,7 +509,7 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
 	th->check = 0;
 	th->check = ~tcp_v4_check(th, skb->len, iph->saddr, iph->daddr, 0);
 	skb->csum = offsetof(struct tcphdr, check);
-	skb->ip_summed = CHECKSUM_HW;
+	skb->ip_summed = CHECKSUM_PARTIAL;
 	return 0;
 }
 
@@ -973,7 +973,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 
 static int tcp_v4_checksum_init(struct sk_buff *skb)
 {
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
 				  skb->nh.iph->daddr, skb->csum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4f3ffe1b3b..9252a50c4b4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -577,7 +577,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
 	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
 
-	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
+	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
 		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
 						       nsize, 0);
@@ -586,7 +586,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
 
 		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
 	} else {
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb_split(skb, buff, len);
 	}
 
@@ -689,7 +689,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 		__pskb_trim_head(skb, len - skb_headlen(skb));
 
 	TCP_SKB_CB(skb)->seq += len;
-	skb->ip_summed = CHECKSUM_HW;
+	skb->ip_summed = CHECKSUM_PARTIAL;
 
 	skb->truesize	     -= len;
 	sk->sk_wmem_queued   -= len;
@@ -1062,7 +1062,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
 
-	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 
 	/* Fix up tso_factor for both original and new SKB.  */
@@ -1206,8 +1206,7 @@ static int tcp_mtu_probe(struct sock *sk)
 	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
 	TCP_SKB_CB(nskb)->sacked = 0;
 	nskb->csum = 0;
-	if (skb->ip_summed == CHECKSUM_HW)
-		nskb->ip_summed = CHECKSUM_HW;
+	nskb->ip_summed = skb->ip_summed;
 
 	len = 0;
 	while (len < probe_size) {
@@ -1231,7 +1230,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			                           ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
 			if (!skb_shinfo(skb)->nr_frags) {
 				skb_pull(skb, copy);
-				if (skb->ip_summed != CHECKSUM_HW)
+				if (skb->ip_summed != CHECKSUM_PARTIAL)
 					skb->csum = csum_partial(skb->data, skb->len, 0);
 			} else {
 				__pskb_trim_head(skb, copy);
@@ -1572,10 +1571,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 
 		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
-		if (next_skb->ip_summed == CHECKSUM_HW)
-			skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = next_skb->ip_summed;
 
-		if (skb->ip_summed != CHECKSUM_HW)
+		if (skb->ip_summed != CHECKSUM_PARTIAL)
 			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
 
 		/* Update sequence range on original skb. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a4d005eccc7..87152510980 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -429,7 +429,7 @@ static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
 		/*
 		 * Only one fragment on the socket.
 		 */
-		if (skb->ip_summed == CHECKSUM_HW) {
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			skb->csum = offsetof(struct udphdr, check);
 			uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
 					up->len, IPPROTO_UDP, 0);
@@ -448,7 +448,7 @@ static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
 		 * fragments on the socket so that all csums of sk_buffs
 		 * should be together.
 		 */
-		if (skb->ip_summed == CHECKSUM_HW) {
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
 			int offset = (unsigned char *)uh - skb->data;
 			skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
 
@@ -1088,7 +1088,7 @@ static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
 {
 	if (uh->check == 0) {
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	} else if (skb->ip_summed == CHECKSUM_HW) {
+	} else if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index d16f863cf68..4a96a9e3ef3 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -48,8 +48,8 @@ static int xfrm4_output_one(struct sk_buff *skb)
 	struct xfrm_state *x = dst->xfrm;
 	int err;
 	
-	if (skb->ip_summed == CHECKSUM_HW) {
-		err = skb_checksum_help(skb, 0);
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		err = skb_checksum_help(skb);
 		if (err)
 			goto error_nolock;
 	}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 86dac106873..05afa6b1912 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -294,7 +294,7 @@ looped_back:
 		hdr = (struct ipv6_rt_hdr *) skb2->h.raw;
 	}
 
-	if (skb->ip_summed == CHECKSUM_HW)
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
 		skb->ip_summed = CHECKSUM_NONE;
 
 	i = n - --hdr->segments_left;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index dbfce089e91..10305510767 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -606,7 +606,7 @@ static int icmpv6_rcv(struct sk_buff **pskb)
 
 	/* Perform checksum. */
 	switch (skb->ip_summed) {
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
 				     skb->csum))
 			break;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4fb47a25291..65514f21c18 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -866,7 +866,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		/* initialize protocol header pointer */
 		skb->h.raw = skb->data + fragheaderlen;
 
-		skb->ip_summed = CHECKSUM_HW;
+		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum = 0;
 		sk->sk_sndmsg_off = 0;
 	}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 395a417ba95..580b1aba672 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -87,7 +87,7 @@ unsigned int nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 	unsigned int csum = 0;
 
 	switch (skb->ip_summed) {
-	case CHECKSUM_HW:
+	case CHECKSUM_COMPLETE:
 		if (hook != NF_IP6_PRE_ROUTING && hook != NF_IP6_LOCAL_IN)
 			break;
 		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 968a14be0d0..c01c126224e 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -206,9 +206,9 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 		break;
 	
 	case IPQ_COPY_PACKET:
-		if (entry->skb->ip_summed == CHECKSUM_HW &&
-		    (*errp = skb_checksum_help(entry->skb,
-		                               entry->info->outdev == NULL))) {
+		if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
+		     entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
+		    (*errp = skb_checksum_help(entry->skb))) {
 			read_unlock_bh(&queue_lock);
 			return NULL;
 		}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 00d5583807f..7a4e4c2e319 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -408,7 +408,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
  		return -1;
 	}
 
- 	if (skb->ip_summed == CHECKSUM_HW)
+ 	if (skb->ip_summed == CHECKSUM_COMPLETE)
  		skb->csum = csum_sub(skb->csum,
  				     csum_partial(skb->nh.raw,
 						  (u8*)(fhdr + 1) - skb->nh.raw,
@@ -640,7 +640,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
 			head->ip_summed = CHECKSUM_NONE;
-		else if (head->ip_summed == CHECKSUM_HW)
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 		atomic_sub(fp->truesize, &nf_ct_frag6_mem);
@@ -652,7 +652,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 	head->nh.ipv6h->payload_len = htons(payload_len);
 
 	/* Yes, and fold redundant checksum back. 8) */
-	if (head->ip_summed == CHECKSUM_HW)
+	if (head->ip_summed == CHECKSUM_COMPLETE)
 		head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
 
 	fq->fragments = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index d5040e17229..d4af1cb5e19 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -334,7 +334,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
 	if (!rp->checksum)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		skb_postpull_rcsum(skb, skb->nh.raw,
 		                   skb->h.raw - skb->nh.raw);
 		if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4e299c69e1c..a8623d2b087 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -433,7 +433,7 @@ static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
  		return;
 	}
 
- 	if (skb->ip_summed == CHECKSUM_HW)
+ 	if (skb->ip_summed == CHECKSUM_COMPLETE)
  		skb->csum = csum_sub(skb->csum,
  				     csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0));
 
@@ -647,7 +647,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
 			head->ip_summed = CHECKSUM_NONE;
-		else if (head->ip_summed == CHECKSUM_HW)
+		else if (head->ip_summed == CHECKSUM_COMPLETE)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 		atomic_sub(fp->truesize, &ip6_frag_mem);
@@ -662,7 +662,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
 	*skb_in = head;
 
 	/* Yes, and fold redundant checksum back. 8) */
-	if (head->ip_summed == CHECKSUM_HW)
+	if (head->ip_summed == CHECKSUM_COMPLETE)
 		head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum);
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 302786a11cd..7f1b660493b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -545,7 +545,7 @@ static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcphdr *th = skb->h.th;
 
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP,  0);
 		skb->csum = offsetof(struct tcphdr, check);
 	} else {
@@ -570,7 +570,7 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb)
 	th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len,
 				     IPPROTO_TCP, 0);
 	skb->csum = offsetof(struct tcphdr, check);
-	skb->ip_summed = CHECKSUM_HW;
+	skb->ip_summed = CHECKSUM_PARTIAL;
 	return 0;
 }
 
@@ -1033,7 +1033,7 @@ out:
 
 static int tcp_v6_checksum_init(struct sk_buff *skb)
 {
-	if (skb->ip_summed == CHECKSUM_HW) {
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
 		if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
 				  &skb->nh.ipv6h->daddr,skb->csum)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 82c7c9cde2a..780b89f6dfc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -475,7 +475,7 @@ static int udpv6_rcv(struct sk_buff **pskb)
 		uh = skb->h.uh;
 	}
 
-	if (skb->ip_summed == CHECKSUM_HW &&
+	if (skb->ip_summed == CHECKSUM_COMPLETE &&
 	    !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index c8c8b44a0f5..6d111743e50 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -41,8 +41,8 @@ static int xfrm6_output_one(struct sk_buff *skb)
 	struct xfrm_state *x = dst->xfrm;
 	int err;
 	
-	if (skb->ip_summed == CHECKSUM_HW) {
-		err = skb_checksum_help(skb, 0);
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		err = skb_checksum_help(skb);
 		if (err)
 			goto error_nolock;
 	}
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index af8adcba23a..308d2abd7ee 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -823,8 +823,7 @@ static int tcp_error(struct sk_buff *skb,
   
 	/* Checksum invalid? Ignore.
 	 * We skip checking packets on the outgoing path
-	 * because the semantic of CHECKSUM_HW is different there 
-	 * and moreover root might send raw packets.
+	 * because the checksum is assumed to be correct.
 	 */
 	/* FIXME: Source route IP option packets --RR */
 	if (nf_conntrack_checksum &&
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index ae07ebe3ab3..d36e03139e8 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -131,8 +131,7 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff,
 
 	/* Checksum invalid? Ignore.
 	 * We skip checking packets on the outgoing path
-	 * because the semantic of CHECKSUM_HW is different there
-	 * and moreover root might send raw packets.
+	 * because the checksum is assumed to be correct.
 	 * FIXME: Source route IP option packets --RR */
 	if (nf_conntrack_checksum &&
 	    ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) ||
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 49ef41e34c4..eddfbe4441a 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -377,9 +377,9 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
 		break;
 	
 	case NFQNL_COPY_PACKET:
-		if (entskb->ip_summed == CHECKSUM_HW &&
-		    (*errp = skb_checksum_help(entskb,
-		                               outdev == NULL))) {
+		if ((entskb->ip_summed == CHECKSUM_PARTIAL ||
+		     entskb->ip_summed == CHECKSUM_COMPLETE) &&
+		    (*errp = skb_checksum_help(entskb))) {
 			spin_unlock_bh(&queue->lock);
 			return NULL;
 		}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4172a523591..300215bdbf4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -586,7 +586,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 		else if (skb->pkt_type == PACKET_OUTGOING) {
 			/* Special case: outgoing packets have ll header at head */
 			skb_pull(skb, skb->nh.raw - skb->data);
-			if (skb->ip_summed == CHECKSUM_HW)
+			if (skb->ip_summed == CHECKSUM_PARTIAL)
 				status |= TP_STATUS_CSUMNOTREADY;
 		}
 	}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index a08ec4c7c55..45939bafbdf 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -192,8 +192,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	 */
 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 		if (!(skb = skb_unshare(skb, GFP_ATOMIC))
-		    || (skb->ip_summed == CHECKSUM_HW
-			&& skb_checksum_help(skb, 0))) {
+		    || (skb->ip_summed == CHECKSUM_PARTIAL
+			&& skb_checksum_help(skb))) {
 			sch->qstats.drops++;
 			return NET_XMIT_DROP;
 		}
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index eb330d4f66d..6f17527b9e6 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -168,7 +168,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		return -1;
 	if ((unsigned short)csum_fold(desc.csum))
 		return -1;
-	if (unlikely(skb->ip_summed == CHECKSUM_HW))
+	if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
 		netdev_rx_csum_fault(skb->dev);
 	return 0;
 no_checksum:
-- 
cgit v1.2.3-70-g09d2


From 4cf411de49c65140b3c259748629b561c0d3340f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 5 Aug 2006 00:58:33 -0700
Subject: [NETFILTER]: Get rid of HW checksum invalidation

Update hardware checksums incrementally to avoid breaking GSO.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h                  |  6 +++
 include/linux/netfilter_ipv4/ip_nat.h      |  4 --
 include/linux/netfilter_ipv4/ip_nat_core.h |  8 ++--
 net/ipv4/netfilter/ip_nat_core.c           | 52 ++++++++++----------------
 net/ipv4/netfilter/ip_nat_helper.c         | 59 +++++++++++++++++++++---------
 net/ipv4/netfilter/ip_nat_proto_gre.c      |  5 ++-
 net/ipv4/netfilter/ip_nat_proto_icmp.c     |  8 ++--
 net/ipv4/netfilter/ip_nat_proto_tcp.c      |  7 ++--
 net/ipv4/netfilter/ip_nat_proto_udp.c      | 15 +++++---
 net/ipv4/netfilter/ip_nat_standalone.c     | 10 +----
 net/ipv4/netfilter/ipt_ECN.c               | 19 +++-------
 net/ipv4/netfilter/ipt_REJECT.c            |  1 +
 net/ipv4/netfilter/ipt_TCPMSS.c            | 39 ++++++++------------
 net/netfilter/core.c                       | 22 +++++++++++
 14 files changed, 138 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 10168e26a84..b7e67d1d438 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -282,6 +282,12 @@ extern void nf_invalidate_cache(int pf);
    Returns true or false. */
 extern int skb_make_writable(struct sk_buff **pskb, unsigned int writable_len);
 
+extern u_int16_t nf_csum_update(u_int32_t oldval, u_int32_t newval,
+				u_int32_t csum);
+extern u_int16_t nf_proto_csum_update(struct sk_buff *skb,
+				      u_int32_t oldval, u_int32_t newval,
+				      u_int16_t csum, int pseudohdr);
+
 struct nf_afinfo {
 	unsigned short	family;
 	unsigned int	(*checksum)(struct sk_buff *skb, unsigned int hook,
diff --git a/include/linux/netfilter_ipv4/ip_nat.h b/include/linux/netfilter_ipv4/ip_nat.h
index e9f5ed1d9f6..98f8407e4cb 100644
--- a/include/linux/netfilter_ipv4/ip_nat.h
+++ b/include/linux/netfilter_ipv4/ip_nat.h
@@ -72,10 +72,6 @@ extern unsigned int ip_nat_setup_info(struct ip_conntrack *conntrack,
 extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
 			     const struct ip_conntrack *ignored_conntrack);
 
-/* Calculate relative checksum. */
-extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv,
-				    u_int32_t newval,
-				    u_int16_t oldcheck);
 #else  /* !__KERNEL__: iptables wants this to compile. */
 #define ip_nat_multi_range ip_nat_multi_range_compat
 #endif /*__KERNEL__*/
diff --git a/include/linux/netfilter_ipv4/ip_nat_core.h b/include/linux/netfilter_ipv4/ip_nat_core.h
index 30db23f06b0..60566f9fd7b 100644
--- a/include/linux/netfilter_ipv4/ip_nat_core.h
+++ b/include/linux/netfilter_ipv4/ip_nat_core.h
@@ -11,8 +11,8 @@ extern unsigned int ip_nat_packet(struct ip_conntrack *ct,
 			       unsigned int hooknum,
 			       struct sk_buff **pskb);
 
-extern int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
-					 struct ip_conntrack *ct,
-					 enum ip_nat_manip_type manip,
-					 enum ip_conntrack_dir dir);
+extern int ip_nat_icmp_reply_translation(struct ip_conntrack *ct,
+					 enum ip_conntrack_info ctinfo,
+					 unsigned int hooknum,
+					 struct sk_buff **pskb);
 #endif /* _IP_NAT_CORE_H */
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 1741d555ad0..4c540d03d48 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -101,18 +101,6 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
 	write_unlock_bh(&ip_nat_lock);
 }
 
-/* We do checksum mangling, so if they were wrong before they're still
- * wrong.  Also works for incomplete packets (eg. ICMP dest
- * unreachables.) */
-u_int16_t
-ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
-{
-	u_int32_t diffs[] = { oldvalinv, newval };
-	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
-				      oldcheck^0xFFFF));
-}
-EXPORT_SYMBOL(ip_nat_cheat_check);
-
 /* Is this tuple already taken? (not by us) */
 int
 ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
@@ -378,12 +366,12 @@ manip_pkt(u_int16_t proto,
 	iph = (void *)(*pskb)->data + iphdroff;
 
 	if (maniptype == IP_NAT_MANIP_SRC) {
-		iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
-						iph->check);
+		iph->check = nf_csum_update(~iph->saddr, target->src.ip,
+					    iph->check);
 		iph->saddr = target->src.ip;
 	} else {
-		iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
-						iph->check);
+		iph->check = nf_csum_update(~iph->daddr, target->dst.ip,
+					    iph->check);
 		iph->daddr = target->dst.ip;
 	}
 	return 1;
@@ -423,10 +411,10 @@ unsigned int ip_nat_packet(struct ip_conntrack *ct,
 EXPORT_SYMBOL_GPL(ip_nat_packet);
 
 /* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
-				  struct ip_conntrack *ct,
-				  enum ip_nat_manip_type manip,
-				  enum ip_conntrack_dir dir)
+int ip_nat_icmp_reply_translation(struct ip_conntrack *ct,
+				  enum ip_conntrack_info ctinfo,
+				  unsigned int hooknum,
+				  struct sk_buff **pskb)
 {
 	struct {
 		struct icmphdr icmp;
@@ -434,7 +422,9 @@ int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
 	} *inside;
 	struct ip_conntrack_tuple inner, target;
 	int hdrlen = (*pskb)->nh.iph->ihl * 4;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	unsigned long statusbit;
+	enum ip_nat_manip_type manip = HOOK2MANIP(hooknum);
 
 	if (!skb_make_writable(pskb, hdrlen + sizeof(*inside)))
 		return 0;
@@ -443,12 +433,8 @@ int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
 
 	/* We're actually going to mangle it beyond trivial checksum
 	   adjustment, so make sure the current checksum is correct. */
-	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
-		hdrlen = (*pskb)->nh.iph->ihl * 4;
-		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
-						(*pskb)->len - hdrlen, 0)))
-			return 0;
-	}
+	if (nf_ip_checksum(*pskb, hooknum, hdrlen, 0))
+		return 0;
 
 	/* Must be RELATED */
 	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
@@ -487,12 +473,14 @@ int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
 		       !manip))
 		return 0;
 
-	/* Reloading "inside" here since manip_pkt inner. */
-	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
-	inside->icmp.checksum = 0;
-	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
-						       (*pskb)->len - hdrlen,
-						       0));
+	if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
+		/* Reloading "inside" here since manip_pkt inner. */
+		inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+		inside->icmp.checksum = 0;
+		inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
+							       (*pskb)->len - hdrlen,
+							       0));
+	}
 
 	/* Change outer to look the reply to an incoming packet
 	 * (proto 0 means don't invert per-proto part). */
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index cbcaa45370a..021c3daae3e 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -165,7 +165,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 {
 	struct iphdr *iph;
 	struct tcphdr *tcph;
-	int datalen;
+	int oldlen, datalen;
 
 	if (!skb_make_writable(pskb, (*pskb)->len))
 		return 0;
@@ -180,13 +180,22 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 	iph = (*pskb)->nh.iph;
 	tcph = (void *)iph + iph->ihl*4;
 
+	oldlen = (*pskb)->len - iph->ihl*4;
 	mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
 			match_offset, match_len, rep_buffer, rep_len);
 
 	datalen = (*pskb)->len - iph->ihl*4;
-	tcph->check = 0;
-	tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
-				   csum_partial((char *)tcph, datalen, 0));
+	if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
+		tcph->check = 0;
+		tcph->check = tcp_v4_check(tcph, datalen,
+					   iph->saddr, iph->daddr,
+					   csum_partial((char *)tcph,
+					   		datalen, 0));
+	} else
+		tcph->check = nf_proto_csum_update(*pskb,
+						   htons(oldlen) ^ 0xFFFF,
+						   htons(datalen),
+						   tcph->check, 1);
 
 	if (rep_len != match_len) {
 		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
@@ -221,6 +230,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 {
 	struct iphdr *iph;
 	struct udphdr *udph;
+	int datalen, oldlen;
 
 	/* UDP helpers might accidentally mangle the wrong packet */
 	iph = (*pskb)->nh.iph;
@@ -238,22 +248,32 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 
 	iph = (*pskb)->nh.iph;
 	udph = (void *)iph + iph->ihl*4;
+
+	oldlen = (*pskb)->len - iph->ihl*4;
 	mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
 			match_offset, match_len, rep_buffer, rep_len);
 
 	/* update the length of the UDP packet */
-	udph->len = htons((*pskb)->len - iph->ihl*4);
+	datalen = (*pskb)->len - iph->ihl*4;
+	udph->len = htons(datalen);
 
 	/* fix udp checksum if udp checksum was previously calculated */
-	if (udph->check) {
-		int datalen = (*pskb)->len - iph->ihl * 4;
+	if (!udph->check && (*pskb)->ip_summed != CHECKSUM_PARTIAL)
+		return 1;
+
+	if ((*pskb)->ip_summed != CHECKSUM_PARTIAL) {
 		udph->check = 0;
 		udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
 		                                datalen, IPPROTO_UDP,
 		                                csum_partial((char *)udph,
 		                                             datalen, 0));
-	}
-
+		if (!udph->check)
+			udph->check = -1;
+	} else
+		udph->check = nf_proto_csum_update(*pskb,
+						   htons(oldlen) ^ 0xFFFF,
+						   htons(datalen),
+						   udph->check, 1);
 	return 1;
 }
 EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
@@ -293,11 +313,14 @@ sack_adjust(struct sk_buff *skb,
 			ntohl(sack->start_seq), new_start_seq,
 			ntohl(sack->end_seq), new_end_seq);
 
-		tcph->check = 
-			ip_nat_cheat_check(~sack->start_seq, new_start_seq,
-					   ip_nat_cheat_check(~sack->end_seq, 
-						   	      new_end_seq,
-							      tcph->check));
+		tcph->check = nf_proto_csum_update(skb,
+						   ~sack->start_seq,
+						   new_start_seq,
+						   tcph->check, 0);
+		tcph->check = nf_proto_csum_update(skb,
+						   ~sack->end_seq,
+						   new_end_seq,
+						   tcph->check, 0);
 		sack->start_seq = new_start_seq;
 		sack->end_seq = new_end_seq;
 		sackoff += sizeof(*sack);
@@ -381,10 +404,10 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
 		newack = ntohl(tcph->ack_seq) - other_way->offset_before;
 	newack = htonl(newack);
 
-	tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
-					 ip_nat_cheat_check(~tcph->ack_seq, 
-					 		    newack, 
-							    tcph->check));
+	tcph->check = nf_proto_csum_update(*pskb, ~tcph->seq, newseq,
+					   tcph->check, 0);
+	tcph->check = nf_proto_csum_update(*pskb, ~tcph->ack_seq, newack,
+					   tcph->check, 0);
 
 	DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
 		ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 38acfdf540e..70a65372225 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -130,9 +130,10 @@ gre_manip_pkt(struct sk_buff **pskb,
 			if (greh->csum) {
 				/* FIXME: Never tested this code... */
 				*(gre_csum(greh)) = 
-					ip_nat_cheat_check(~*(gre_key(greh)),
+					nf_proto_csum_update(*pskb,
+							~*(gre_key(greh)),
 							tuple->dst.u.gre.key,
-							*(gre_csum(greh)));
+							*(gre_csum(greh)), 0);
 			}
 			*(gre_key(greh)) = tuple->dst.u.gre.key;
 			break;
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
index 31a3f4ccb99..ec50cc29531 100644
--- a/net/ipv4/netfilter/ip_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -66,10 +66,10 @@ icmp_manip_pkt(struct sk_buff **pskb,
 		return 0;
 
 	hdr = (struct icmphdr *)((*pskb)->data + hdroff);
-
-	hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
-					    tuple->src.u.icmp.id,
-					    hdr->checksum);
+	hdr->checksum = nf_proto_csum_update(*pskb,
+					     hdr->un.echo.id ^ 0xFFFF,
+					     tuple->src.u.icmp.id,
+					     hdr->checksum, 0);
 	hdr->un.echo.id = tuple->src.u.icmp.id;
 	return 1;
 }
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
index a3d14079eba..72a6307bd2d 100644
--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -129,10 +129,9 @@ tcp_manip_pkt(struct sk_buff **pskb,
 	if (hdrsize < sizeof(*hdr))
 		return 1;
 
-	hdr->check = ip_nat_cheat_check(~oldip, newip,
-					ip_nat_cheat_check(oldport ^ 0xFFFF,
-							   newport,
-							   hdr->check));
+	hdr->check = nf_proto_csum_update(*pskb, ~oldip, newip, hdr->check, 1);
+	hdr->check = nf_proto_csum_update(*pskb, oldport ^ 0xFFFF, newport,
+					  hdr->check, 0);
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
index ec6053fdc86..5da196ae758 100644
--- a/net/ipv4/netfilter/ip_nat_proto_udp.c
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
 		newport = tuple->dst.u.udp.port;
 		portptr = &hdr->dest;
 	}
-	if (hdr->check) /* 0 is a special case meaning no checksum */
-		hdr->check = ip_nat_cheat_check(~oldip, newip,
-					ip_nat_cheat_check(*portptr ^ 0xFFFF,
-							   newport,
-							   hdr->check));
+
+	if (hdr->check || (*pskb)->ip_summed == CHECKSUM_PARTIAL) {
+		hdr->check = nf_proto_csum_update(*pskb, ~oldip, newip,
+						  hdr->check, 1);
+		hdr->check = nf_proto_csum_update(*pskb,
+						  *portptr ^ 0xFFFF, newport,
+						  hdr->check, 0);
+		if (!hdr->check)
+			hdr->check = -1;
+	}
 	*portptr = newport;
 	return 1;
 }
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index f4f00c816d8..f3b77835543 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -110,12 +110,6 @@ ip_nat_fn(unsigned int hooknum,
 	IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
 		       & htons(IP_MF|IP_OFFSET)));
 
-	/* If we had a hardware checksum before, it's now invalid */
-	if ((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
-	    (*pskb)->ip_summed == CHECKSUM_COMPLETE)
-		if (skb_checksum_help(*pskb))
-			return NF_DROP;
-
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 	/* Can't track?  It's not due to stress, or conntrack would
 	   have dropped it.  Hence it's the user's responsibilty to
@@ -146,8 +140,8 @@ ip_nat_fn(unsigned int hooknum,
 	case IP_CT_RELATED:
 	case IP_CT_RELATED+IP_CT_IS_REPLY:
 		if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
-			if (!ip_nat_icmp_reply_translation(pskb, ct, maniptype,
-							   CTINFO2DIR(ctinfo)))
+			if (!ip_nat_icmp_reply_translation(ct, ctinfo,
+							   hooknum, pskb))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 4ec43f98fe4..35916c74fe4 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -52,7 +52,7 @@ static inline int
 set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 {
 	struct tcphdr _tcph, *tcph;
-	u_int16_t diffs[2];
+	u_int16_t oldval;
 
 	/* Not enought header? */
 	tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
@@ -70,23 +70,16 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
 		return 0;
 	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
 
-	if (((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
-	     (*pskb)->ip_summed == CHECKSUM_COMPLETE) &&
-	    skb_checksum_help(*pskb))
-		return 0;
-
-	diffs[0] = ((u_int16_t *)tcph)[6];
+	oldval = ((u_int16_t *)tcph)[6];
 	if (einfo->operation & IPT_ECN_OP_SET_ECE)
 		tcph->ece = einfo->proto.tcp.ece;
 	if (einfo->operation & IPT_ECN_OP_SET_CWR)
 		tcph->cwr = einfo->proto.tcp.cwr;
-	diffs[1] = ((u_int16_t *)tcph)[6];
-	diffs[0] = diffs[0] ^ 0xFFFF;
 
-	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY)
-		tcph->check = csum_fold(csum_partial((char *)diffs,
-						     sizeof(diffs),
-						     tcph->check^0xFFFF));
+	tcph->check = nf_proto_csum_update((*pskb),
+					   oldval ^ 0xFFFF,
+					   ((u_int16_t *)tcph)[6],
+					   tcph->check, 0);
 	return 1;
 }
 
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 7f905bf2bde..95c6662b663 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -185,6 +185,7 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	tcph->urg_ptr = 0;
 
 	/* Adjust TCP checksum */
+	nskb->ip_summed = CHECKSUM_NONE;
 	tcph->check = 0;
 	tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
 				   nskb->nh.iph->saddr,
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index c998dc0fcd1..0fce85e0550 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -27,14 +27,6 @@ MODULE_DESCRIPTION("iptables TCP MSS modification module");
 #define DEBUGP(format, args...)
 #endif
 
-static u_int16_t
-cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
-{
-	u_int32_t diffs[] = { oldvalinv, newval };
-	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
-                                      oldcheck^0xFFFF));
-}
-
 static inline unsigned int
 optlen(const u_int8_t *opt, unsigned int offset)
 {
@@ -62,11 +54,6 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	if (!skb_make_writable(pskb, (*pskb)->len))
 		return NF_DROP;
 
-	if (((*pskb)->ip_summed == CHECKSUM_PARTIAL ||
-	     (*pskb)->ip_summed == CHECKSUM_COMPLETE) &&
-	    skb_checksum_help(*pskb))
-		return NF_DROP;
-
 	iph = (*pskb)->nh.iph;
 	tcplen = (*pskb)->len - iph->ihl*4;
 
@@ -120,9 +107,10 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 			opt[i+2] = (newmss & 0xff00) >> 8;
 			opt[i+3] = (newmss & 0x00ff);
 
-			tcph->check = cheat_check(htons(oldmss)^0xFFFF,
-						  htons(newmss),
-						  tcph->check);
+			tcph->check = nf_proto_csum_update(*pskb,
+							   htons(oldmss)^0xFFFF,
+							   htons(newmss),
+							   tcph->check, 0);
 
 			DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
 			       "->%u.%u.%u.%u:%hu changed TCP MSS option"
@@ -162,8 +150,10 @@ ipt_tcpmss_target(struct sk_buff **pskb,
  	opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
 	memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
 
-	tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
-				  htons(tcplen + TCPOLEN_MSS), tcph->check);
+	tcph->check = nf_proto_csum_update(*pskb,
+					   htons(tcplen) ^ 0xFFFF,
+				           htons(tcplen + TCPOLEN_MSS),
+					   tcph->check, 1);
 	tcplen += TCPOLEN_MSS;
 
 	opt[0] = TCPOPT_MSS;
@@ -171,16 +161,19 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 	opt[2] = (newmss & 0xff00) >> 8;
 	opt[3] = (newmss & 0x00ff);
 
-	tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
+	tcph->check = nf_proto_csum_update(*pskb, ~0, *((u_int32_t *)opt),
+					   tcph->check, 0);
 
 	oldval = ((u_int16_t *)tcph)[6];
 	tcph->doff += TCPOLEN_MSS/4;
-	tcph->check = cheat_check(oldval ^ 0xFFFF,
-				  ((u_int16_t *)tcph)[6], tcph->check);
+	tcph->check = nf_proto_csum_update(*pskb,
+					   oldval ^ 0xFFFF,
+					   ((u_int16_t *)tcph)[6],
+					   tcph->check, 0);
 
 	newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
-	iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
-				 newtotlen, iph->check);
+	iph->check = nf_csum_update(iph->tot_len ^ 0xFFFF,
+				    newtotlen, iph->check);
 	iph->tot_len = newtotlen;
 
 	DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5d29d5e2362..27f639f3ac2 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -222,6 +222,28 @@ copy_skb:
 }
 EXPORT_SYMBOL(skb_make_writable);
 
+u_int16_t nf_csum_update(u_int32_t oldval, u_int32_t newval, u_int32_t csum)
+{
+	u_int32_t diff[] = { oldval, newval };
+
+	return csum_fold(csum_partial((char *)diff, sizeof(diff), ~csum));
+}
+EXPORT_SYMBOL(nf_csum_update);
+
+u_int16_t nf_proto_csum_update(struct sk_buff *skb,
+			       u_int32_t oldval, u_int32_t newval,
+			       u_int16_t csum, int pseudohdr)
+{
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		csum = nf_csum_update(oldval, newval, csum);
+		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+			skb->csum = nf_csum_update(oldval, newval, skb->csum);
+	} else if (pseudohdr)
+		csum = ~nf_csum_update(oldval, newval, ~csum);
+
+	return csum;
+}
+EXPORT_SYMBOL(nf_proto_csum_update);
 
 /* This does not belong here, but locally generated errors need it if connection
    tracking in use: without this, connection may not be in hash table, and hence
-- 
cgit v1.2.3-70-g09d2


From 9067c722cf6930adf1df2d169de9094dd90b0c33 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Mon, 7 Aug 2006 17:57:44 -0700
Subject: [NEIGH]: Move netlink neighbour bits to linux/neighbour.h

Moves netlink neighbour bits to linux/neighbour.h. Also
moves bits to be exported to userspace from net/neighbour.h
to linux/neighbour.h and removes __KERNEL__ guards, userspace
is not supposed to be using it.

rtnetlink_rcv_msg() is not longer required to parse attributes
for the neighbour layer, remove dependency on obsolete and
buggy rta_buf.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/neighbour.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtnetlink.h | 63 ---------------------------------------------
 include/net/neighbour.h   | 39 ++--------------------------
 net/core/rtnetlink.c      |  2 --
 4 files changed, 67 insertions(+), 102 deletions(-)
 create mode 100644 include/linux/neighbour.h

(limited to 'include')

diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
new file mode 100644
index 00000000000..8e8293d86fb
--- /dev/null
+++ b/include/linux/neighbour.h
@@ -0,0 +1,65 @@
+#ifndef __LINUX_NEIGHBOUR_H
+#define __LINUX_NEIGHBOUR_H
+
+#include <linux/netlink.h>
+
+struct ndmsg
+{
+	__u8		ndm_family;
+	__u8		ndm_pad1;
+	__u16		ndm_pad2;
+	__s32		ndm_ifindex;
+	__u16		ndm_state;
+	__u8		ndm_flags;
+	__u8		ndm_type;
+};
+
+enum
+{
+	NDA_UNSPEC,
+	NDA_DST,
+	NDA_LLADDR,
+	NDA_CACHEINFO,
+	NDA_PROBES,
+	__NDA_MAX
+};
+
+#define NDA_MAX (__NDA_MAX - 1)
+
+/*
+ *	Neighbor Cache Entry Flags
+ */
+
+#define NTF_PROXY	0x08	/* == ATF_PUBL */
+#define NTF_ROUTER	0x80
+
+/*
+ *	Neighbor Cache Entry States.
+ */
+
+#define NUD_INCOMPLETE	0x01
+#define NUD_REACHABLE	0x02
+#define NUD_STALE	0x04
+#define NUD_DELAY	0x08
+#define NUD_PROBE	0x10
+#define NUD_FAILED	0x20
+
+/* Dummy states */
+#define NUD_NOARP	0x40
+#define NUD_PERMANENT	0x80
+#define NUD_NONE	0x00
+
+/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change
+   and make no address resolution or NUD.
+   NUD_PERMANENT is also cannot be deleted by garbage collectors.
+ */
+
+struct nda_cacheinfo
+{
+	__u32		ndm_confirmed;
+	__u32		ndm_used;
+	__u32		ndm_updated;
+	__u32		ndm_refcnt;
+};
+
+#endif
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 84f3eb426da..9750f0214c2 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -386,69 +386,6 @@ struct rta_session
 
 
-/**************************************************************
- *		Neighbour discovery.
- ****/
-
-struct ndmsg
-{
-	unsigned char	ndm_family;
-	unsigned char	ndm_pad1;
-	unsigned short	ndm_pad2;
-	int		ndm_ifindex;	/* Link index			*/
-	__u16		ndm_state;
-	__u8		ndm_flags;
-	__u8		ndm_type;
-};
-
-enum
-{
-	NDA_UNSPEC,
-	NDA_DST,
-	NDA_LLADDR,
-	NDA_CACHEINFO,
-	NDA_PROBES,
-	__NDA_MAX
-};
-
-#define NDA_MAX (__NDA_MAX - 1)
-
-#define NDA_RTA(r)  ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
-#define NDA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ndmsg))
-
-/*
- *	Neighbor Cache Entry Flags
- */
-
-#define NTF_PROXY	0x08	/* == ATF_PUBL */
-#define NTF_ROUTER	0x80
-
-/*
- *	Neighbor Cache Entry States.
- */
-
-#define NUD_INCOMPLETE	0x01
-#define NUD_REACHABLE	0x02
-#define NUD_STALE	0x04
-#define NUD_DELAY	0x08
-#define NUD_PROBE	0x10
-#define NUD_FAILED	0x20
-
-/* Dummy states */
-#define NUD_NOARP	0x40
-#define NUD_PERMANENT	0x80
-#define NUD_NONE	0x00
-
-
-struct nda_cacheinfo
-{
-	__u32		ndm_confirmed;
-	__u32		ndm_used;
-	__u32		ndm_updated;
-	__u32		ndm_refcnt;
-};
-
-
 /*****************************************************************
  *		Neighbour tables specific messages.
  *
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 4901ee44687..74c4b6ff8a5 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -1,6 +1,8 @@
 #ifndef _NET_NEIGHBOUR_H
 #define _NET_NEIGHBOUR_H
 
+#include <linux/neighbour.h>
+
 /*
  *	Generic neighbour manipulation
  *
@@ -14,40 +16,6 @@
  *		- Add neighbour cache statistics like rtstat
  */
 
-/* The following flags & states are exported to user space,
-   so that they should be moved to include/linux/ directory.
- */
-
-/*
- *	Neighbor Cache Entry Flags
- */
-
-#define NTF_PROXY	0x08	/* == ATF_PUBL */
-#define NTF_ROUTER	0x80
-
-/*
- *	Neighbor Cache Entry States.
- */
-
-#define NUD_INCOMPLETE	0x01
-#define NUD_REACHABLE	0x02
-#define NUD_STALE	0x04
-#define NUD_DELAY	0x08
-#define NUD_PROBE	0x10
-#define NUD_FAILED	0x20
-
-/* Dummy states */
-#define NUD_NOARP	0x40
-#define NUD_PERMANENT	0x80
-#define NUD_NONE	0x00
-
-/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change
-   and make no address resolution or NUD.
-   NUD_PERMANENT is also cannot be deleted by garbage collectors.
- */
-
-#ifdef __KERNEL__
-
 #include <asm/atomic.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
@@ -374,6 +342,3 @@ struct neighbour_cb {
 #define NEIGH_CB(skb)	((struct neighbour_cb *)(skb)->cb)
 
 #endif
-#endif
-
-
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 93ba04fb844..78ccbd4c4e3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -104,7 +104,6 @@ static const int rtm_min[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWLINK)]      = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
 	[RTM_FAM(RTM_NEWADDR)]      = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
 	[RTM_FAM(RTM_NEWROUTE)]     = NLMSG_LENGTH(sizeof(struct rtmsg)),
-	[RTM_FAM(RTM_NEWNEIGH)]     = NLMSG_LENGTH(sizeof(struct ndmsg)),
 	[RTM_FAM(RTM_NEWRULE)]      = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)),
 	[RTM_FAM(RTM_NEWQDISC)]     = NLMSG_LENGTH(sizeof(struct tcmsg)),
 	[RTM_FAM(RTM_NEWTCLASS)]    = NLMSG_LENGTH(sizeof(struct tcmsg)),
@@ -121,7 +120,6 @@ static const int rta_max[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWLINK)]      = IFLA_MAX,
 	[RTM_FAM(RTM_NEWADDR)]      = IFA_MAX,
 	[RTM_FAM(RTM_NEWROUTE)]     = RTA_MAX,
-	[RTM_FAM(RTM_NEWNEIGH)]     = NDA_MAX,
 	[RTM_FAM(RTM_NEWRULE)]      = FRA_MAX,
 	[RTM_FAM(RTM_NEWQDISC)]     = TCA_MAX,
 	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX,
-- 
cgit v1.2.3-70-g09d2


From b63bbc5006a0a62fabc81c4f77e95f16ff16f340 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Mon, 7 Aug 2006 18:00:57 -0700
Subject: [NEIGH]: Move netlink neighbour table bits to linux/neighbour.h

rtnetlink_rcv_msg() is not longer required to parse attributes
for the neighbour tables layer, remove dependency on obsolete and
buggy rta_buf.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/neighbour.h |  94 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/rtnetlink.h | 101 ----------------------------------------------
 net/core/rtnetlink.c      |   2 -
 3 files changed, 94 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index 8e8293d86fb..bd3bbf668cd 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h
@@ -62,4 +62,98 @@ struct nda_cacheinfo
 	__u32		ndm_refcnt;
 };
 
+/*****************************************************************
+ *		Neighbour tables specific messages.
+ *
+ * To retrieve the neighbour tables send RTM_GETNEIGHTBL with the
+ * NLM_F_DUMP flag set. Every neighbour table configuration is
+ * spread over multiple messages to avoid running into message
+ * size limits on systems with many interfaces. The first message
+ * in the sequence transports all not device specific data such as
+ * statistics, configuration, and the default parameter set.
+ * This message is followed by 0..n messages carrying device
+ * specific parameter sets.
+ * Although the ordering should be sufficient, NDTA_NAME can be
+ * used to identify sequences. The initial message can be identified
+ * by checking for NDTA_CONFIG. The device specific messages do
+ * not contain this TLV but have NDTPA_IFINDEX set to the
+ * corresponding interface index.
+ *
+ * To change neighbour table attributes, send RTM_SETNEIGHTBL
+ * with NDTA_NAME set. Changeable attribute include NDTA_THRESH[1-3],
+ * NDTA_GC_INTERVAL, and all TLVs in NDTA_PARMS unless marked
+ * otherwise. Device specific parameter sets can be changed by
+ * setting NDTPA_IFINDEX to the interface index of the corresponding
+ * device.
+ ****/
+
+struct ndt_stats
+{
+	__u64		ndts_allocs;
+	__u64		ndts_destroys;
+	__u64		ndts_hash_grows;
+	__u64		ndts_res_failed;
+	__u64		ndts_lookups;
+	__u64		ndts_hits;
+	__u64		ndts_rcv_probes_mcast;
+	__u64		ndts_rcv_probes_ucast;
+	__u64		ndts_periodic_gc_runs;
+	__u64		ndts_forced_gc_runs;
+};
+
+enum {
+	NDTPA_UNSPEC,
+	NDTPA_IFINDEX,			/* u32, unchangeable */
+	NDTPA_REFCNT,			/* u32, read-only */
+	NDTPA_REACHABLE_TIME,		/* u64, read-only, msecs */
+	NDTPA_BASE_REACHABLE_TIME,	/* u64, msecs */
+	NDTPA_RETRANS_TIME,		/* u64, msecs */
+	NDTPA_GC_STALETIME,		/* u64, msecs */
+	NDTPA_DELAY_PROBE_TIME,		/* u64, msecs */
+	NDTPA_QUEUE_LEN,		/* u32 */
+	NDTPA_APP_PROBES,		/* u32 */
+	NDTPA_UCAST_PROBES,		/* u32 */
+	NDTPA_MCAST_PROBES,		/* u32 */
+	NDTPA_ANYCAST_DELAY,		/* u64, msecs */
+	NDTPA_PROXY_DELAY,		/* u64, msecs */
+	NDTPA_PROXY_QLEN,		/* u32 */
+	NDTPA_LOCKTIME,			/* u64, msecs */
+	__NDTPA_MAX
+};
+#define NDTPA_MAX (__NDTPA_MAX - 1)
+
+struct ndtmsg
+{
+	__u8		ndtm_family;
+	__u8		ndtm_pad1;
+	__u16		ndtm_pad2;
+};
+
+struct ndt_config
+{
+	__u16		ndtc_key_len;
+	__u16		ndtc_entry_size;
+	__u32		ndtc_entries;
+	__u32		ndtc_last_flush;	/* delta to now in msecs */
+	__u32		ndtc_last_rand;		/* delta to now in msecs */
+	__u32		ndtc_hash_rnd;
+	__u32		ndtc_hash_mask;
+	__u32		ndtc_hash_chain_gc;
+	__u32		ndtc_proxy_qlen;
+};
+
+enum {
+	NDTA_UNSPEC,
+	NDTA_NAME,			/* char *, unchangeable */
+	NDTA_THRESH1,			/* u32 */
+	NDTA_THRESH2,			/* u32 */
+	NDTA_THRESH3,			/* u32 */
+	NDTA_CONFIG,			/* struct ndt_config, read-only */
+	NDTA_PARMS,			/* nested TLV NDTPA_* */
+	NDTA_STATS,			/* struct ndt_stats, read-only */
+	NDTA_GC_INTERVAL,		/* u64, msecs */
+	__NDTA_MAX
+};
+#define NDTA_MAX (__NDTA_MAX - 1)
+
 #endif
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9750f0214c2..784a1a29490 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -384,107 +384,6 @@ struct rta_session
 	} u;
 };
 
-
-
-/*****************************************************************
- *		Neighbour tables specific messages.
- *
- * To retrieve the neighbour tables send RTM_GETNEIGHTBL with the
- * NLM_F_DUMP flag set. Every neighbour table configuration is
- * spread over multiple messages to avoid running into message
- * size limits on systems with many interfaces. The first message
- * in the sequence transports all not device specific data such as
- * statistics, configuration, and the default parameter set.
- * This message is followed by 0..n messages carrying device
- * specific parameter sets.
- * Although the ordering should be sufficient, NDTA_NAME can be
- * used to identify sequences. The initial message can be identified
- * by checking for NDTA_CONFIG. The device specific messages do
- * not contain this TLV but have NDTPA_IFINDEX set to the
- * corresponding interface index.
- *
- * To change neighbour table attributes, send RTM_SETNEIGHTBL
- * with NDTA_NAME set. Changeable attribute include NDTA_THRESH[1-3],
- * NDTA_GC_INTERVAL, and all TLVs in NDTA_PARMS unless marked
- * otherwise. Device specific parameter sets can be changed by
- * setting NDTPA_IFINDEX to the interface index of the corresponding
- * device.
- ****/
-
-struct ndt_stats
-{
-	__u64		ndts_allocs;
-	__u64		ndts_destroys;
-	__u64		ndts_hash_grows;
-	__u64		ndts_res_failed;
-	__u64		ndts_lookups;
-	__u64		ndts_hits;
-	__u64		ndts_rcv_probes_mcast;
-	__u64		ndts_rcv_probes_ucast;
-	__u64		ndts_periodic_gc_runs;
-	__u64		ndts_forced_gc_runs;
-};
-
-enum {
-	NDTPA_UNSPEC,
-	NDTPA_IFINDEX,			/* u32, unchangeable */
-	NDTPA_REFCNT,			/* u32, read-only */
-	NDTPA_REACHABLE_TIME,		/* u64, read-only, msecs */
-	NDTPA_BASE_REACHABLE_TIME,	/* u64, msecs */
-	NDTPA_RETRANS_TIME,		/* u64, msecs */
-	NDTPA_GC_STALETIME,		/* u64, msecs */
-	NDTPA_DELAY_PROBE_TIME,		/* u64, msecs */
-	NDTPA_QUEUE_LEN,		/* u32 */
-	NDTPA_APP_PROBES,		/* u32 */
-	NDTPA_UCAST_PROBES,		/* u32 */
-	NDTPA_MCAST_PROBES,		/* u32 */
-	NDTPA_ANYCAST_DELAY,		/* u64, msecs */
-	NDTPA_PROXY_DELAY,		/* u64, msecs */
-	NDTPA_PROXY_QLEN,		/* u32 */
-	NDTPA_LOCKTIME,			/* u64, msecs */
-	__NDTPA_MAX
-};
-#define NDTPA_MAX (__NDTPA_MAX - 1)
-
-struct ndtmsg
-{
-	__u8		ndtm_family;
-	__u8		ndtm_pad1;
-	__u16		ndtm_pad2;
-};
-
-struct ndt_config
-{
-	__u16		ndtc_key_len;
-	__u16		ndtc_entry_size;
-	__u32		ndtc_entries;
-	__u32		ndtc_last_flush;	/* delta to now in msecs */
-	__u32		ndtc_last_rand;		/* delta to now in msecs */
-	__u32		ndtc_hash_rnd;
-	__u32		ndtc_hash_mask;
-	__u32		ndtc_hash_chain_gc;
-	__u32		ndtc_proxy_qlen;
-};
-
-enum {
-	NDTA_UNSPEC,
-	NDTA_NAME,			/* char *, unchangeable */
-	NDTA_THRESH1,			/* u32 */
-	NDTA_THRESH2,			/* u32 */
-	NDTA_THRESH3,			/* u32 */
-	NDTA_CONFIG,			/* struct ndt_config, read-only */
-	NDTA_PARMS,			/* nested TLV NDTPA_* */
-	NDTA_STATS,			/* struct ndt_stats, read-only */
-	NDTA_GC_INTERVAL,		/* u64, msecs */
-	__NDTA_MAX
-};
-#define NDTA_MAX (__NDTA_MAX - 1)
-
-#define NDTA_RTA(r) ((struct rtattr*)(((char*)(r)) + \
-		     NLMSG_ALIGN(sizeof(struct ndtmsg))))
-#define NDTA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ndtmsg))
-
-
 /****
  *		General form of address family dependent message.
  ****/
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 78ccbd4c4e3..a1b783a6afc 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -112,7 +112,6 @@ static const int rtm_min[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWPREFIX)]    = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
 	[RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
 	[RTM_FAM(RTM_GETANYCAST)]   = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
-	[RTM_FAM(RTM_NEWNEIGHTBL)]  = NLMSG_LENGTH(sizeof(struct ndtmsg)),
 };
 
 static const int rta_max[RTM_NR_FAMILIES] =
@@ -125,7 +124,6 @@ static const int rta_max[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX,
 	[RTM_FAM(RTM_NEWTFILTER)]   = TCA_MAX,
 	[RTM_FAM(RTM_NEWACTION)]    = TCAA_MAX,
-	[RTM_FAM(RTM_NEWNEIGHTBL)]  = NDTA_MAX,
 };
 
 void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
-- 
cgit v1.2.3-70-g09d2


From ac5a488ef252ed673cb067843e411f8cc43f7ab9 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Mon, 7 Aug 2006 20:57:31 -0700
Subject: [NET]: Round out in-kernel sockets API

This patch implements wrapper functions that provide a convenient way
to access the sockets API for in-kernel users like sunrpc, cifs &
ocfs2 etc and any future users.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  19 +++++++++
 net/socket.c        | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index b20c53c7441..19da2c08d7b 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -208,6 +208,25 @@ extern int   	     kernel_recvmsg(struct socket *sock, struct msghdr *msg,
 				    struct kvec *vec, size_t num,
 				    size_t len, int flags);
 
+extern int kernel_bind(struct socket *sock, struct sockaddr *addr,
+		       int addrlen);
+extern int kernel_listen(struct socket *sock, int backlog);
+extern int kernel_accept(struct socket *sock, struct socket **newsock,
+			 int flags);
+extern int kernel_connect(struct socket *sock, struct sockaddr *addr,
+			  int addrlen, int flags);
+extern int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
+			      int *addrlen);
+extern int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
+			      int *addrlen);
+extern int kernel_getsockopt(struct socket *sock, int level, int optname,
+			     char *optval, int *optlen);
+extern int kernel_setsockopt(struct socket *sock, int level, int optname,
+			     char *optval, int optlen);
+extern int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+			   size_t size, int flags);
+extern int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg);
+
 #ifndef CONFIG_SMP
 #define SOCKOPS_WRAPPED(name) name
 #define SOCKOPS_WRAP(name, fam)
diff --git a/net/socket.c b/net/socket.c
index 6756e57e1ff..2eaebf934a1 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2170,6 +2170,109 @@ static long compat_sock_ioctl(struct file *file, unsigned cmd,
 }
 #endif
 
+int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
+{
+	return sock->ops->bind(sock, addr, addrlen);
+}
+
+int kernel_listen(struct socket *sock, int backlog)
+{
+	return sock->ops->listen(sock, backlog);
+}
+
+int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err;
+
+	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
+			       newsock);
+	if (err < 0)
+		goto done;
+
+	err = sock->ops->accept(sock, *newsock, flags);
+	if (err < 0) {
+		sock_release(*newsock);
+		goto done;
+	}
+
+	(*newsock)->ops = sock->ops;
+
+done:
+	return err;
+}
+
+int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
+                   int flags)
+{
+	return sock->ops->connect(sock, addr, addrlen, flags);
+}
+
+int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
+			 int *addrlen)
+{
+	return sock->ops->getname(sock, addr, addrlen, 0);
+}
+
+int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
+			 int *addrlen)
+{
+	return sock->ops->getname(sock, addr, addrlen, 1);
+}
+
+int kernel_getsockopt(struct socket *sock, int level, int optname,
+			char *optval, int *optlen)
+{
+	mm_segment_t oldfs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	if (level == SOL_SOCKET)
+		err = sock_getsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->getsockopt(sock, level, optname, optval,
+					    optlen);
+	set_fs(oldfs);
+	return err;
+}
+
+int kernel_setsockopt(struct socket *sock, int level, int optname,
+			char *optval, int optlen)
+{
+	mm_segment_t oldfs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	if (level == SOL_SOCKET)
+		err = sock_setsockopt(sock, level, optname, optval, optlen);
+	else
+		err = sock->ops->setsockopt(sock, level, optname, optval,
+					    optlen);
+	set_fs(oldfs);
+	return err;
+}
+
+int kernel_sendpage(struct socket *sock, struct page *page, int offset,
+		    size_t size, int flags)
+{
+	if (sock->ops->sendpage)
+		return sock->ops->sendpage(sock, page, offset, size, flags);
+
+	return sock_no_sendpage(sock, page, offset, size, flags);
+}
+
+int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
+{
+	mm_segment_t oldfs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = sock->ops->ioctl(sock, cmd, arg);
+	set_fs(oldfs);
+
+	return err;
+}
+
 /* ABI emulation layers need these two */
 EXPORT_SYMBOL(move_addr_to_kernel);
 EXPORT_SYMBOL(move_addr_to_user);
@@ -2186,3 +2289,13 @@ EXPORT_SYMBOL(sock_wake_async);
 EXPORT_SYMBOL(sockfd_lookup);
 EXPORT_SYMBOL(kernel_sendmsg);
 EXPORT_SYMBOL(kernel_recvmsg);
+EXPORT_SYMBOL(kernel_bind);
+EXPORT_SYMBOL(kernel_listen);
+EXPORT_SYMBOL(kernel_accept);
+EXPORT_SYMBOL(kernel_connect);
+EXPORT_SYMBOL(kernel_getsockname);
+EXPORT_SYMBOL(kernel_getpeername);
+EXPORT_SYMBOL(kernel_getsockopt);
+EXPORT_SYMBOL(kernel_setsockopt);
+EXPORT_SYMBOL(kernel_sendpage);
+EXPORT_SYMBOL(kernel_sock_ioctl);
-- 
cgit v1.2.3-70-g09d2


From 8ce11e6a9faf1f1c849b77104adc1642c46aee95 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 7 Aug 2006 21:50:48 -0700
Subject: [NET]: Make code static.

This patch makes needlessly global code static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h              | 4 ----
 net/ipv4/cipso_ipv4.c              | 2 +-
 net/ipv4/fib_rules.c               | 4 ++--
 net/ipv6/fib6_rules.c              | 4 ++--
 net/ipv6/ip6_fib.c                 | 6 +++---
 net/ipv6/route.c                   | 6 +++---
 net/netlabel/netlabel_domainhash.c | 4 ++--
 7 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7b47e8d5a76..c0660cea9a2 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -192,10 +192,6 @@ struct fib6_node		*fib6_locate(struct fib6_node *root,
 					     struct in6_addr *daddr, int dst_len,
 					     struct in6_addr *saddr, int src_len);
 
-extern void			fib6_clean_tree(struct fib6_node *root,
-						int (*func)(struct rt6_info *, void *arg),
-						int prune, void *arg);
-
 extern void			fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
 					       int prune, void *arg);
 
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index b82a101c95c..80a2a0911b4 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -60,7 +60,7 @@ struct cipso_v4_domhsh_entry {
  * if in practice there are a lot of different DOIs this list should
  * probably be turned into a hash table or something similar so we
  * can do quick lookups. */
-DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
 static struct list_head cipso_v4_doi_list = LIST_HEAD_INIT(cipso_v4_doi_list);
 
 /* Label mapping cache */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 23ec6ae1a0f..03d1e8a43a4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -101,8 +101,8 @@ int fib_lookup(struct flowi *flp, struct fib_result *res)
 	return err;
 }
 
-int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags,
-		     struct fib_lookup_arg *arg)
+static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
 {
 	int err = -EAGAIN;
 	struct fib_table *tbl;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 94a46ec967a..bf9bba83b85 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -66,8 +66,8 @@ struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 	return (struct dst_entry *) arg.result;
 }
 
-int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
-		     int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
 {
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ce226c14bef..1f2316187ca 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1169,9 +1169,9 @@ static int fib6_clean_node(struct fib6_walker_t *w)
  *	ignoring pure split nodes) will be scanned.
  */
 
-void fib6_clean_tree(struct fib6_node *root,
-		     int (*func)(struct rt6_info *, void *arg),
-		     int prune, void *arg)
+static void fib6_clean_tree(struct fib6_node *root,
+			    int (*func)(struct rt6_info *, void *arg),
+			    int prune, void *arg)
 {
 	struct fib6_cleaner_t c;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 41c5905d319..e08d84063c1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -613,8 +613,8 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *d
 	return rt;
 }
 
-struct rt6_info *ip6_pol_route_input(struct fib6_table *table, struct flowi *fl,
-				     int flags)
+static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
+					    struct flowi *fl, int flags)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt, *nrt;
@@ -872,7 +872,7 @@ static inline unsigned int ipv6_advmss(unsigned int mtu)
 }
 
 static struct dst_entry *ndisc_dst_gc_list;
-DEFINE_SPINLOCK(ndisc_lock);
+static DEFINE_SPINLOCK(ndisc_lock);
 
 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
 				  struct neighbour *neigh,
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index 5bb3fad4a11..0489a137810 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -50,11 +50,11 @@ struct netlbl_domhsh_tbl {
 /* Domain hash table */
 /* XXX - updates should be so rare that having one spinlock for the entire
  * hash table should be okay */
-DEFINE_SPINLOCK(netlbl_domhsh_lock);
+static DEFINE_SPINLOCK(netlbl_domhsh_lock);
 static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
 
 /* Default domain mapping */
-DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
+static DEFINE_SPINLOCK(netlbl_domhsh_def_lock);
 static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0298f36a579b5bd7f10f6f6d57e5929977a865a1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 7 Aug 2006 21:56:52 -0700
Subject: [IPV4]: Kill fib4_rules_clean().

As noted by Adrian Bunk this function is totally unused.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 -
 net/ipv4/fib_rules.c | 5 -----
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 14c82e611c9..adf73586bc0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -254,7 +254,6 @@ extern struct fib_table *fib_hash_init(int id);
 extern int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 extern void __init fib4_rules_init(void);
-extern void __exit fib4_rules_cleanup(void);
 
 #ifdef CONFIG_NET_CLS_ROUTE
 extern u32 fib_rules_tclass(struct fib_result *res);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 03d1e8a43a4..d242e5291fc 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -347,8 +347,3 @@ void __init fib4_rules_init(void)
 
 	fib_rules_register(&fib4_rules_ops);
 }
-
-void __exit fib4_rules_cleanup(void)
-{
-	fib_rules_unregister(&fib4_rules_ops);
-}
-- 
cgit v1.2.3-70-g09d2


From 1a01912ae0a5666c4c24eaae2b4821711e2ad79a Mon Sep 17 00:00:00 2001
From: Louis Nyffenegger <louis.nyffenegger@gmail.com>
Date: Tue, 8 Aug 2006 00:56:11 -0700
Subject: [INET]: Remove is_setbyuser patch

The value is_setbyuser from struct ip_options is never used and set
only one time (http://linux-net.osdl.org/index.php/TODO#IPV4).
This little patch removes it from the kernel source.

Signed-off-by: Louis Nyffenegger <louis.nyffenegger@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 4 +---
 net/ipv4/ip_options.c   | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index f4caad56cd0..f6242710f2f 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -27,7 +27,6 @@
 /** struct ip_options - IP Options
  *
  * @faddr - Saved first hop address
- * @is_setbyuser - Set by setsockopt?
  * @is_data - Options in __data, rather than skb
  * @is_strictroute - Strict source route
  * @srr_is_hit - Packet destination addr was our one
@@ -42,8 +41,7 @@ struct ip_options {
 	unsigned char	srr;
 	unsigned char	rr;
 	unsigned char	ts;
-	unsigned char	is_setbyuser:1,
-			is_data:1,
+	unsigned char	is_data:1,
 			is_strictroute:1,
 			srr_is_hit:1,
 			is_changed:1,
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index e0a93b4fa8c..e7437c09132 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -525,7 +525,6 @@ static int ip_options_get_finish(struct ip_options **optp,
 		opt->__data[optlen++] = IPOPT_END;
 	opt->optlen = optlen;
 	opt->is_data = 1;
-	opt->is_setbyuser = 1;
 	if (optlen && ip_options_compile(opt, NULL)) {
 		kfree(opt);
 		return -EINVAL;
-- 
cgit v1.2.3-70-g09d2


From 99a92ff50424146ba01a222248fd47a1cd55b78f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 8 Aug 2006 02:18:10 -0700
Subject: [IPV4]: Uninline inet_lookup_listener

By modern standards this function is way too big to be inlined.  It's
even bigger than __inet_lookup_listener :)

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 33 ++-------------------------------
 net/ipv4/inet_hashtables.c    | 35 ++++++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 98e0bb3014f..bd513f3b9c7 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -271,39 +271,10 @@ static inline int inet_iif(const struct sk_buff *skb)
 	return ((struct rtable *)skb->dst)->rt_iif;
 }
 
-extern struct sock *__inet_lookup_listener(const struct hlist_head *head,
-					   const u32 daddr,
-					   const unsigned short hnum,
-					   const int dif);
-
-/* Optimize the common listener case. */
-static inline struct sock *
+extern struct sock *
 		inet_lookup_listener(struct inet_hashinfo *hashinfo,
 				     const u32 daddr,
-				     const unsigned short hnum, const int dif)
-{
-	struct sock *sk = NULL;
-	const struct hlist_head *head;
-
-	read_lock(&hashinfo->lhash_lock);
-	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
-	if (!hlist_empty(head)) {
-		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
-
-		if (inet->num == hnum && !sk->sk_node.next &&
-		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
-		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
-		    !sk->sk_bound_dev_if)
-			goto sherry_cache;
-		sk = __inet_lookup_listener(head, daddr, hnum, dif);
-	}
-	if (sk) {
-sherry_cache:
-		sock_hold(sk);
-	}
-	read_unlock(&hashinfo->lhash_lock);
-	return sk;
-}
+				     const unsigned short hnum, const int dif);
 
 /* Socket demux engine toys. */
 #ifdef __BIG_ENDIAN
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 95fac553299..bfc39066e73 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -124,8 +124,10 @@ EXPORT_SYMBOL(inet_listen_wlock);
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
-				    const unsigned short hnum, const int dif)
+static struct sock *__inet_lookup_listener(const struct hlist_head *head,
+					   const u32 daddr,
+					   const unsigned short hnum,
+					   const int dif)
 {
 	struct sock *result = NULL, *sk;
 	const struct hlist_node *node;
@@ -159,7 +161,34 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad
 	return result;
 }
 
-EXPORT_SYMBOL_GPL(__inet_lookup_listener);
+/* Optimize the common listener case. */
+struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo,
+				  const u32 daddr, const unsigned short hnum,
+				  const int dif)
+{
+	struct sock *sk = NULL;
+	const struct hlist_head *head;
+
+	read_lock(&hashinfo->lhash_lock);
+	head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
+	if (!hlist_empty(head)) {
+		const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+
+		if (inet->num == hnum && !sk->sk_node.next &&
+		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
+		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+		    !sk->sk_bound_dev_if)
+			goto sherry_cache;
+		sk = __inet_lookup_listener(head, daddr, hnum, dif);
+	}
+	if (sk) {
+sherry_cache:
+		sock_hold(sk);
+	}
+	read_unlock(&hashinfo->lhash_lock);
+	return sk;
+}
+EXPORT_SYMBOL_GPL(inet_lookup_listener);
 
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
-- 
cgit v1.2.3-70-g09d2


From 8f491069b40be5d627007a343f99759e9da6a178 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 9 Aug 2006 15:47:12 -0700
Subject: [IPV4]: Use network-order dport for all visible inet_lookup_*

Right now most inet_lookup_* functions take a host-order hnum instead
of a network-order dport because that's how it is represented
internally.

This means that users of these functions have to be careful about
using the right byte-order.  To add more confusion, inet_lookup takes
a network-order dport unlike all other functions.

So this patch changes all visible inet_lookup functions to take a
dport and move all dport->hnum conversion inside them.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_hashtables.h | 31 ++++++++++++++++++++++++-------
 net/dccp/ipv4.c               | 10 +++++-----
 net/ipv4/inet_hashtables.c    | 18 +++++++++---------
 net/ipv4/tcp_ipv4.c           | 10 +++++-----
 4 files changed, 43 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index bd513f3b9c7..b4491c9e2a5 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -271,10 +271,16 @@ static inline int inet_iif(const struct sk_buff *skb)
 	return ((struct rtable *)skb->dst)->rt_iif;
 }
 
-extern struct sock *
-		inet_lookup_listener(struct inet_hashinfo *hashinfo,
-				     const u32 daddr,
-				     const unsigned short hnum, const int dif);
+extern struct sock *__inet_lookup_listener(struct inet_hashinfo *hashinfo,
+					   const u32 daddr,
+					   const unsigned short hnum,
+					   const int dif);
+
+static inline struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo,
+						u32 daddr, u16 dport, int dif)
+{
+	return __inet_lookup_listener(hashinfo, daddr, ntohs(dport), dif);
+}
 
 /* Socket demux engine toys. */
 #ifdef __BIG_ENDIAN
@@ -362,14 +368,25 @@ hit:
 	goto out;
 }
 
+static inline struct sock *
+	inet_lookup_established(struct inet_hashinfo *hashinfo,
+				const u32 saddr, const u16 sport,
+				const u32 daddr, const u16 dport,
+				const int dif)
+{
+	return __inet_lookup_established(hashinfo, saddr, sport, daddr,
+					 ntohs(dport), dif);
+}
+
 static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo,
 					 const u32 saddr, const u16 sport,
-					 const u32 daddr, const u16 hnum,
+					 const u32 daddr, const u16 dport,
 					 const int dif)
 {
+	u16 hnum = ntohs(dport);
 	struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, daddr,
 						    hnum, dif);
-	return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+	return sk ? : __inet_lookup_listener(hashinfo, daddr, hnum, dif);
 }
 
 static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
@@ -380,7 +397,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
 	struct sock *sk;
 
 	local_bh_disable();
-	sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+	sk = __inet_lookup(hashinfo, saddr, sport, daddr, dport, dif);
 	local_bh_enable();
 
 	return sk;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 171d363876e..9a1a76a7dc4 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -608,10 +608,10 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	if (req != NULL)
 		return dccp_check_req(sk, skb, req, prev);
 
-	nsk = __inet_lookup_established(&dccp_hashinfo,
-					iph->saddr, dh->dccph_sport,
-					iph->daddr, ntohs(dh->dccph_dport),
-					inet_iif(skb));
+	nsk = inet_lookup_established(&dccp_hashinfo,
+				      iph->saddr, dh->dccph_sport,
+				      iph->daddr, dh->dccph_dport,
+				      inet_iif(skb));
 	if (nsk != NULL) {
 		if (nsk->sk_state != DCCP_TIME_WAIT) {
 			bh_lock_sock(nsk);
@@ -925,7 +925,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
 	 * 	Look up flow ID in table and get corresponding socket */
 	sk = __inet_lookup(&dccp_hashinfo,
 			   skb->nh.iph->saddr, dh->dccph_sport,
-			   skb->nh.iph->daddr, ntohs(dh->dccph_dport),
+			   skb->nh.iph->daddr, dh->dccph_dport,
 			   inet_iif(skb));
 
 	/* 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bfc39066e73..fb296c9a7f3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -124,10 +124,10 @@ EXPORT_SYMBOL(inet_listen_wlock);
  * remote address for the connection. So always assume those are both
  * wildcarded during the search since they can never be otherwise.
  */
-static struct sock *__inet_lookup_listener(const struct hlist_head *head,
-					   const u32 daddr,
-					   const unsigned short hnum,
-					   const int dif)
+static struct sock *inet_lookup_listener_slow(const struct hlist_head *head,
+					      const u32 daddr,
+					      const unsigned short hnum,
+					      const int dif)
 {
 	struct sock *result = NULL, *sk;
 	const struct hlist_node *node;
@@ -162,9 +162,9 @@ static struct sock *__inet_lookup_listener(const struct hlist_head *head,
 }
 
 /* Optimize the common listener case. */
-struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo,
-				  const u32 daddr, const unsigned short hnum,
-				  const int dif)
+struct sock *__inet_lookup_listener(struct inet_hashinfo *hashinfo,
+				    const u32 daddr, const unsigned short hnum,
+				    const int dif)
 {
 	struct sock *sk = NULL;
 	const struct hlist_head *head;
@@ -179,7 +179,7 @@ struct sock *inet_lookup_listener(struct inet_hashinfo *hashinfo,
 		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
 		    !sk->sk_bound_dev_if)
 			goto sherry_cache;
-		sk = __inet_lookup_listener(head, daddr, hnum, dif);
+		sk = inet_lookup_listener_slow(head, daddr, hnum, dif);
 	}
 	if (sk) {
 sherry_cache:
@@ -188,7 +188,7 @@ sherry_cache:
 	read_unlock(&hashinfo->lhash_lock);
 	return sk;
 }
-EXPORT_SYMBOL_GPL(inet_lookup_listener);
+EXPORT_SYMBOL_GPL(__inet_lookup_listener);
 
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b2aa512a30e..2973dee0a48 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -951,9 +951,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
 
-	nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
-					th->source, skb->nh.iph->daddr,
-					ntohs(th->dest), inet_iif(skb));
+	nsk = inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
+				      th->source, skb->nh.iph->daddr,
+				      th->dest, inet_iif(skb));
 
 	if (nsk) {
 		if (nsk->sk_state != TCP_TIME_WAIT) {
@@ -1090,7 +1090,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->sacked	 = 0;
 
 	sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
-			   skb->nh.iph->daddr, ntohs(th->dest),
+			   skb->nh.iph->daddr, th->dest,
 			   inet_iif(skb));
 
 	if (!sk)
@@ -1168,7 +1168,7 @@ do_time_wait:
 	case TCP_TW_SYN: {
 		struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
 							skb->nh.iph->daddr,
-							ntohs(th->dest),
+							th->dest,
 							inet_iif(skb));
 		if (sk2) {
 			inet_twsk_deschedule((struct inet_timewait_sock *)sk,
-- 
cgit v1.2.3-70-g09d2


From a8731cbf61c8768ea129780b70dc7dfc6795aad4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <steve@chygwyn.com>
Date: Wed, 9 Aug 2006 15:56:46 -0700
Subject: [DECNET]: Covert rules to use generic code

This patch converts the DECnet rules code to use the generic
rules system created by Thomas Graf <tgraf@suug.ch>.

Signed-off-by: Steven Whitehouse <steve@chygwyn.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |   3 +-
 include/net/dn_fib.h      |   8 +-
 net/decnet/Kconfig        |   1 +
 net/decnet/af_decnet.c    |   1 +
 net/decnet/dn_dev.c       |   3 +-
 net/decnet/dn_fib.c       |   1 +
 net/decnet/dn_route.c     |   3 +-
 net/decnet/dn_rules.c     | 494 +++++++++++++++++-----------------------------
 net/decnet/dn_table.c     |   1 +
 9 files changed, 196 insertions(+), 319 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 784a1a29490..0aaffa2ae66 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -534,7 +534,8 @@ enum rtnetlink_groups {
 	RTNLGRP_NOP2,
 	RTNLGRP_DECnet_ROUTE,
 #define RTNLGRP_DECnet_ROUTE	RTNLGRP_DECnet_ROUTE
-	RTNLGRP_NOP3,
+	RTNLGRP_DECnet_RULE,
+#define RTNLGRP_DECnet_RULE	RTNLGRP_DECnet_RULE
 	RTNLGRP_NOP4,
 	RTNLGRP_IPV6_PREFIX,
 #define RTNLGRP_IPV6_PREFIX	RTNLGRP_IPV6_PREFIX
diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index a15dcf0d5c1..32bc8ce5c5c 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -22,7 +22,7 @@ struct dn_kern_rta
 };
 
 struct dn_fib_res {
-	struct dn_fib_rule *r;
+	struct fib_rule *r;
 	struct dn_fib_info *fi;
 	unsigned char prefixlen;
 	unsigned char nh_sel;
@@ -147,10 +147,8 @@ extern void dn_fib_table_cleanup(void);
  */
 extern void dn_fib_rules_init(void);
 extern void dn_fib_rules_cleanup(void);
-extern void dn_fib_rule_put(struct dn_fib_rule *);
-extern __le16 dn_fib_rules_policy(__le16 saddr, struct dn_fib_res *res, unsigned *flags);
 extern unsigned dnet_addr_type(__le16 addr);
-extern int dn_fib_lookup(const struct flowi *fl, struct dn_fib_res *res);
+extern int dn_fib_lookup(struct flowi *fl, struct dn_fib_res *res);
 
 /*
  * rtnetlink interface
@@ -176,7 +174,7 @@ static inline void dn_fib_res_put(struct dn_fib_res *res)
 	if (res->fi)
 		dn_fib_info_put(res->fi);
 	if (res->r)
-		dn_fib_rule_put(res->r);
+		fib_rule_put(res->r);
 }
 
 extern struct dn_fib_table *dn_fib_tables[];
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 92f2ec46fd2..36e72cb145b 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -27,6 +27,7 @@ config DECNET
 config DECNET_ROUTER
 	bool "DECnet: router support (EXPERIMENTAL)"
 	depends on DECNET && EXPERIMENTAL
+	select FIB_RULES
 	---help---
 	  Add support for turning your DECnet Endnode into a level 1 or 2
 	  router.  This is an experimental, but functional option.  If you
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 5486247735f..70e02737568 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -130,6 +130,7 @@ Version 0.0.6    2.1.110   07-aug-98   Eduardo Marcelo Serrat
 #include <linux/poll.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_nsp.h>
 #include <net/dn_dev.h>
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 632c5a90b58..88ea7a13bb2 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -46,6 +46,7 @@
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_dev.h>
 #include <net/dn_route.h>
@@ -1418,8 +1419,6 @@ static struct rtnetlink_link dnet_rtnetlink_table[RTM_NR_MSGTYPES] =
 	[RTM_DELROUTE - RTM_BASE] = { .doit	= dn_fib_rtm_delroute,	},
 	[RTM_GETROUTE - RTM_BASE] = { .doit	= dn_cache_getroute,
 				      .dumpit	= dn_fib_dump,		},
-	[RTM_NEWRULE  - RTM_BASE] = { .doit	= dn_fib_rtm_newrule,	},
-	[RTM_DELRULE  - RTM_BASE] = { .doit	= dn_fib_rtm_delrule,	},
 	[RTM_GETRULE  - RTM_BASE] = { .dumpit	= dn_fib_dump_rules,	},
 #else
 	[RTM_GETROUTE - RTM_BASE] = { .doit	= dn_cache_getroute,
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index fa20e2efcfc..846df3954a6 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -34,6 +34,7 @@
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_route.h>
 #include <net/dn_fib.h>
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 743e9fcf7c5..5e6f4616ca1 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -80,6 +80,7 @@
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_dev.h>
 #include <net/dn_nsp.h>
@@ -1284,7 +1285,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
 		dev_hold(out_dev);
 
 		if (res.r)
-			src_map = dn_fib_rules_policy(fl.fld_src, &res, &flags);
+			src_map = fl.fld_src; /* no NAT support for now */
 
 		gateway = DN_FIB_RES_GW(res);
 		if (res.type == RTN_NAT) {
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 6986be754ef..096f1273e71 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -11,259 +11,198 @@
  *
  *
  * Changes:
+ *              Steve Whitehouse <steve@chygwyn.com>
+ *              Updated for Thomas Graf's generic rules
  *
  */
-#include <linux/string.h>
 #include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
 #include <linux/init.h>
-#include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
 #include <linux/netdevice.h>
-#include <linux/timer.h>
 #include <linux/spinlock.h>
-#include <linux/in_route.h>
 #include <linux/list.h>
 #include <linux/rcupdate.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_fib.h>
 #include <net/dn_neigh.h>
 #include <net/dn_dev.h>
 
+static struct fib_rules_ops dn_fib_rules_ops;
+
 struct dn_fib_rule
 {
-	struct hlist_node	r_hlist;
-	atomic_t		r_clntref;
-	u32			r_preference;
-	unsigned char		r_table;
-	unsigned char		r_action;
-	unsigned char		r_dst_len;
-	unsigned char		r_src_len;
-	__le16			r_src;
-	__le16			r_srcmask;
-	__le16			r_dst;
-	__le16			r_dstmask;
-	__le16			r_srcmap;
-	u8			r_flags;
+	struct fib_rule		common;
+	unsigned char		dst_len;
+	unsigned char		src_len;
+	__le16			src;
+	__le16			srcmask;
+	__le16			dst;
+	__le16			dstmask;
+	__le16			srcmap;
+	u8			flags;
 #ifdef CONFIG_DECNET_ROUTE_FWMARK
-	u32			r_fwmark;
+	u32			fwmark;
 #endif
-	int			r_ifindex;
-	char			r_ifname[IFNAMSIZ];
-	int			r_dead;
-	struct rcu_head		rcu;
 };
 
 static struct dn_fib_rule default_rule = {
-	.r_clntref =		ATOMIC_INIT(2),
-	.r_preference =		0x7fff,
-	.r_table =		RT_TABLE_MAIN,
-	.r_action =		RTN_UNICAST
+	.common = {
+		.refcnt =		ATOMIC_INIT(2),
+		.pref =			0x7fff,
+		.table =		RT_TABLE_MAIN,
+		.action =		FR_ACT_TO_TBL,
+	},
 };
 
-static struct hlist_head dn_fib_rules;
+static LIST_HEAD(dn_fib_rules);
+
 
-int dn_fib_rtm_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
 {
-	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
-	struct dn_fib_rule *r;
-	struct hlist_node *node;
-	int err = -ESRCH;
-
-	hlist_for_each_entry(r, node, &dn_fib_rules, r_hlist) {
-		if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 2) == 0) &&
-			rtm->rtm_src_len == r->r_src_len &&
-			rtm->rtm_dst_len == r->r_dst_len &&
-			(!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 2) == 0) &&
-#ifdef CONFIG_DECNET_ROUTE_FWMARK
-			(!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
-#endif
-			(!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
-			(!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
-			(!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
-			(!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
-
-			err = -EPERM;
-			if (r == &default_rule)
-				break;
-
-			hlist_del_rcu(&r->r_hlist);
-			r->r_dead = 1;
-			dn_fib_rule_put(r);
-			err = 0;
-			break;
-		}
-	}
+	struct fib_lookup_arg arg = {
+		.result = res,
+	};
+	int err;
+
+	err = fib_rules_lookup(&dn_fib_rules_ops, flp, 0, &arg);
+	res->r = arg.rule;
 
 	return err;
 }
 
-static inline void dn_fib_rule_put_rcu(struct rcu_head *head)
+int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp, int flags,
+		       struct fib_lookup_arg *arg)
 {
-	struct dn_fib_rule *r = container_of(head, struct dn_fib_rule, rcu);
-	kfree(r);
-}
+	int err = -EAGAIN;
+	struct dn_fib_table *tbl;
 
-void dn_fib_rule_put(struct dn_fib_rule *r)
-{
-	if (atomic_dec_and_test(&r->r_clntref)) {
-		if (r->r_dead)
-			call_rcu(&r->rcu, dn_fib_rule_put_rcu);
-		else
-			printk(KERN_DEBUG "Attempt to free alive dn_fib_rule\n");
+	switch(rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+
+	case FR_ACT_UNREACHABLE:
+		err = -ENETUNREACH;
+		goto errout;
+
+	case FR_ACT_PROHIBIT:
+		err = -EACCES;
+		goto errout;
+
+	case FR_ACT_BLACKHOLE:
+	default:
+		err = -EINVAL;
+		goto errout;
 	}
+
+	tbl = dn_fib_get_table(rule->table, 0);
+	if (tbl == NULL)
+		goto errout;
+
+	err = tbl->lookup(tbl, flp, (struct dn_fib_res *)arg->result);
+	if (err > 0)
+		err = -EAGAIN;
+errout:
+	return err;
 }
 
+static struct nla_policy dn_fib_rule_policy[FRA_MAX+1] __read_mostly = {
+	[FRA_IFNAME]	= { .type = NLA_STRING },
+	[FRA_PRIORITY]	= { .type = NLA_U32 },
+	[FRA_SRC]	= { .type = NLA_U16 },
+	[FRA_DST]	= { .type = NLA_U16 },
+	[FRA_FWMARK]	= { .type = NLA_U32 },
+};
 
-int dn_fib_rtm_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 {
-	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
-	struct dn_fib_rule *r, *new_r, *last = NULL;
-	struct hlist_node *node = NULL;
-	unsigned char table_id;
-
-	if (rtm->rtm_src_len > 16 || rtm->rtm_dst_len > 16)
-		return -EINVAL;
-
-	if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
-		return -EINVAL;
-
-	if (rtm->rtm_type == RTN_NAT)
-		return -EINVAL;
-
-	table_id = rtm->rtm_table;
-	if (table_id == RT_TABLE_UNSPEC) {
-		struct dn_fib_table *tb;
-		if (rtm->rtm_type == RTN_UNICAST) {
-			if ((tb = dn_fib_empty_table()) == NULL)
-				return -ENOBUFS;
-			table_id = tb->n;
-		}
-	}
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+	u16 daddr = fl->fld_dst;
+	u16 saddr = fl->fld_src;
+
+	if (((saddr ^ r->src) & r->srcmask) ||
+	    ((daddr ^ r->dst) & r->dstmask))
+		return 0;
 
-	new_r = kzalloc(sizeof(*new_r), GFP_KERNEL);
-	if (!new_r)
-		return -ENOMEM;
-
-	if (rta[RTA_SRC-1])
-		memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 2);
-	if (rta[RTA_DST-1])
-		memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 2);
-	if (rta[RTA_GATEWAY-1])
-		memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 2);
-	new_r->r_src_len = rtm->rtm_src_len;
-	new_r->r_dst_len = rtm->rtm_dst_len;
-	new_r->r_srcmask = dnet_make_mask(rtm->rtm_src_len);
-	new_r->r_dstmask = dnet_make_mask(rtm->rtm_dst_len);
 #ifdef CONFIG_DECNET_ROUTE_FWMARK
-	if (rta[RTA_PROTOINFO-1])
-		memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+	if (r->fwmark && (r->fwmark != fl->fld_fwmark))
+		return 0;
 #endif
-	new_r->r_action = rtm->rtm_type;
-	new_r->r_flags = rtm->rtm_flags;
-	if (rta[RTA_PRIORITY-1])
-		memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
-	new_r->r_table = table_id;
-	if (rta[RTA_IIF-1]) {
-		struct net_device *dev;
-		rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
-		new_r->r_ifindex = -1;
-		dev = dev_get_by_name(new_r->r_ifname);
-		if (dev) {
-			new_r->r_ifindex = dev->ifindex;
-			dev_put(dev);
-		}
-	}
 
-	r = container_of(dn_fib_rules.first, struct dn_fib_rule, r_hlist);
-	if (!new_r->r_preference) {
-		if (r && r->r_hlist.next != NULL) {
-			r = container_of(r->r_hlist.next, struct dn_fib_rule, r_hlist);
-			if (r->r_preference)
-				new_r->r_preference = r->r_preference - 1;
+	return 1;
+}
+
+static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+				 struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
+				 struct nlattr **tb)
+{
+	int err = -EINVAL;
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+	if (frh->src_len > 16 || frh->dst_len > 16 || frh->tos)
+		goto  errout;
+
+	if (rule->table == RT_TABLE_UNSPEC) {
+		if (rule->action == FR_ACT_TO_TBL) {
+			struct dn_fib_table *table;
+
+			table = dn_fib_empty_table();
+			if (table == NULL) {
+				err = -ENOBUFS;
+				goto errout;
+			}
+
+			rule->table = table->n;
 		}
 	}
 
-	hlist_for_each_entry(r, node, &dn_fib_rules, r_hlist) {
-		if (r->r_preference > new_r->r_preference)
-			break;
-		last = r;
-	}
-	atomic_inc(&new_r->r_clntref);
+	if (tb[FRA_SRC])
+		r->src = nla_get_u16(tb[FRA_SRC]);
 
-	if (last)
-		hlist_add_after_rcu(&last->r_hlist, &new_r->r_hlist);
-	else
-		hlist_add_before_rcu(&new_r->r_hlist, &r->r_hlist);
-	return 0;
-}
+	if (tb[FRA_DST])
+		r->dst = nla_get_u16(tb[FRA_DST]);
 
+#ifdef CONFIG_DECNET_ROUTE_FWMARK
+	if (tb[FRA_FWMARK])
+		r->fwmark = nla_get_u32(tb[FRA_FWMARK]);
+#endif
+
+	r->src_len = frh->src_len;
+	r->srcmask = dnet_make_mask(r->src_len);
+	r->dst_len = frh->dst_len;
+	r->dstmask = dnet_make_mask(r->dst_len);
+	err = 0;
+errout:
+	return err;
+}
 
-int dn_fib_lookup(const struct flowi *flp, struct dn_fib_res *res)
+static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
 {
-	struct dn_fib_rule *r, *policy;
-	struct dn_fib_table *tb;
-	__le16 saddr = flp->fld_src;
-	__le16 daddr = flp->fld_dst;
-	struct hlist_node *node;
-	int err;
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
+
+	if (frh->src_len && (r->src_len != frh->src_len))
+		return 0;
 
-	rcu_read_lock();
+	if (frh->dst_len && (r->dst_len != frh->dst_len))
+		return 0;
 
-	hlist_for_each_entry_rcu(r, node, &dn_fib_rules, r_hlist) {
-		if (((saddr^r->r_src) & r->r_srcmask) ||
-		    ((daddr^r->r_dst) & r->r_dstmask) ||
 #ifdef CONFIG_DECNET_ROUTE_FWMARK
-		    (r->r_fwmark && r->r_fwmark != flp->fld_fwmark) ||
+	if (tb[FRA_FWMARK] && (r->fwmark != nla_get_u32(tb[FRA_FWMARK])))
+		return 0;
 #endif
-		    (r->r_ifindex && r->r_ifindex != flp->iif))
-			continue;
-
-		switch(r->r_action) {
-			case RTN_UNICAST:
-			case RTN_NAT:
-				policy = r;
-				break;
-			case RTN_UNREACHABLE:
-				rcu_read_unlock();
-				return -ENETUNREACH;
-			default:
-			case RTN_BLACKHOLE:
-				rcu_read_unlock();
-				return -EINVAL;
-			case RTN_PROHIBIT:
-				rcu_read_unlock();
-				return -EACCES;
-		}
 
-		if ((tb = dn_fib_get_table(r->r_table, 0)) == NULL)
-			continue;
-		err = tb->lookup(tb, flp, res);
-		if (err == 0) {
-			res->r = policy;
-			if (policy)
-				atomic_inc(&policy->r_clntref);
-			rcu_read_unlock();
-			return 0;
-		}
-		if (err < 0 && err != -EAGAIN) {
-			rcu_read_unlock();
-			return err;
-		}
-	}
+	if (tb[FRA_SRC] && (r->src != nla_get_u32(tb[FRA_SRC])))
+		return 0;
+
+	if (tb[FRA_DST] && (r->dst != nla_get_u32(tb[FRA_DST])))
+		return 0;
 
-	rcu_read_unlock();
-	return -ESRCH;
+	return 1;
 }
 
 unsigned dnet_addr_type(__le16 addr)
@@ -284,142 +223,77 @@ unsigned dnet_addr_type(__le16 addr)
 	return ret;
 }
 
-__le16 dn_fib_rules_policy(__le16 saddr, struct dn_fib_res *res, unsigned *flags)
+static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+			    struct nlmsghdr *nlh, struct fib_rule_hdr *frh)
 {
-	struct dn_fib_rule *r = res->r;
-
-	if (r->r_action == RTN_NAT) {
-		int addrtype = dnet_addr_type(r->r_srcmap);
-
-		if (addrtype == RTN_NAT) {
-			saddr = (saddr&~r->r_srcmask)|r->r_srcmap;
-			*flags |= RTCF_SNAT;
-		} else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) {
-			saddr = r->r_srcmap;
-			*flags |= RTCF_MASQ;
-		}
-	}
-	return saddr;
-}
+	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
 
-static void dn_fib_rules_detach(struct net_device *dev)
-{
-	struct hlist_node *node;
-	struct dn_fib_rule *r;
+	frh->family = AF_DECnet;
+	frh->dst_len = r->dst_len;
+	frh->src_len = r->src_len;
+	frh->tos = 0;
 
-	hlist_for_each_entry(r, node, &dn_fib_rules, r_hlist) {
-		if (r->r_ifindex == dev->ifindex)
-			r->r_ifindex = -1;
-	}
-}
+#ifdef CONFIG_DECNET_ROUTE_FWMARK
+	if (r->fwmark)
+		NLA_PUT_U32(skb, FRA_FWMARK, r->fwmark);
+#endif
+	if (r->dst_len)
+		NLA_PUT_U16(skb, FRA_DST, r->dst);
+	if (r->src_len)
+		NLA_PUT_U16(skb, FRA_SRC, r->src);
 
-static void dn_fib_rules_attach(struct net_device *dev)
-{
-	struct hlist_node *node;
-	struct dn_fib_rule *r;
+	return 0;
 
-	hlist_for_each_entry(r, node, &dn_fib_rules, r_hlist) {
-		if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0)
-			r->r_ifindex = dev->ifindex;
-	}
+nla_put_failure:
+	return -ENOBUFS;
 }
 
-static int dn_fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
+static u32 dn_fib_rule_default_pref(void)
 {
-	struct net_device *dev = ptr;
-
-	switch(event) {
-		case NETDEV_UNREGISTER:
-			dn_fib_rules_detach(dev);
-			dn_fib_sync_down(0, dev, 1);
-		case NETDEV_REGISTER:
-			dn_fib_rules_attach(dev);
-			dn_fib_sync_up(dev);
+	struct list_head *pos;
+	struct fib_rule *rule;
+
+	if (!list_empty(&dn_fib_rules)) {
+		pos = dn_fib_rules.next;
+		if (pos->next != &dn_fib_rules) {
+			rule = list_entry(pos->next, struct fib_rule, list);
+			if (rule->pref)
+				return rule->pref - 1;
+		}
 	}
 
-	return NOTIFY_DONE;
-}
-
-
-static struct notifier_block dn_fib_rules_notifier = {
-	.notifier_call =	dn_fib_rules_event,
-};
-
-static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r,
-			    struct netlink_callback *cb, unsigned int flags)
-{
-	struct rtmsg *rtm;
-	struct nlmsghdr *nlh;
-	unsigned char *b = skb->tail;
-
-
-	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
-	rtm = NLMSG_DATA(nlh);
-	rtm->rtm_family = AF_DECnet;
-	rtm->rtm_dst_len = r->r_dst_len;
-	rtm->rtm_src_len = r->r_src_len;
-	rtm->rtm_tos = 0;
-#ifdef CONFIG_DECNET_ROUTE_FWMARK
-	if (r->r_fwmark)
-		RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
-#endif
-	rtm->rtm_table = r->r_table;
-	rtm->rtm_protocol = 0;
-	rtm->rtm_scope = 0;
-	rtm->rtm_type = r->r_action;
-	rtm->rtm_flags = r->r_flags;
-
-	if (r->r_dst_len)
-		RTA_PUT(skb, RTA_DST, 2, &r->r_dst);
-	if (r->r_src_len)
-		RTA_PUT(skb, RTA_SRC, 2, &r->r_src);
-	if (r->r_ifname[0])
-		RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
-	if (r->r_preference)
-		RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
-	if (r->r_srcmap)
-		RTA_PUT(skb, RTA_GATEWAY, 2, &r->r_srcmap);
-	nlh->nlmsg_len = skb->tail - b;
-	return skb->len;
-
-nlmsg_failure:
-rtattr_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
+	return 0;
 }
 
 int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int idx = 0;
-	int s_idx = cb->args[0];
-	struct dn_fib_rule *r;
-	struct hlist_node *node;
-
-	rcu_read_lock();
-	hlist_for_each_entry(r, node, &dn_fib_rules, r_hlist) {
-		if (idx < s_idx)
-			goto next;
-		if (dn_fib_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
-			break;
-next:
-		idx++;
-	}
-	rcu_read_unlock();
-	cb->args[0] = idx;
-
-	return skb->len;
+	return fib_rules_dump(skb, cb, AF_DECnet);
 }
 
+static struct fib_rules_ops dn_fib_rules_ops = {
+	.family		= AF_DECnet,
+	.rule_size	= sizeof(struct dn_fib_rule),
+	.action		= dn_fib_rule_action,
+	.match		= dn_fib_rule_match,
+	.configure	= dn_fib_rule_configure,
+	.compare	= dn_fib_rule_compare,
+	.fill		= dn_fib_rule_fill,
+	.default_pref	= dn_fib_rule_default_pref,
+	.nlgroup	= RTNLGRP_DECnet_RULE,
+	.policy		= dn_fib_rule_policy,
+	.rules_list	= &dn_fib_rules,
+	.owner		= THIS_MODULE,
+};
+
 void __init dn_fib_rules_init(void)
 {
-	INIT_HLIST_HEAD(&dn_fib_rules);
-	hlist_add_head(&default_rule.r_hlist, &dn_fib_rules);
-	register_netdevice_notifier(&dn_fib_rules_notifier);
+	list_add_tail(&default_rule.common.list, &dn_fib_rules);
+	fib_rules_register(&dn_fib_rules_ops);
 }
 
 void __exit dn_fib_rules_cleanup(void)
 {
-	unregister_netdevice_notifier(&dn_fib_rules_notifier);
+	fib_rules_unregister(&dn_fib_rules_ops);
 }
 
 
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index e926c952e36..2e01b67398c 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -30,6 +30,7 @@
 #include <net/neighbour.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/fib_rules.h>
 #include <net/dn.h>
 #include <net/dn_route.h>
 #include <net/dn_fib.h>
-- 
cgit v1.2.3-70-g09d2


From 757dbb494be3309fe41ce4c62f8057d8b41d8897 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 9 Aug 2006 20:50:00 -0700
Subject: [NET]: drop unused elements from net_proto_family

Three values in net_proto_family are defined but never used.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 19da2c08d7b..1bd76327ee2 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -169,11 +169,6 @@ struct proto_ops {
 struct net_proto_family {
 	int		family;
 	int		(*create)(struct socket *sock, int protocol);
-	/* These are counters for the number of different methods of
-	   each we support */
-	short		authentication;
-	short		encryption;
-	short		encrypt_net;
 	struct module	*owner;
 };
 
-- 
cgit v1.2.3-70-g09d2


From f0fd27d42e39b91f85e1840ec49b072fd6c545b8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 9 Aug 2006 21:03:17 -0700
Subject: [NET]: sock_register interface changes

The sock_register() doesn't change the family, so the protocols can
define it read-only.  No caller ever checks return value from
sock_unregister()

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h |  4 ++--
 net/socket.c        | 10 ++++------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index 1bd76327ee2..c257f716e00 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -176,8 +176,8 @@ struct iovec;
 struct kvec;
 
 extern int	     sock_wake_async(struct socket *sk, int how, int band);
-extern int	     sock_register(struct net_proto_family *fam);
-extern int	     sock_unregister(int family);
+extern int	     sock_register(const struct net_proto_family *fam);
+extern void	     sock_unregister(int family);
 extern int	     sock_create(int family, int type, int proto,
 				 struct socket **res);
 extern int	     sock_create_kern(int family, int type, int proto,
diff --git a/net/socket.c b/net/socket.c
index b5a3fcb9ed6..4147fe4bf41 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -147,7 +147,7 @@ static struct file_operations socket_file_ops = {
  */
 
 static DEFINE_SPINLOCK(net_family_lock);
-static const struct net_proto_family *net_families[NPROTO];
+static const struct net_proto_family *net_families[NPROTO] __read_mostly;
 
 /*
  *	Statistics counters of the socket lists
@@ -2080,7 +2080,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
  *	socket interface. The value ops->family coresponds to the
  *	socket system call protocol family.
  */
-int sock_register(struct net_proto_family *ops)
+int sock_register(const struct net_proto_family *ops)
 {
 	int err;
 
@@ -2116,10 +2116,9 @@ int sock_register(struct net_proto_family *ops)
  *	a module then it needs to provide its own protection in
  *	the ops->create routine.
  */
-int sock_unregister(int family)
+void sock_unregister(int family)
 {
-	if (family < 0 || family >= NPROTO)
-		return -EINVAL;
+	BUG_ON(family < 0 || family >= NPROTO);
 
 	spin_lock(&net_family_lock);
 	net_families[family] = NULL;
@@ -2128,7 +2127,6 @@ int sock_unregister(int family)
 	synchronize_rcu();
 
 	printk(KERN_INFO "NET: Unregistered protocol family %d\n", family);
-	return 0;
 }
 
 static int __init sock_init(void)
-- 
cgit v1.2.3-70-g09d2


From d924424aaed116b362c6d0e667d912b77e655085 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 10 Aug 2006 23:03:23 -0700
Subject: [NEIGHBOUR]: Use ALIGN() macro.

Rather than opencoding the mask, it looks better to use ALIGN()
macro from kernel.h.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 74c4b6ff8a5..bd187daffdb 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -101,7 +101,7 @@ struct neighbour
 	__u8			dead;
 	atomic_t		probes;
 	rwlock_t		lock;
-	unsigned char		ha[(MAX_ADDR_LEN+sizeof(unsigned long)-1)&~(sizeof(unsigned long)-1)];
+	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
 	struct hh_cache		*hh;
 	atomic_t		refcnt;
 	int			(*output)(struct sk_buff *skb);
-- 
cgit v1.2.3-70-g09d2


From 2dfe55b47e3d66ded5a84caf71e0da5710edf48b Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:08:33 -0700
Subject: [NET]: Use u32 for routing table IDs

Use u32 for routing table IDs in net/ipv4 and net/decnet in preparation of
support for a larger number of routing tables. net/ipv6 already uses u32
everywhere and needs no further changes. No functional changes are made by
this patch.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dn_fib.h     |  4 ++--
 include/net/ip_fib.h     | 14 +++++++-------
 net/decnet/dn_fib.c      |  6 +++---
 net/decnet/dn_table.c    | 10 +++++-----
 net/ipv4/fib_frontend.c  |  8 ++++----
 net/ipv4/fib_hash.c      |  4 ++--
 net/ipv4/fib_lookup.h    |  4 ++--
 net/ipv4/fib_rules.c     |  2 +-
 net/ipv4/fib_semantics.c |  4 ++--
 net/ipv4/fib_trie.c      |  6 +++---
 10 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index 32bc8ce5c5c..cd9c3782f83 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -94,7 +94,7 @@ struct dn_fib_node {
 
 
 struct dn_fib_table {
-	int n;
+	u32 n;
 
 	int (*insert)(struct dn_fib_table *t, struct rtmsg *r, 
 			struct dn_kern_rta *rta, struct nlmsghdr *n, 
@@ -137,7 +137,7 @@ extern int dn_fib_sync_up(struct net_device *dev);
 /*
  * dn_tables.c
  */
-extern struct dn_fib_table *dn_fib_get_table(int n, int creat);
+extern struct dn_fib_table *dn_fib_get_table(u32 n, int creat);
 extern struct dn_fib_table *dn_fib_empty_table(void);
 extern void dn_fib_table_init(void);
 extern void dn_fib_table_cleanup(void);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index adf73586bc0..0dcbf166eb9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -150,7 +150,7 @@ struct fib_result_nl {
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */
 
 struct fib_table {
-	unsigned char	tb_id;
+	u32		tb_id;
 	unsigned	tb_stamp;
 	int		(*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res);
 	int		(*tb_insert)(struct fib_table *table, struct rtmsg *r,
@@ -173,14 +173,14 @@ struct fib_table {
 extern struct fib_table *ip_fib_local_table;
 extern struct fib_table *ip_fib_main_table;
 
-static inline struct fib_table *fib_get_table(int id)
+static inline struct fib_table *fib_get_table(u32 id)
 {
 	if (id != RT_TABLE_LOCAL)
 		return ip_fib_main_table;
 	return ip_fib_local_table;
 }
 
-static inline struct fib_table *fib_new_table(int id)
+static inline struct fib_table *fib_new_table(u32 id)
 {
 	return fib_get_table(id);
 }
@@ -205,9 +205,9 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result
 
 extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
 extern int fib_lookup(struct flowi *flp, struct fib_result *res);
-extern struct fib_table *__fib_new_table(int id);
+extern struct fib_table *__fib_new_table(u32 id);
 
-static inline struct fib_table *fib_get_table(int id)
+static inline struct fib_table *fib_get_table(u32 id)
 {
 	if (id == 0)
 		id = RT_TABLE_MAIN;
@@ -215,7 +215,7 @@ static inline struct fib_table *fib_get_table(int id)
 	return fib_tables[id];
 }
 
-static inline struct fib_table *fib_new_table(int id)
+static inline struct fib_table *fib_new_table(u32 id)
 {
 	if (id == 0)
 		id = RT_TABLE_MAIN;
@@ -248,7 +248,7 @@ extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
 extern u32  __fib_res_prefsrc(struct fib_result *res);
 
 /* Exported by fib_hash.c */
-extern struct fib_table *fib_hash_init(int id);
+extern struct fib_table *fib_hash_init(u32 id);
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 extern int fib4_rules_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index ed5fb5c3eab..7b3bf5c3d72 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -534,8 +534,8 @@ int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 
 int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int t;
-	int s_t;
+	u32 t;
+	u32 s_t;
 	struct dn_fib_table *tb;
 
 	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
@@ -765,7 +765,7 @@ void dn_fib_flush(void)
 {
         int flushed = 0;
         struct dn_fib_table *tb;
-        int id;
+        u32 id;
 
         for(id = RT_TABLE_MAX; id > 0; id--) {
                 if ((tb = dn_fib_get_table(id, 0)) == NULL)
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 2e01b67398c..1601ee5406a 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -264,7 +264,7 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
 }
 
 static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
-                        u8 tb_id, u8 type, u8 scope, void *dst, int dst_len,
+                        u32 tb_id, u8 type, u8 scope, void *dst, int dst_len,
                         struct dn_fib_info *fi, unsigned int flags)
 {
         struct rtmsg *rtm;
@@ -327,7 +327,7 @@ rtattr_failure:
 }
 
 
-static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
+static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
                         struct nlmsghdr *nlh, struct netlink_skb_parms *req)
 {
         struct sk_buff *skb;
@@ -740,7 +740,7 @@ out:
 }
 
 
-struct dn_fib_table *dn_fib_get_table(int n, int create)
+struct dn_fib_table *dn_fib_get_table(u32 n, int create)
 {
         struct dn_fib_table *t;
 
@@ -777,7 +777,7 @@ struct dn_fib_table *dn_fib_get_table(int n, int create)
         return t;
 }
 
-static void dn_fib_del_tree(int n)
+static void dn_fib_del_tree(u32 n)
 {
 	struct dn_fib_table *t;
 
@@ -791,7 +791,7 @@ static void dn_fib_del_tree(int n)
 
 struct dn_fib_table *dn_fib_empty_table(void)
 {
-        int id;
+        u32 id;
 
         for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++)
                 if (dn_fib_tables[id] == NULL)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index a83f1aa8034..06f4b23f6f5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -62,7 +62,7 @@ struct fib_table *ip_fib_main_table;
 
 struct fib_table *fib_tables[RT_TABLE_MAX+1];
 
-struct fib_table *__fib_new_table(int id)
+struct fib_table *__fib_new_table(u32 id)
 {
 	struct fib_table *tb;
 
@@ -82,7 +82,7 @@ static void fib_flush(void)
 	int flushed = 0;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_table *tb;
-	int id;
+	u32 id;
 
 	for (id = RT_TABLE_MAX; id>0; id--) {
 		if ((tb = fib_get_table(id))==NULL)
@@ -333,8 +333,8 @@ int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int t;
-	int s_t;
+	u32 t;
+	u32 s_t;
 	struct fib_table *tb;
 
 	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 72c633b357c..f8d5c8024cc 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -765,9 +765,9 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
 }
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(int id)
+struct fib_table * fib_hash_init(u32 id)
 #else
-struct fib_table * __init fib_hash_init(int id)
+struct fib_table * __init fib_hash_init(u32 id)
 #endif
 {
 	struct fib_table *tb;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ef6609ea0eb..ddd52496b45 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,11 +30,11 @@ extern struct fib_info *fib_create_info(const struct rtmsg *r,
 extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
 			struct kern_rta *rta, struct fib_info *fi);
 extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
-			 u8 tb_id, u8 type, u8 scope, void *dst,
+			 u32 tb_id, u8 type, u8 scope, void *dst,
 			 int dst_len, u8 tos, struct fib_info *fi,
 			 unsigned int);
 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
-		      int z, int tb_id,
+		      int z, u32 tb_id,
 		      struct nlmsghdr *n, struct netlink_skb_parms *req);
 extern struct fib_alias *fib_find_alias(struct list_head *fah,
 					u8 tos, u32 prio);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index d242e5291fc..58fb91b00fd 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -169,7 +169,7 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 
 static struct fib_table *fib_empty_table(void)
 {
-	int id;
+	u32 id;
 
 	for (id = 1; id <= RT_TABLE_MAX; id++)
 		if (fib_tables[id] == NULL)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 38bca473c7e..c7a112b5a18 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -273,7 +273,7 @@ int ip_fib_check_default(u32 gw, struct net_device *dev)
 }
 
 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
-	       int z, int tb_id,
+	       int z, u32 tb_id,
 	       struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
 	struct sk_buff *skb;
@@ -939,7 +939,7 @@ u32 __fib_res_prefsrc(struct fib_result *res)
 
 int
 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
-	      u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
+	      u32 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
 	      struct fib_info *fi, unsigned int flags)
 {
 	struct rtmsg *rtm;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 01801c0f885..4a27b2d573a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1148,7 +1148,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	key = ntohl(key);
 
-	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
 
 	mask = ntohl(inet_make_mask(plen));
 
@@ -1943,9 +1943,9 @@ out:
 /* Fix more generic FIB names for init later */
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-struct fib_table * fib_hash_init(int id)
+struct fib_table * fib_hash_init(u32 id)
 #else
-struct fib_table * __init fib_hash_init(int id)
+struct fib_table * __init fib_hash_init(u32 id)
 #endif
 {
 	struct fib_table *tb;
-- 
cgit v1.2.3-70-g09d2


From 9e762a4a89b302cb3b26a1f9bb33eff459eaeca9 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:09:48 -0700
Subject: [NET]: Introduce RTA_TABLE/FRA_TABLE attributes

Introduce RTA_TABLE route attribute and FRA_TABLE routing rule attribute
to hold 32 bit routing table IDs. Usespace compatibility is provided by
continuing to accept and send the rtm_table field, but because of its
limited size it can only carry the low 8 bits of the table ID. This
implies that if larger IDs are used, _all_ userspace programs using them
need to use RTA_TABLE.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fib_rules.h |  4 ++++
 include/linux/rtnetlink.h |  8 ++++++++
 include/net/fib_rules.h   |  7 +++++++
 net/core/fib_rules.c      |  5 +++--
 net/decnet/dn_fib.c       |  7 ++++---
 net/decnet/dn_route.c     |  1 +
 net/decnet/dn_table.c     |  1 +
 net/ipv4/fib_frontend.c   |  7 ++++---
 net/ipv4/fib_rules.c      |  1 +
 net/ipv4/fib_semantics.c  |  1 +
 net/ipv4/route.c          |  1 +
 net/ipv6/fib6_rules.c     |  1 +
 net/ipv6/route.c          | 13 +++++++++----
 13 files changed, 45 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 5e503f0ca6e..19a82b6c1c1 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -36,6 +36,10 @@ enum
 	FRA_UNUSED5,
 	FRA_FWMARK,	/* netfilter mark (IPv4) */
 	FRA_FLOW,	/* flow/class id */
+	FRA_UNUSED6,
+	FRA_UNUSED7,
+	FRA_UNUSED8,
+	FRA_TABLE,	/* Extended table id */
 	__FRA_MAX
 };
 
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0aaffa2ae66..ea422a539a0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -264,6 +264,7 @@ enum rtattr_type_t
 	RTA_CACHEINFO,
 	RTA_SESSION,
 	RTA_MP_ALGO,
+	RTA_TABLE,
 	__RTA_MAX
 };
 
@@ -717,6 +718,13 @@ extern void __rtnl_unlock(void);
 	} \
 } while(0)
 
+static inline u32 rtm_get_table(struct rtattr **rta, u8 table)
+{
+	return RTA_GET_U32(rta[RTA_TABLE-1]);
+rtattr_failure:
+	return table;
+}
+
 #endif /* __KERNEL__ */
 
 
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 61375d9e53f..8e2f473d3e8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -74,6 +74,13 @@ static inline void fib_rule_put(struct fib_rule *rule)
 		call_rcu(&rule->rcu, fib_rule_put_rcu);
 }
 
+static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla)
+{
+	if (nla[FRA_TABLE])
+		return nla_get_u32(nla[FRA_TABLE]);
+	return frh->table;
+}
+
 extern int			fib_rules_register(struct fib_rules_ops *);
 extern int			fib_rules_unregister(struct fib_rules_ops *);
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 6cdad24038e..873b04d5df8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -187,7 +187,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	rule->action = frh->action;
 	rule->flags = frh->flags;
-	rule->table = frh->table;
+	rule->table = frh_get_table(frh, tb);
 
 	if (!rule->pref && ops->default_pref)
 		rule->pref = ops->default_pref();
@@ -245,7 +245,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		if (frh->action && (frh->action != rule->action))
 			continue;
 
-		if (frh->table && (frh->table != rule->table))
+		if (frh->table && (frh_get_table(frh, tb) != rule->table))
 			continue;
 
 		if (tb[FRA_PRIORITY] &&
@@ -291,6 +291,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 
 	frh = nlmsg_data(nlh);
 	frh->table = rule->table;
+	NLA_PUT_U32(skb, FRA_TABLE, rule->table);
 	frh->res1 = 0;
 	frh->res2 = 0;
 	frh->action = rule->action;
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 7b3bf5c3d72..fb596373daa 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -491,7 +491,8 @@ static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta)
 		if (attr) {
 			if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2)
 				return -EINVAL;
-			if (i != RTA_MULTIPATH && i != RTA_METRICS)
+			if (i != RTA_MULTIPATH && i != RTA_METRICS &&
+			    i != RTA_TABLE)
 				rta[i-1] = (struct rtattr *)RTA_DATA(attr);
 		}
 	}
@@ -508,7 +509,7 @@ int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if (dn_fib_check_attr(r, rta))
 		return -EINVAL;
 
-	tb = dn_fib_get_table(r->rtm_table, 0);
+	tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 0);
 	if (tb)
 		return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
 
@@ -524,7 +525,7 @@ int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if (dn_fib_check_attr(r, rta))
 		return -EINVAL;
 
-	tb = dn_fib_get_table(r->rtm_table, 1);
+	tb = dn_fib_get_table(rtm_get_table(rta, r->rtm_table), 1);
 	if (tb) 
 		return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb));
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5e6f4616ca1..4c963213fba 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1486,6 +1486,7 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
 	r->rtm_src_len = 0;
 	r->rtm_tos = 0;
 	r->rtm_table = RT_TABLE_MAIN;
+	RTA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
 	r->rtm_type = rt->rt_type;
 	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
 	r->rtm_scope = RT_SCOPE_UNIVERSE;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 1601ee5406a..eca7c1e10c8 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -278,6 +278,7 @@ static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
         rtm->rtm_src_len = 0;
         rtm->rtm_tos = 0;
         rtm->rtm_table = tb_id;
+	RTA_PUT_U32(skb, RTA_TABLE, tb_id);
         rtm->rtm_flags = fi->fib_flags;
         rtm->rtm_scope = scope;
 	rtm->rtm_type  = type;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 06f4b23f6f5..2696ede52de 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -294,7 +294,8 @@ static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
 		if (attr) {
 			if (RTA_PAYLOAD(attr) < 4)
 				return -EINVAL;
-			if (i != RTA_MULTIPATH && i != RTA_METRICS)
+			if (i != RTA_MULTIPATH && i != RTA_METRICS &&
+			    i != RTA_TABLE)
 				*rta = (struct rtattr*)RTA_DATA(attr);
 		}
 	}
@@ -310,7 +311,7 @@ int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (inet_check_attr(r, rta))
 		return -EINVAL;
 
-	tb = fib_get_table(r->rtm_table);
+	tb = fib_get_table(rtm_get_table(rta, r->rtm_table));
 	if (tb)
 		return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
 	return -ESRCH;
@@ -325,7 +326,7 @@ int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	if (inet_check_attr(r, rta))
 		return -EINVAL;
 
-	tb = fib_new_table(r->rtm_table);
+	tb = fib_new_table(rtm_get_table(rta, r->rtm_table));
 	if (tb)
 		return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
 	return -ENOBUFS;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 58fb91b00fd..0330b9cc4b5 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -184,6 +184,7 @@ static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = {
 	[FRA_DST]	= { .type = NLA_U32 },
 	[FRA_FWMARK]	= { .type = NLA_U32 },
 	[FRA_FLOW]	= { .type = NLA_U32 },
+	[FRA_TABLE]	= { .type = NLA_U32 },
 };
 
 static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c7a112b5a18..ab753df20a3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -953,6 +953,7 @@ fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	rtm->rtm_src_len = 0;
 	rtm->rtm_tos = tos;
 	rtm->rtm_table = tb_id;
+	RTA_PUT_U32(skb, RTA_TABLE, tb_id);
 	rtm->rtm_type = type;
 	rtm->rtm_flags = fi->fib_flags;
 	rtm->rtm_scope = scope;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b873cbcdd0b..12128b82c9d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2652,6 +2652,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	r->rtm_src_len	= 0;
 	r->rtm_tos	= rt->fl.fl4_tos;
 	r->rtm_table	= RT_TABLE_MAIN;
+	RTA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
 	r->rtm_type	= rt->rt_type;
 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
 	r->rtm_protocol = RTPROT_UNSPEC;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 22a2fdb0983..2c4fbc855e6 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -129,6 +129,7 @@ static struct nla_policy fib6_rule_policy[RTA_MAX+1] __read_mostly = {
 	[FRA_PRIORITY]	= { .type = NLA_U32 },
 	[FRA_SRC]	= { .minlen = sizeof(struct in6_addr) },
 	[FRA_DST]	= { .minlen = sizeof(struct in6_addr) },
+	[FRA_TABLE]	= { .type = NLA_U32 },
 };
 
 static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e08d84063c1..843c5509fce 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1859,7 +1859,8 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
+	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb),
+			     rtm_get_table(arg, r->rtm_table));
 }
 
 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1869,7 +1870,8 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
+	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb),
+			     rtm_get_table(arg, r->rtm_table));
 }
 
 struct rt6_rtnl_dump_arg
@@ -1887,6 +1889,7 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 	struct rta_cacheinfo ci;
+	u32 table;
 
 	if (prefix) {	/* user wants prefix routes only */
 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -1902,9 +1905,11 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 	rtm->rtm_src_len = rt->rt6i_src.plen;
 	rtm->rtm_tos = 0;
 	if (rt->rt6i_table)
-		rtm->rtm_table = rt->rt6i_table->tb6_id;
+		table = rt->rt6i_table->tb6_id;
 	else
-		rtm->rtm_table = RT6_TABLE_UNSPEC;
+		table = RT6_TABLE_UNSPEC;
+	rtm->rtm_table = table;
+	RTA_PUT_U32(skb, RTA_TABLE, table);
 	if (rt->rt6i_flags&RTF_REJECT)
 		rtm->rtm_type = RTN_UNREACHABLE;
 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
-- 
cgit v1.2.3-70-g09d2


From 1af5a8c4a11cfed0c9a7f30fcfb689981750599c Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:10:46 -0700
Subject: [IPV4]: Increase number of possible routing tables to 2^32

Increase the number of possible routing tables to 2^32 by replacing the
fixed sized array of pointers by a hash table and replacing iterations
over all possible table IDs by hash table walking.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  25 +++---------
 net/ipv4/fib_frontend.c | 102 ++++++++++++++++++++++++++++++++----------------
 net/ipv4/fib_hash.c     |  26 ++++++------
 net/ipv4/fib_rules.c    |   4 +-
 net/ipv4/fib_trie.c     |  26 ++++++------
 5 files changed, 101 insertions(+), 82 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 0dcbf166eb9..8e9ba563d34 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -150,6 +150,7 @@ struct fib_result_nl {
 #endif /* CONFIG_IP_ROUTE_MULTIPATH_WRANDOM */
 
 struct fib_table {
+	struct hlist_node tb_hlist;
 	u32		tb_id;
 	unsigned	tb_stamp;
 	int		(*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res);
@@ -200,29 +201,13 @@ static inline void fib_select_default(const struct flowi *flp, struct fib_result
 }
 
 #else /* CONFIG_IP_MULTIPLE_TABLES */
-#define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL])
-#define ip_fib_main_table (fib_tables[RT_TABLE_MAIN])
+#define ip_fib_local_table fib_get_table(RT_TABLE_LOCAL)
+#define ip_fib_main_table fib_get_table(RT_TABLE_MAIN)
 
-extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
 extern int fib_lookup(struct flowi *flp, struct fib_result *res);
-extern struct fib_table *__fib_new_table(u32 id);
-
-static inline struct fib_table *fib_get_table(u32 id)
-{
-	if (id == 0)
-		id = RT_TABLE_MAIN;
-
-	return fib_tables[id];
-}
-
-static inline struct fib_table *fib_new_table(u32 id)
-{
-	if (id == 0)
-		id = RT_TABLE_MAIN;
-
-	return fib_tables[id] ? : __fib_new_table(id);
-}
 
+extern struct fib_table *fib_new_table(u32 id);
+extern struct fib_table *fib_get_table(u32 id);
 extern void fib_select_default(const struct flowi *flp, struct fib_result *res);
 
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2696ede52de..ad4c14f968a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -37,6 +37,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/init.h>
+#include <linux/list.h>
 
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -51,48 +52,67 @@
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
-#define RT_TABLE_MIN RT_TABLE_MAIN
-
 struct fib_table *ip_fib_local_table;
 struct fib_table *ip_fib_main_table;
 
-#else
+#define FIB_TABLE_HASHSZ 1
+static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
 
-#define RT_TABLE_MIN 1
+#else
 
-struct fib_table *fib_tables[RT_TABLE_MAX+1];
+#define FIB_TABLE_HASHSZ 256
+static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
 
-struct fib_table *__fib_new_table(u32 id)
+struct fib_table *fib_new_table(u32 id)
 {
 	struct fib_table *tb;
+	unsigned int h;
 
+	if (id == 0)
+		id = RT_TABLE_MAIN;
+	tb = fib_get_table(id);
+	if (tb)
+		return tb;
 	tb = fib_hash_init(id);
 	if (!tb)
 		return NULL;
-	fib_tables[id] = tb;
+	h = id & (FIB_TABLE_HASHSZ - 1);
+	hlist_add_head_rcu(&tb->tb_hlist, &fib_table_hash[h]);
 	return tb;
 }
 
+struct fib_table *fib_get_table(u32 id)
+{
+	struct fib_table *tb;
+	struct hlist_node *node;
+	unsigned int h;
 
+	if (id == 0)
+		id = RT_TABLE_MAIN;
+	h = id & (FIB_TABLE_HASHSZ - 1);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb_hlist) {
+		if (tb->tb_id == id) {
+			rcu_read_unlock();
+			return tb;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
-
 static void fib_flush(void)
 {
 	int flushed = 0;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
 	struct fib_table *tb;
-	u32 id;
+	struct hlist_node *node;
+	unsigned int h;
 
-	for (id = RT_TABLE_MAX; id>0; id--) {
-		if ((tb = fib_get_table(id))==NULL)
-			continue;
-		flushed += tb->tb_flush(tb);
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist)
+			flushed += tb->tb_flush(tb);
 	}
-#else /* CONFIG_IP_MULTIPLE_TABLES */
-	flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
-	flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
-#endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 	if (flushed)
 		rt_cache_flush(-1);
@@ -334,29 +354,37 @@ int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	u32 t;
-	u32 s_t;
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
 	struct fib_table *tb;
+	struct hlist_node *node;
+	int dumped = 0;
 
 	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
 	    ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
 		return ip_rt_dump(skb, cb);
 
-	s_t = cb->args[0];
-	if (s_t == 0)
-		s_t = cb->args[0] = RT_TABLE_MIN;
-
-	for (t=s_t; t<=RT_TABLE_MAX; t++) {
-		if (t < s_t) continue;
-		if (t > s_t)
-			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
-		if ((tb = fib_get_table(t))==NULL)
-			continue;
-		if (tb->tb_dump(tb, skb, cb) < 0) 
-			break;
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
+	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		hlist_for_each_entry(tb, node, &fib_table_hash[h], tb_hlist) {
+			if (e < s_e)
+				goto next;
+			if (dumped)
+				memset(&cb->args[2], 0, sizeof(cb->args) -
+				                 2 * sizeof(cb->args[0]));
+			if (tb->tb_dump(tb, skb, cb) < 0)
+				goto out;
+			dumped = 1;
+next:
+			e++;
+		}
 	}
-
-	cb->args[0] = t;
+out:
+	cb->args[1] = e;
+	cb->args[0] = h;
 
 	return skb->len;
 }
@@ -654,9 +682,15 @@ static struct notifier_block fib_netdev_notifier = {
 
 void __init ip_fib_init(void)
 {
+	unsigned int i;
+
+	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
+		INIT_HLIST_HEAD(&fib_table_hash[i]);
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 	ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
+	hlist_add_head_rcu(&ip_fib_local_table->tb_hlist, &fib_table_hash[0]);
 	ip_fib_main_table  = fib_hash_init(RT_TABLE_MAIN);
+	hlist_add_head_rcu(&ip_fib_main_table->tb_hlist, &fib_table_hash[0]);
 #else
 	fib4_rules_init();
 #endif
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index f8d5c8024cc..b5bee1a71e5 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -684,7 +684,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 	struct fib_node *f;
 	int i, s_i;
 
-	s_i = cb->args[3];
+	s_i = cb->args[4];
 	i = 0;
 	hlist_for_each_entry(f, node, head, fn_hash) {
 		struct fib_alias *fa;
@@ -704,14 +704,14 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 					  fa->fa_tos,
 					  fa->fa_info,
 					  NLM_F_MULTI) < 0) {
-				cb->args[3] = i;
+				cb->args[4] = i;
 				return -1;
 			}
 		next:
 			i++;
 		}
 	}
-	cb->args[3] = i;
+	cb->args[4] = i;
 	return skb->len;
 }
 
@@ -722,21 +722,21 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	int h, s_h;
 
-	s_h = cb->args[2];
+	s_h = cb->args[3];
 	for (h=0; h < fz->fz_divisor; h++) {
 		if (h < s_h) continue;
 		if (h > s_h)
-			memset(&cb->args[3], 0,
-			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
+			memset(&cb->args[4], 0,
+			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
 		if (fz->fz_hash == NULL ||
 		    hlist_empty(&fz->fz_hash[h]))
 			continue;
 		if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
-			cb->args[2] = h;
+			cb->args[3] = h;
 			return -1;
 		}
 	}
-	cb->args[2] = h;
+	cb->args[3] = h;
 	return skb->len;
 }
 
@@ -746,21 +746,21 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
 	struct fn_zone *fz;
 	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
 
-	s_m = cb->args[1];
+	s_m = cb->args[2];
 	read_lock(&fib_hash_lock);
 	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
 		if (m < s_m) continue;
 		if (m > s_m)
-			memset(&cb->args[2], 0,
-			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
+			memset(&cb->args[3], 0,
+			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
-			cb->args[1] = m;
+			cb->args[2] = m;
 			read_unlock(&fib_hash_lock);
 			return -1;
 		}
 	}
 	read_unlock(&fib_hash_lock);
-	cb->args[1] = m;
+	cb->args[2] = m;
 	return skb->len;
 }
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 0330b9cc4b5..ce185ac6f26 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -172,8 +172,8 @@ static struct fib_table *fib_empty_table(void)
 	u32 id;
 
 	for (id = 1; id <= RT_TABLE_MAX; id++)
-		if (fib_tables[id] == NULL)
-			return __fib_new_table(id);
+		if (fib_get_table(id) == NULL)
+			return fib_new_table(id);
 	return NULL;
 }
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a27b2d573a..2a580eb2579 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1848,7 +1848,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 
 	u32 xkey = htonl(key);
 
-	s_i = cb->args[3];
+	s_i = cb->args[4];
 	i = 0;
 
 	/* rcu_read_lock is hold by caller */
@@ -1870,12 +1870,12 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 				  plen,
 				  fa->fa_tos,
 				  fa->fa_info, 0) < 0) {
-			cb->args[3] = i;
+			cb->args[4] = i;
 			return -1;
 		}
 		i++;
 	}
-	cb->args[3] = i;
+	cb->args[4] = i;
 	return skb->len;
 }
 
@@ -1886,14 +1886,14 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 	struct list_head *fa_head;
 	struct leaf *l = NULL;
 
-	s_h = cb->args[2];
+	s_h = cb->args[3];
 
 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
-			memset(&cb->args[3], 0,
-			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
+			memset(&cb->args[4], 0,
+			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
 
 		fa_head = get_fa_head(l, plen);
 
@@ -1904,11 +1904,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str
 			continue;
 
 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
-			cb->args[2] = h;
+			cb->args[3] = h;
 			return -1;
 		}
 	}
-	cb->args[2] = h;
+	cb->args[3] = h;
 	return skb->len;
 }
 
@@ -1917,23 +1917,23 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
 	int m, s_m;
 	struct trie *t = (struct trie *) tb->tb_data;
 
-	s_m = cb->args[1];
+	s_m = cb->args[2];
 
 	rcu_read_lock();
 	for (m = 0; m <= 32; m++) {
 		if (m < s_m)
 			continue;
 		if (m > s_m)
-			memset(&cb->args[2], 0,
-				sizeof(cb->args) - 2*sizeof(cb->args[0]));
+			memset(&cb->args[3], 0,
+				sizeof(cb->args) - 3*sizeof(cb->args[0]));
 
 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
-			cb->args[1] = m;
+			cb->args[2] = m;
 			goto out;
 		}
 	}
 	rcu_read_unlock();
-	cb->args[1] = m;
+	cb->args[2] = m;
 	return skb->len;
 out:
 	rcu_read_unlock();
-- 
cgit v1.2.3-70-g09d2


From 1b43af5480c351dbcb2eef478bafe179cbeb6e83 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:11:17 -0700
Subject: [IPV6]: Increase number of possible routing tables to 2^32

Increase number of possible routing tables to 2^32 by replacing iterations
over all possible table IDs by hash table walking.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |   7 ++
 net/ipv6/ip6_fib.c      | 171 ++++++++++++++++++++++++++++++++++++++++++------
 net/ipv6/route.c        | 128 +-----------------------------------
 3 files changed, 159 insertions(+), 147 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 9bfa3cc6ced..01bfe404784 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -137,6 +137,13 @@ extern int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
 extern int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 extern int inet6_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 
+struct rt6_rtnl_dump_arg
+{
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+};
+
+extern int rt6_dump_route(struct rt6_info *rt, void *p_arg);
 extern void rt6_ifdown(struct net_device *dev);
 extern void rt6_mtu_change(struct net_device *dev, unsigned mtu);
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1f2316187ca..bececbe9dd2 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -158,7 +158,26 @@ static struct fib6_table fib6_main_tbl = {
 };
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+#define FIB_TABLE_HASHSZ 256
+#else
+#define FIB_TABLE_HASHSZ 1
+#endif
+static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+
+static void fib6_link_table(struct fib6_table *tb)
+{
+	unsigned int h;
+
+	h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
 
+	/*
+	 * No protection necessary, this is the only list mutatation
+	 * operation, tables never disappear once they exist.
+	 */
+	hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
+}
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 static struct fib6_table fib6_local_tbl = {
 	.tb6_id		= RT6_TABLE_LOCAL,
 	.tb6_lock	= RW_LOCK_UNLOCKED,
@@ -168,9 +187,6 @@ static struct fib6_table fib6_local_tbl = {
 	},
 };
 
-#define FIB_TABLE_HASHSZ 256
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
-
 static struct fib6_table *fib6_alloc_table(u32 id)
 {
 	struct fib6_table *table;
@@ -186,19 +202,6 @@ static struct fib6_table *fib6_alloc_table(u32 id)
 	return table;
 }
 
-static void fib6_link_table(struct fib6_table *tb)
-{
-	unsigned int h;
-
-	h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1);
-
-	/*
-	 * No protection necessary, this is the only list mutatation
-	 * operation, tables never disappear once they exist.
-	 */
-	hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
-}
-
 struct fib6_table *fib6_new_table(u32 id)
 {
 	struct fib6_table *tb;
@@ -263,10 +266,135 @@ struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 
 static void __init fib6_tables_init(void)
 {
+	fib6_link_table(&fib6_main_tbl);
 }
 
 #endif
 
+static int fib6_dump_node(struct fib6_walker_t *w)
+{
+	int res;
+	struct rt6_info *rt;
+
+	for (rt = w->leaf; rt; rt = rt->u.next) {
+		res = rt6_dump_route(rt, w->args);
+		if (res < 0) {
+			/* Frame is full, suspend walking */
+			w->leaf = rt;
+			return 1;
+		}
+		BUG_TRAP(res!=0);
+	}
+	w->leaf = NULL;
+	return 0;
+}
+
+static void fib6_dump_end(struct netlink_callback *cb)
+{
+	struct fib6_walker_t *w = (void*)cb->args[2];
+
+	if (w) {
+		cb->args[2] = 0;
+		kfree(w);
+	}
+	cb->done = (void*)cb->args[3];
+	cb->args[1] = 3;
+}
+
+static int fib6_dump_done(struct netlink_callback *cb)
+{
+	fib6_dump_end(cb);
+	return cb->done ? cb->done(cb) : 0;
+}
+
+static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
+			   struct netlink_callback *cb)
+{
+	struct fib6_walker_t *w;
+	int res;
+
+	w = (void *)cb->args[2];
+	w->root = &table->tb6_root;
+
+	if (cb->args[4] == 0) {
+		read_lock_bh(&table->tb6_lock);
+		res = fib6_walk(w);
+		read_unlock_bh(&table->tb6_lock);
+		if (res > 0)
+			cb->args[4] = 1;
+	} else {
+		read_lock_bh(&table->tb6_lock);
+		res = fib6_walk_continue(w);
+		read_unlock_bh(&table->tb6_lock);
+		if (res != 0) {
+			if (res < 0)
+				fib6_walker_unlink(w);
+			goto end;
+		}
+		fib6_walker_unlink(w);
+		cb->args[4] = 0;
+	}
+end:
+	return res;
+}
+
+int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct rt6_rtnl_dump_arg arg;
+	struct fib6_walker_t *w;
+	struct fib6_table *tb;
+	struct hlist_node *node;
+	int res = 0;
+
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
+	w = (void *)cb->args[2];
+	if (w == NULL) {
+		/* New dump:
+		 *
+		 * 1. hook callback destructor.
+		 */
+		cb->args[3] = (long)cb->done;
+		cb->done = fib6_dump_done;
+
+		/*
+		 * 2. allocate and initialize walker.
+		 */
+		w = kzalloc(sizeof(*w), GFP_ATOMIC);
+		if (w == NULL)
+			return -ENOMEM;
+		w->func = fib6_dump_node;
+		cb->args[2] = (long)w;
+	}
+
+	arg.skb = skb;
+	arg.cb = cb;
+	w->args = &arg;
+
+	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+		e = 0;
+		hlist_for_each_entry(tb, node, &fib_table_hash[h], tb6_hlist) {
+			if (e < s_e)
+				goto next;
+			res = fib6_dump_table(tb, skb, cb);
+			if (res != 0)
+				goto out;
+next:
+			e++;
+		}
+	}
+out:
+	cb->args[1] = e;
+	cb->args[0] = h;
+
+	res = res < 0 ? res : skb->len;
+	if (res <= 0)
+		fib6_dump_end(cb);
+	return res;
+}
 
 /*
  *	Routing Table
@@ -1187,17 +1315,20 @@ static void fib6_clean_tree(struct fib6_node *root,
 void fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
 		    int prune, void *arg)
 {
-	int i;
 	struct fib6_table *table;
+	struct hlist_node *node;
+	unsigned int h;
 
-	for (i = FIB6_TABLE_MIN; i <= FIB6_TABLE_MAX; i++) {
-		table = fib6_get_table(i);
-		if (table != NULL) {
+	rcu_read_lock();
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		hlist_for_each_entry_rcu(table, node, &fib_table_hash[h],
+					 tb6_hlist) {
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(&table->tb6_root, func, prune, arg);
 			write_unlock_bh(&table->tb6_lock);
 		}
 	}
+	rcu_read_unlock();
 }
 
 static int fib6_prune_clone(struct rt6_info *rt, void *arg)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 843c5509fce..9ce28277f47 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1874,12 +1874,6 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 			     rtm_get_table(arg, r->rtm_table));
 }
 
-struct rt6_rtnl_dump_arg
-{
-	struct sk_buff *skb;
-	struct netlink_callback *cb;
-};
-
 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *dst, struct in6_addr *src,
 			 int iif, int type, u32 pid, u32 seq,
@@ -1976,7 +1970,7 @@ rtattr_failure:
 	return -1;
 }
 
-static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
+int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
 	int prefix;
@@ -1992,126 +1986,6 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 		     prefix, NLM_F_MULTI);
 }
 
-static int fib6_dump_node(struct fib6_walker_t *w)
-{
-	int res;
-	struct rt6_info *rt;
-
-	for (rt = w->leaf; rt; rt = rt->u.next) {
-		res = rt6_dump_route(rt, w->args);
-		if (res < 0) {
-			/* Frame is full, suspend walking */
-			w->leaf = rt;
-			return 1;
-		}
-		BUG_TRAP(res!=0);
-	}
-	w->leaf = NULL;
-	return 0;
-}
-
-static void fib6_dump_end(struct netlink_callback *cb)
-{
-	struct fib6_walker_t *w = (void*)cb->args[0];
-
-	if (w) {
-		cb->args[0] = 0;
-		kfree(w);
-	}
-	cb->done = (void*)cb->args[1];
-	cb->args[1] = 0;
-}
-
-static int fib6_dump_done(struct netlink_callback *cb)
-{
-	fib6_dump_end(cb);
-	return cb->done ? cb->done(cb) : 0;
-}
-
-int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	struct fib6_table *table;
-	struct rt6_rtnl_dump_arg arg;
-	struct fib6_walker_t *w;
-	int i, res = 0;
-
-	arg.skb = skb;
-	arg.cb = cb;
-
-	/*
-	 * cb->args[0] = pointer to walker structure
-	 * cb->args[1] = saved cb->done() pointer
-	 * cb->args[2] = current table being dumped
-	 */
-
-	w = (void*)cb->args[0];
-	if (w == NULL) {
-		/* New dump:
-		 * 
-		 * 1. hook callback destructor.
-		 */
-		cb->args[1] = (long)cb->done;
-		cb->done = fib6_dump_done;
-
-		/*
-		 * 2. allocate and initialize walker.
-		 */
-		w = kzalloc(sizeof(*w), GFP_ATOMIC);
-		if (w == NULL)
-			return -ENOMEM;
-		w->func = fib6_dump_node;
-		w->args = &arg;
-		cb->args[0] = (long)w;
-		cb->args[2] = FIB6_TABLE_MIN;
-	} else {
-		w->args = &arg;
-		i = cb->args[2];
-		if (i > FIB6_TABLE_MAX)
-			goto end;
-
-		table = fib6_get_table(i);
-		if (table != NULL) {
-			read_lock_bh(&table->tb6_lock);
-			w->root = &table->tb6_root;
-			res = fib6_walk_continue(w);
-			read_unlock_bh(&table->tb6_lock);
-			if (res != 0) {
-				if (res < 0)
-					fib6_walker_unlink(w);
-				goto end;
-			}
-		}
-
-		fib6_walker_unlink(w);
-		cb->args[2] = ++i;
-	}
-
-	for (i = cb->args[2]; i <= FIB6_TABLE_MAX; i++) {
-		table = fib6_get_table(i);
-		if (table == NULL)
-			continue;
-
-		read_lock_bh(&table->tb6_lock);
-		w->root = &table->tb6_root;
-		res = fib6_walk(w);
-		read_unlock_bh(&table->tb6_lock);
-		if (res)
-			break;
-	}
-end:
-	cb->args[2] = i;
-
-	res = res < 0 ? res : skb->len;
-	/* res < 0 is an error. (really, impossible)
-	   res == 0 means that dump is complete, but skb still can contain data.
-	   res > 0 dump is not complete, but frame is full.
-	 */
-	/* Destroy walker, if dump of this table is complete. */
-	if (res <= 0)
-		fib6_dump_end(cb);
-	return res;
-}
-
 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 {
 	struct rtattr **rta = arg;
-- 
cgit v1.2.3-70-g09d2


From abcab268303c22d24fc89fedd35d82271d20f5da Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:11:47 -0700
Subject: [DECNET]: Increase number of possible routing tables to 2^32

Increase the number of possible routing tables to 2^32 by replacing the
fixed sized array of pointers by a hash table and replacing iterations
over all possible table IDs by hash table walking.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dn_fib.h  |   3 +-
 net/decnet/dn_fib.c   |  49 --------------------
 net/decnet/dn_rules.c |   2 +-
 net/decnet/dn_table.c | 125 ++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 93 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index cd9c3782f83..d97aa10c463 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -94,6 +94,7 @@ struct dn_fib_node {
 
 
 struct dn_fib_table {
+	struct hlist_node hlist;
 	u32 n;
 
 	int (*insert)(struct dn_fib_table *t, struct rtmsg *r, 
@@ -177,8 +178,6 @@ static inline void dn_fib_res_put(struct dn_fib_res *res)
 		fib_rule_put(res->r);
 }
 
-extern struct dn_fib_table *dn_fib_tables[];
-
 #else /* Endnode */
 
 #define dn_fib_init()  do { } while(0)
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index fb596373daa..5ccca3ed53b 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -532,39 +532,6 @@ int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	return -ENOBUFS;
 }
 
-
-int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	u32 t;
-	u32 s_t;
-	struct dn_fib_table *tb;
-
-	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
-		((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
-			return dn_cache_dump(skb, cb);
-
-	s_t = cb->args[0];
-	if (s_t == 0)
-		s_t = cb->args[0] = RT_MIN_TABLE;
-
-	for(t = s_t; t <= RT_TABLE_MAX; t++) {
-		if (t < s_t)
-			continue;
-		if (t > s_t)
-			memset(&cb->args[1], 0,
-			       sizeof(cb->args) - sizeof(cb->args[0]));
-		tb = dn_fib_get_table(t, 0);
-		if (tb == NULL)
-			continue;
-		if (tb->dump(tb, skb, cb) < 0)
-			break;
-	}
-
-	cb->args[0] = t;
-
-	return skb->len;
-}
-
 static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
 {
 	struct dn_fib_table *tb;
@@ -762,22 +729,6 @@ int dn_fib_sync_up(struct net_device *dev)
         return ret;
 }
 
-void dn_fib_flush(void)
-{
-        int flushed = 0;
-        struct dn_fib_table *tb;
-        u32 id;
-
-        for(id = RT_TABLE_MAX; id > 0; id--) {
-                if ((tb = dn_fib_get_table(id, 0)) == NULL)
-                        continue;
-                flushed += tb->flush(tb);
-        }
-
-        if (flushed)
-                dn_rt_cache_flush(-1);
-}
-
 static struct notifier_block dn_fib_dnaddr_notifier = {
 	.notifier_call = dn_fib_dnaddr_event,
 };
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 096f1273e71..878312ff34e 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -210,7 +210,7 @@ unsigned dnet_addr_type(__le16 addr)
 	struct flowi fl = { .nl_u = { .dn_u = { .daddr = addr } } };
 	struct dn_fib_res res;
 	unsigned ret = RTN_UNICAST;
-	struct dn_fib_table *tb = dn_fib_tables[RT_TABLE_LOCAL];
+	struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
 
 	res.r = NULL;
 
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index eca7c1e10c8..10e87262b6f 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -75,9 +75,9 @@ for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
 for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
 
 #define RT_TABLE_MIN 1
-
+#define DN_FIB_TABLE_HASHSZ 256
+static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
 static DEFINE_RWLOCK(dn_fib_tables_lock);
-struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1];
 
 static kmem_cache_t *dn_hash_kmem __read_mostly;
 static int dn_fib_hash_zombies;
@@ -361,7 +361,7 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
 {
 	int i, s_i;
 
-	s_i = cb->args[3];
+	s_i = cb->args[4];
 	for(i = 0; f; i++, f = f->fn_next) {
 		if (i < s_i)
 			continue;
@@ -374,11 +374,11 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
 				(f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
 				f->fn_scope, &f->fn_key, dz->dz_order, 
 				f->fn_info, NLM_F_MULTI) < 0) {
-			cb->args[3] = i;
+			cb->args[4] = i;
 			return -1;
 		}
 	}
-	cb->args[3] = i;
+	cb->args[4] = i;
 	return skb->len;
 }
 
@@ -389,20 +389,20 @@ static __inline__ int dn_hash_dump_zone(struct sk_buff *skb,
 {
 	int h, s_h;
 
-	s_h = cb->args[2];
+	s_h = cb->args[3];
 	for(h = 0; h < dz->dz_divisor; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
-			memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
+			memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0]));
 		if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL)
 			continue;
 		if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) {
-			cb->args[2] = h;
+			cb->args[3] = h;
 			return -1;
 		}
 	}
-	cb->args[2] = h;
+	cb->args[3] = h;
 	return skb->len;
 }
 
@@ -413,26 +413,63 @@ static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb,
 	struct dn_zone *dz;
 	struct dn_hash *table = (struct dn_hash *)tb->data;
 
-	s_m = cb->args[1];
+	s_m = cb->args[2];
 	read_lock(&dn_fib_tables_lock);
 	for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) {
 		if (m < s_m)
 			continue;
 		if (m > s_m)
-			memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0]));
+			memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
 
 		if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) {
-			cb->args[1] = m;
+			cb->args[2] = m;
 			read_unlock(&dn_fib_tables_lock);
 			return -1;
 		}
 	}
 	read_unlock(&dn_fib_tables_lock);
-	cb->args[1] = m;
+	cb->args[2] = m;
 
         return skb->len;
 }
 
+int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	unsigned int h, s_h;
+	unsigned int e = 0, s_e;
+	struct dn_fib_table *tb;
+	struct hlist_node *node;
+	int dumped = 0;
+
+	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+		((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+			return dn_cache_dump(skb, cb);
+
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
+	for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) {
+		e = 0;
+		hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist) {
+			if (e < s_e)
+				goto next;
+			if (dumped)
+				memset(&cb->args[2], 0, sizeof(cb->args) -
+				                 2 * sizeof(cb->args[0]));
+			if (tb->dump(tb, skb, cb) < 0)
+				goto out;
+			dumped = 1;
+next:
+			e++;
+		}
+	}
+out:
+	cb->args[1] = e;
+	cb->args[0] = h;
+
+	return skb->len;
+}
+
 static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
 	struct dn_hash *table = (struct dn_hash *)tb->data;
@@ -744,6 +781,8 @@ out:
 struct dn_fib_table *dn_fib_get_table(u32 n, int create)
 {
         struct dn_fib_table *t;
+	struct hlist_node *node;
+	unsigned int h;
 
         if (n < RT_TABLE_MIN)
                 return NULL;
@@ -751,8 +790,15 @@ struct dn_fib_table *dn_fib_get_table(u32 n, int create)
         if (n > RT_TABLE_MAX)
                 return NULL;
 
-        if (dn_fib_tables[n]) 
-                return dn_fib_tables[n];
+	h = n & (DN_FIB_TABLE_HASHSZ - 1);
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(t, node, &dn_fib_table_hash[h], hlist) {
+		if (t->n == n) {
+			rcu_read_unlock();
+			return t;
+		}
+	}
+	rcu_read_unlock();
 
         if (!create)
                 return NULL;
@@ -773,33 +819,37 @@ struct dn_fib_table *dn_fib_get_table(u32 n, int create)
         t->flush  = dn_fib_table_flush;
         t->dump = dn_fib_table_dump;
 	memset(t->data, 0, sizeof(struct dn_hash));
-        dn_fib_tables[n] = t;
+	hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]);
 
         return t;
 }
 
-static void dn_fib_del_tree(u32 n)
-{
-	struct dn_fib_table *t;
-
-	write_lock(&dn_fib_tables_lock);
-	t = dn_fib_tables[n];
-	dn_fib_tables[n] = NULL;
-	write_unlock(&dn_fib_tables_lock);
-
-	kfree(t);
-}
-
 struct dn_fib_table *dn_fib_empty_table(void)
 {
         u32 id;
 
         for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++)
-                if (dn_fib_tables[id] == NULL)
+		if (dn_fib_get_table(id, 0) == NULL)
                         return dn_fib_get_table(id, 1);
         return NULL;
 }
 
+void dn_fib_flush(void)
+{
+        int flushed = 0;
+        struct dn_fib_table *tb;
+	struct hlist_node *node;
+	unsigned int h;
+
+	for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
+		hlist_for_each_entry(tb, node, &dn_fib_table_hash[h], hlist)
+	                flushed += tb->flush(tb);
+        }
+
+        if (flushed)
+                dn_rt_cache_flush(-1);
+}
+
 void __init dn_fib_table_init(void)
 {
 	dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
@@ -810,10 +860,17 @@ void __init dn_fib_table_init(void)
 
 void __exit dn_fib_table_cleanup(void)
 {
-	int i;
-
-	for (i = RT_TABLE_MIN; i <= RT_TABLE_MAX; ++i)
-		dn_fib_del_tree(i);
+	struct dn_fib_table *t;
+	struct hlist_node *node, *next;
+	unsigned int h;
 
-	return;
+	write_lock(&dn_fib_tables_lock);
+	for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
+		hlist_for_each_entry_safe(t, node, next, &dn_fib_table_hash[h],
+		                          hlist) {
+			hlist_del(&t->hlist);
+			kfree(t);
+		}
+	}
+	write_unlock(&dn_fib_tables_lock);
 }
-- 
cgit v1.2.3-70-g09d2


From b801f54917b7c6e8540f877ee562cd0725e62ebd Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Thu, 10 Aug 2006 23:12:34 -0700
Subject: [NET]: Increate RT_TABLE_MAX to 2^32

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ea422a539a0..7e4aa48680a 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -239,10 +239,8 @@ enum rt_class_t
 	RT_TABLE_DEFAULT=253,
 	RT_TABLE_MAIN=254,
 	RT_TABLE_LOCAL=255,
-	__RT_TABLE_MAX
+	RT_TABLE_MAX=0xFFFFFFFF
 };
-#define RT_TABLE_MAX (__RT_TABLE_MAX - 1)
-
 
 
 /* Routing message attributes */
-- 
cgit v1.2.3-70-g09d2


From 90d41122f79c8c3687d965dde4c6d30a6e0cac4c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 14 Aug 2006 23:49:16 -0700
Subject: [IPV6] ip6_fib.c: make code static

Make the following needlessly global code static:
- fib6_walker_lock
- struct fib6_walker_list
- fib6_walk_continue()
- fib6_walk()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 25 -------------------------
 net/ipv6/ip6_fib.c    | 29 ++++++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c0660cea9a2..69c44420978 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -92,28 +92,6 @@ struct fib6_walker_t
 	void *args;
 };
 
-extern struct fib6_walker_t fib6_walker_list;
-extern rwlock_t fib6_walker_lock;
-
-static inline void fib6_walker_link(struct fib6_walker_t *w)
-{
-	write_lock_bh(&fib6_walker_lock);
-	w->next = fib6_walker_list.next;
-	w->prev = &fib6_walker_list;
-	w->next->prev = w;
-	w->prev->next = w;
-	write_unlock_bh(&fib6_walker_lock);
-}
-
-static inline void fib6_walker_unlink(struct fib6_walker_t *w)
-{
-	write_lock_bh(&fib6_walker_lock);
-	w->next->prev = w->prev;
-	w->prev->next = w->next;
-	w->prev = w->next = w;
-	write_unlock_bh(&fib6_walker_lock);
-}
-
 struct rt6_statistics {
 	__u32		fib_nodes;
 	__u32		fib_route_nodes;
@@ -195,9 +173,6 @@ struct fib6_node		*fib6_locate(struct fib6_node *root,
 extern void			fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
 					       int prune, void *arg);
 
-extern int			fib6_walk(struct fib6_walker_t *w);
-extern int			fib6_walk_continue(struct fib6_walker_t *w);
-
 extern int			fib6_add(struct fib6_node *root,
 					 struct rt6_info *rt,
 					 struct nlmsghdr *nlh,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index bececbe9dd2..be36f4acda9 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -69,8 +69,7 @@ struct fib6_cleaner_t
 	void *arg;
 };
 
-DEFINE_RWLOCK(fib6_walker_lock);
-
+static DEFINE_RWLOCK(fib6_walker_lock);
 
 #ifdef CONFIG_IPV6_SUBTREES
 #define FWS_INIT FWS_S
@@ -82,6 +81,8 @@ DEFINE_RWLOCK(fib6_walker_lock);
 
 static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
 static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
+static int fib6_walk(struct fib6_walker_t *w);
+static int fib6_walk_continue(struct fib6_walker_t *w);
 
 /*
  *	A routing update causes an increase of the serial number on the
@@ -94,13 +95,31 @@ static __u32 rt_sernum;
 
 static DEFINE_TIMER(ip6_fib_timer, fib6_run_gc, 0, 0);
 
-struct fib6_walker_t fib6_walker_list = {
+static struct fib6_walker_t fib6_walker_list = {
 	.prev	= &fib6_walker_list,
 	.next	= &fib6_walker_list, 
 };
 
 #define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next)
 
+static inline void fib6_walker_link(struct fib6_walker_t *w)
+{
+	write_lock_bh(&fib6_walker_lock);
+	w->next = fib6_walker_list.next;
+	w->prev = &fib6_walker_list;
+	w->next->prev = w;
+	w->prev->next = w;
+	write_unlock_bh(&fib6_walker_lock);
+}
+
+static inline void fib6_walker_unlink(struct fib6_walker_t *w)
+{
+	write_lock_bh(&fib6_walker_lock);
+	w->next->prev = w->prev;
+	w->prev->next = w->next;
+	w->prev = w->next = w;
+	write_unlock_bh(&fib6_walker_lock);
+}
 static __inline__ u32 fib6_new_sernum(void)
 {
 	u32 n = ++rt_sernum;
@@ -1173,7 +1192,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct ne
  *	<0  -> walk is terminated by an error.
  */
 
-int fib6_walk_continue(struct fib6_walker_t *w)
+static int fib6_walk_continue(struct fib6_walker_t *w)
 {
 	struct fib6_node *fn, *pn;
 
@@ -1247,7 +1266,7 @@ int fib6_walk_continue(struct fib6_walker_t *w)
 	}
 }
 
-int fib6_walk(struct fib6_walker_t *w)
+static int fib6_walk(struct fib6_walker_t *w)
 {
 	int res;
 
-- 
cgit v1.2.3-70-g09d2


From 2aa7f36cdb332a32849afbf25fcbf35dce5b1940 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 14 Aug 2006 23:55:20 -0700
Subject: [DECNET]: cleanups

- make the following needlessly global functions static:
  - dn_fib.c: dn_fib_sync_down()
  - dn_fib.c: dn_fib_sync_up()
  - dn_rules.c: dn_fib_rule_action()
- remove the following unneeded prototype:
  - dn_fib.c: dn_cache_dump()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dn_fib.h  | 3 ---
 net/decnet/dn_fib.c   | 9 +++++----
 net/decnet/dn_rules.c | 4 ++--
 3 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h
index d97aa10c463..f01626cbbed 100644
--- a/include/net/dn_fib.h
+++ b/include/net/dn_fib.h
@@ -131,9 +131,6 @@ extern __le16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type);
 extern void dn_fib_flush(void);
 extern void dn_fib_select_multipath(const struct flowi *fl,
 					struct dn_fib_res *res);
-extern int dn_fib_sync_down(__le16 local, struct net_device *dev,
-				int force);
-extern int dn_fib_sync_up(struct net_device *dev);
 
 /*
  * dn_tables.c
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 5ccca3ed53b..1cf010124ec 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -55,8 +55,6 @@
 
 #define endfor_nexthops(fi) }
 
-extern int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb);
-
 static DEFINE_SPINLOCK(dn_fib_multipath_lock);
 static struct dn_fib_info *dn_fib_info_list;
 static DEFINE_SPINLOCK(dn_fib_info_lock);
@@ -80,6 +78,9 @@ static struct
 	[RTN_XRESOLVE] =    { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
 };
 
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force);
+static int dn_fib_sync_up(struct net_device *dev);
+
 void dn_fib_free_info(struct dn_fib_info *fi)
 {
 	if (fi->fib_dead == 0) {
@@ -651,7 +652,7 @@ static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event,
 	return NOTIFY_DONE;
 }
 
-int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
+static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
 {
         int ret = 0;
         int scope = RT_SCOPE_NOWHERE;
@@ -695,7 +696,7 @@ int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
 }
 
 
-int dn_fib_sync_up(struct net_device *dev)
+static int dn_fib_sync_up(struct net_device *dev)
 {
         int ret = 0;
 
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 977bb56c3ce..50e819edf8c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -75,8 +75,8 @@ int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
 	return err;
 }
 
-int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp, int flags,
-		       struct fib_lookup_arg *arg)
+static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
+			      int flags, struct fib_lookup_arg *arg)
 {
 	int err = -EAGAIN;
 	struct dn_fib_table *tbl;
-- 
cgit v1.2.3-70-g09d2


From 81aa646cc4df3779bcbf9d18cc2c0813ee9b3262 Mon Sep 17 00:00:00 2001
From: Martin Bligh <mbligh@google.com>
Date: Mon, 14 Aug 2006 23:57:10 -0700
Subject: [IPV4]: add the UdpSndbufErrors and UdpRcvbufErrors MIBs

Signed-off-by: Martin Bligh <mbligh@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 include/linux/snmp.h |  2 ++
 net/ipv4/proc.c      |  2 ++
 net/ipv4/udp.c       | 16 +++++++++++++++-
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 4db25d5c7cd..30156556f78 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -155,6 +155,8 @@ enum
 	UDP_MIB_NOPORTS,			/* NoPorts */
 	UDP_MIB_INERRORS,			/* InErrors */
 	UDP_MIB_OUTDATAGRAMS,			/* OutDatagrams */
+	UDP_MIB_RCVBUFERRORS,			/* RcvbufErrors */
+	UDP_MIB_SNDBUFERRORS,			/* SndbufErrors */
 	__UDP_MIB_MAX
 };
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index d61e2a9d394..9c6cbe3d9fb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -173,6 +173,8 @@ static const struct snmp_mib snmp4_udp_list[] = {
 	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
 	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
 	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 87152510980..514c1e9ae81 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -662,6 +662,16 @@ out:
 		UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS);
 		return len;
 	}
+	/*
+	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
+	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
+	 * we don't have a good statistic (IpOutDiscards but it can be too many
+	 * things).  We could add another new stat but at least for now that
+	 * seems like overkill.
+	 */
+	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		UDP_INC_STATS_USER(UDP_MIB_SNDBUFERRORS);
+	}
 	return err;
 
 do_confirm:
@@ -981,6 +991,7 @@ static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
 static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
+	int rc;
 
 	/*
 	 *	Charge it to the socket, dropping if the queue is full.
@@ -1027,7 +1038,10 @@ static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
-	if (sock_queue_rcv_skb(sk,skb)<0) {
+	if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
+		/* Note that an ENOMEM error is charged twice */
+		if (rc == -ENOMEM)
+			UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS);
 		UDP_INC_STATS_BH(UDP_MIB_INERRORS);
 		kfree_skb(skb);
 		return -1;
-- 
cgit v1.2.3-70-g09d2


From 2942e90050569525628a9f34e0daaa9b661b49cc Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 15 Aug 2006 00:30:25 -0700
Subject: [RTNETLINK]: Use rtnl_unicast() for rtnetlink unicasts

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  1 +
 net/core/rtnetlink.c      | 10 +++++++---
 net/decnet/dn_route.c     |  4 +---
 net/ipv4/ipmr.c           |  7 ++++---
 net/ipv4/route.c          |  7 +++----
 net/ipv6/addrconf.c       |  4 +---
 net/ipv6/route.c          |  4 +---
 net/sched/act_api.c       |  7 ++-----
 8 files changed, 20 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 7e4aa48680a..0e4f478e2cb 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -584,6 +584,7 @@ struct rtnetlink_link
 
 extern struct rtnetlink_link * rtnetlink_links[NPROTO];
 extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
+extern int rtnl_unicast(struct sk_buff *skb, u32 pid);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 
 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a1b783a6afc..e02fa6a33f4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -166,6 +166,11 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
 	return err;
 }
 
+int rtnl_unicast(struct sk_buff *skb, u32 pid)
+{
+	return nlmsg_unicast(rtnl, skb, pid);
+}
+
 int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 {
 	struct rtattr *mx = (struct rtattr*)skb->tail;
@@ -574,9 +579,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		goto errout;
 	}
 
-	err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = rtnl_unicast(skb, NETLINK_CB(skb).pid);
 errout:
 	kfree(iw_buf);
 	dev_put(dev);
@@ -825,3 +828,4 @@ EXPORT_SYMBOL(rtnl);
 EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
 EXPORT_SYMBOL(rtnl_unlock);
+EXPORT_SYMBOL(rtnl_unicast);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 4c963213fba..c5daf3557c1 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1611,9 +1611,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
 		goto out_free;
 	}
 
-	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-
-	return err;
+	return rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
 
 out_free:
 	kfree_skb(skb);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 85893eef6b1..98f0aa0d421 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -312,7 +312,8 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
 			e = NLMSG_DATA(nlh);
 			e->error = -ETIMEDOUT;
 			memset(&e->msg, 0, sizeof(e->msg));
-			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+
+			rtnl_unicast(skb, NETLINK_CB(skb).pid);
 		} else
 			kfree_skb(skb);
 	}
@@ -512,7 +513,6 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 
 	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
 		if (skb->nh.iph->version == 0) {
-			int err;
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 
 			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
@@ -525,7 +525,8 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
 				e->error = -EMSGSIZE;
 				memset(&e->msg, 0, sizeof(e->msg));
 			}
-			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+
+			rtnl_unicast(skb, NETLINK_CB(skb).pid);
 		} else
 			ip_mr_forward(skb, c, 0);
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12128b82c9d..b8f6cadc5b3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2809,10 +2809,9 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 		goto out_free;
 	}
 
-	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
-out:	return err;
+	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
+out:
+	return err;
 
 out_free:
 	kfree_skb(skb);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 9ba1e811ba5..4f991a2234d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3268,9 +3268,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb,
 		goto out_free;
 	}
 
-	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
 out:
 	in6_ifa_put(ifa);
 	return err;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9ce28277f47..024c8e26c2e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2044,9 +2044,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 		goto out_free;
 	}
 
-	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
+	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
 out:
 	return err;
 out_free:
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a2587b52e53..6990747d6d5 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -459,7 +459,6 @@ static int
 act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event)
 {
 	struct sk_buff *skb;
-	int err = 0;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb)
@@ -468,10 +467,8 @@ act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event)
 		kfree_skb(skb);
 		return -EINVAL;
 	}
-	err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
-	return err;
+
+	return rtnl_unicast(skb, pid);
 }
 
 static struct tc_action *
-- 
cgit v1.2.3-70-g09d2


From d387f6ad10764fc2174373b4a1cca443adee36e3 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 15 Aug 2006 00:31:06 -0700
Subject: [NETLINK]: Add notification message sending interface

Adds nlmsg_notify() implementing proper notification logic. The
message is multicasted to all listeners in the group. The
applications the requests orignates from can request a unicast
back report in which case said socket will be excluded from the
multicast to avoid duplicated notifications.

nlmsg_multicast() is extended to take allocation flags to
allow notification in atomic contexts.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h      |  5 +++--
 include/net/netlink.h        |  6 ++++--
 net/netlabel/netlabel_user.c |  2 +-
 net/netlink/af_netlink.c     | 34 +++++++++++++++++++++++++++++++++-
 net/netlink/genetlink.c      |  2 +-
 5 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 8c228726426..97d6d3aba9d 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -133,11 +133,12 @@ static inline int genlmsg_cancel(struct sk_buff *skb, void *hdr)
  * @skb: netlink message as socket buffer
  * @pid: own netlink pid to avoid sending to yourself
  * @group: multicast group id
+ * @flags: allocation flags
  */
 static inline int genlmsg_multicast(struct sk_buff *skb, u32 pid,
-				    unsigned int group)
+				    unsigned int group, gfp_t flags)
 {
-	return nlmsg_multicast(genl_sock, skb, pid, group);
+	return nlmsg_multicast(genl_sock, skb, pid, group, flags);
 }
 
 /**
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 3a5e40b1e04..b154b81d9a7 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -43,6 +43,7 @@
  * Message Sending:
  *   nlmsg_multicast()			multicast message to several groups
  *   nlmsg_unicast()			unicast a message to a single socket
+ *   nlmsg_notify()			send notification message
  *
  * Message Length Calculations:
  *   nlmsg_msg_size(payload)		length of message w/o padding
@@ -545,15 +546,16 @@ static inline void nlmsg_free(struct sk_buff *skb)
  * @skb: netlink message as socket buffer
  * @pid: own netlink pid to avoid sending to yourself
  * @group: multicast group id
+ * @flags: allocation flags
  */
 static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
-				  u32 pid, unsigned int group)
+				  u32 pid, unsigned int group, gfp_t flags)
 {
 	int err;
 
 	NETLINK_CB(skb).dst_group = group;
 
-	err = netlink_broadcast(sk, skb, pid, group, GFP_KERNEL);
+	err = netlink_broadcast(sk, skb, pid, group, flags);
 	if (err > 0)
 		err = 0;
 
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 80022221b0a..73cbe66e42f 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -154,5 +154,5 @@ int netlbl_netlink_snd(struct sk_buff *skb, u32 pid)
  */
 int netlbl_netlink_snd_multicast(struct sk_buff *skb, u32 pid, u32 group)
 {
-	return genlmsg_multicast(skb, pid, group);
+	return genlmsg_multicast(skb, pid, group, GFP_KERNEL);
 }
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0f36ddc0b72..a80e4456e20 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1549,6 +1549,38 @@ void netlink_queue_skip(struct nlmsghdr *nlh, struct sk_buff *skb)
 	skb_pull(skb, msglen);
 }
 
+/**
+ * nlmsg_notify - send a notification netlink message
+ * @sk: netlink socket to use
+ * @skb: notification message
+ * @pid: destination netlink pid for reports or 0
+ * @group: destination multicast group or 0
+ * @report: 1 to report back, 0 to disable
+ * @flags: allocation flags
+ */
+int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
+		 unsigned int group, int report, gfp_t flags)
+{
+	int err = 0;
+
+	if (group) {
+		int exclude_pid = 0;
+
+		if (report) {
+			atomic_inc(&skb->users);
+			exclude_pid = pid;
+		}
+
+		/* errors reported via destination sk->sk_err */
+		nlmsg_multicast(sk, skb, exclude_pid, group, flags);
+	}
+
+	if (report)
+		err = nlmsg_unicast(sk, skb, pid);
+
+	return err;
+}
+
 #ifdef CONFIG_PROC_FS
 struct nl_seq_iter {
 	int link;
@@ -1802,4 +1834,4 @@ EXPORT_SYMBOL(netlink_set_err);
 EXPORT_SYMBOL(netlink_set_nonroot);
 EXPORT_SYMBOL(netlink_unicast);
 EXPORT_SYMBOL(netlink_unregister_notifier);
-
+EXPORT_SYMBOL(nlmsg_notify);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 75bb47a898d..d32599116c5 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -510,7 +510,7 @@ static int genl_ctrl_event(int event, void *data)
 		if (IS_ERR(msg))
 			return PTR_ERR(msg);
 
-		genlmsg_multicast(msg, 0, GENL_ID_CTRL);
+		genlmsg_multicast(msg, 0, GENL_ID_CTRL, GFP_KERNEL);
 		break;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 97676b6b5538b3e059d33b8338e7d5cc41c5f1f1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 15 Aug 2006 00:31:41 -0700
Subject: [RTNETLINK]: Add rtnetlink notification interface

Adds rtnl_notify() to send rtnetlink notification messages and
rtnl_set_sk_err() to report notification errors as socket
errors in order to indicate the need of a resync due to loss
of events.

nlmsg_report() is added to properly document the meaning of
NLM_F_ECHO.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h |  3 +++
 include/net/netlink.h     | 17 +++++++++++++++++
 net/core/rtnetlink.c      | 18 ++++++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 0e4f478e2cb..ecbe0349060 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -585,6 +585,9 @@ struct rtnetlink_link
 extern struct rtnetlink_link * rtnetlink_links[NPROTO];
 extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
 extern int rtnl_unicast(struct sk_buff *skb, u32 pid);
+extern int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
+		       struct nlmsghdr *nlh, gfp_t flags);
+extern void rtnl_set_sk_err(u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 
 extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
diff --git a/include/net/netlink.h b/include/net/netlink.h
index b154b81d9a7..bf593eb59e1 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -65,6 +65,9 @@
  *   nlmsg_validate()			validate netlink message incl. attrs
  *   nlmsg_for_each_attr()		loop over all attributes
  *
+ * Misc:
+ *   nlmsg_report()			report back to application?
+ *
  * ------------------------------------------------------------------------
  *                          Attributes Interface
  * ------------------------------------------------------------------------
@@ -194,6 +197,9 @@ extern void		netlink_run_queue(struct sock *sk, unsigned int *qlen,
 						    struct nlmsghdr *, int *));
 extern void		netlink_queue_skip(struct nlmsghdr *nlh,
 					   struct sk_buff *skb);
+extern int		nlmsg_notify(struct sock *sk, struct sk_buff *skb,
+				     u32 pid, unsigned int group, int report,
+				     gfp_t flags);
 
 extern int		nla_validate(struct nlattr *head, int len, int maxtype,
 				     struct nla_policy *policy);
@@ -375,6 +381,17 @@ static inline int nlmsg_validate(struct nlmsghdr *nlh, int hdrlen, int maxtype,
 			    nlmsg_attrlen(nlh, hdrlen), maxtype, policy);
 }
 
+/**
+ * nlmsg_report - need to report back to application?
+ * @nlh: netlink message header
+ *
+ * Returns 1 if a report back to the application is requested.
+ */
+static inline int nlmsg_report(struct nlmsghdr *nlh)
+{
+	return !!(nlh->nlmsg_flags & NLM_F_ECHO);
+}
+
 /**
  * nlmsg_for_each_attr - iterate over a stream of attributes
  * @pos: loop counter, set to current attribute
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e02fa6a33f4..2b1af17e638 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -171,6 +171,22 @@ int rtnl_unicast(struct sk_buff *skb, u32 pid)
 	return nlmsg_unicast(rtnl, skb, pid);
 }
 
+int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
+		struct nlmsghdr *nlh, gfp_t flags)
+{
+	int report = 0;
+
+	if (nlh)
+		report = nlmsg_report(nlh);
+
+	return nlmsg_notify(rtnl, skb, pid, group, report, flags);
+}
+
+void rtnl_set_sk_err(u32 group, int error)
+{
+	netlink_set_err(rtnl, 0, group, error);
+}
+
 int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
 {
 	struct rtattr *mx = (struct rtattr*)skb->tail;
@@ -829,3 +845,5 @@ EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
 EXPORT_SYMBOL(rtnl_unlock);
 EXPORT_SYMBOL(rtnl_unicast);
+EXPORT_SYMBOL(rtnl_notify);
+EXPORT_SYMBOL(rtnl_set_sk_err);
-- 
cgit v1.2.3-70-g09d2


From 56fc85ac961e2c20dcb5ef07e2628b3f93de2e49 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 15 Aug 2006 00:37:29 -0700
Subject: [RTNETLINK]: Unexport rtnl socket

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 2 --
 net/core/rtnetlink.c      | 4 +---
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index ecbe0349060..9c92dc8b9a0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -574,8 +574,6 @@ extern int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, in
 #define rtattr_parse_nested(tb, max, rta) \
 	rtattr_parse((tb), (max), RTA_DATA((rta)), RTA_PAYLOAD((rta)))
 
-extern struct sock *rtnl;
-
 struct rtnetlink_link
 {
 	int (*doit)(struct sk_buff *, struct nlmsghdr*, void *attr);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f5300b5dd0f..dfc58269240 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -58,6 +58,7 @@
 #endif	/* CONFIG_NET_WIRELESS_RTNETLINK */
 
 static DEFINE_MUTEX(rtnl_mutex);
+static struct sock *rtnl;
 
 void rtnl_lock(void)
 {
@@ -95,8 +96,6 @@ int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len)
 	return 0;
 }
 
-struct sock *rtnl;
-
 struct rtnetlink_link * rtnetlink_links[NPROTO];
 
 static const int rtm_min[RTM_NR_FAMILIES] =
@@ -842,7 +841,6 @@ EXPORT_SYMBOL(rtattr_strlcpy);
 EXPORT_SYMBOL(rtattr_parse);
 EXPORT_SYMBOL(rtnetlink_links);
 EXPORT_SYMBOL(rtnetlink_put_metrics);
-EXPORT_SYMBOL(rtnl);
 EXPORT_SYMBOL(rtnl_lock);
 EXPORT_SYMBOL(rtnl_trylock);
 EXPORT_SYMBOL(rtnl_unlock);
-- 
cgit v1.2.3-70-g09d2


From 4e902c57417c4c285b98ba2722468d1c3ed83d1b Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 17 Aug 2006 18:14:52 -0700
Subject: [IPv4]: FIB configuration using struct fib_config

Introduces struct fib_config replacing the ugly struct kern_rta
prone to ordering issues. Avoids creating faked netlink messages
for auto generated routes or requests via ioctl.

A new interface net/nexthop.h is added to help navigate through
nexthop configuration arrays.

A new struct nl_info will be used to carry the necessary netlink
information to be used for notifications later on.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  55 +++----
 include/net/netlink.h    |  10 ++
 include/net/nexthop.h    |  33 ++++
 net/ipv4/fib_frontend.c  | 364 +++++++++++++++++++++++++++++++++++---------
 net/ipv4/fib_hash.c      |  94 ++++++------
 net/ipv4/fib_lookup.h    |  11 +-
 net/ipv4/fib_semantics.c | 385 ++++++++++++++---------------------------------
 net/ipv4/fib_trie.c      |  76 ++++------
 8 files changed, 560 insertions(+), 468 deletions(-)
 create mode 100644 include/net/nexthop.h

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 8e9ba563d34..42ed96fab3f 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -20,25 +20,32 @@
 #include <linux/seq_file.h>
 #include <net/fib_rules.h>
 
-/* WARNING: The ordering of these elements must match ordering
- *          of RTA_* rtnetlink attribute numbers.
- */
-struct kern_rta {
-	void		*rta_dst;
-	void		*rta_src;
-	int		*rta_iif;
-	int		*rta_oif;
-	void		*rta_gw;
-	u32		*rta_priority;
-	void		*rta_prefsrc;
-	struct rtattr	*rta_mx;
-	struct rtattr	*rta_mp;
-	unsigned char	*rta_protoinfo;
-	u32		*rta_flow;
-	struct rta_cacheinfo *rta_ci;
-	struct rta_session *rta_sess;
-	u32		*rta_mp_alg;
-};
+struct fib_config {
+	u8			fc_family;
+	u8			fc_dst_len;
+	u8			fc_src_len;
+	u8			fc_tos;
+	u8			fc_protocol;
+	u8			fc_scope;
+	u8			fc_type;
+	/* 1 byte unused */
+	u32			fc_table;
+	u32			fc_dst;
+	u32			fc_src;
+	u32			fc_gw;
+	int			fc_oif;
+	u32			fc_flags;
+	u32			fc_priority;
+	u32			fc_prefsrc;
+	struct nlattr		*fc_mx;
+	struct rtnexthop	*fc_mp;
+	int			fc_mx_len;
+	int			fc_mp_len;
+	u32			fc_flow;
+	u32			fc_mp_alg;
+	u32			fc_nlflags;
+	struct nl_info		fc_nlinfo;
+ };
 
 struct fib_info;
 
@@ -154,12 +161,8 @@ struct fib_table {
 	u32		tb_id;
 	unsigned	tb_stamp;
 	int		(*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res);
-	int		(*tb_insert)(struct fib_table *table, struct rtmsg *r,
-				     struct kern_rta *rta, struct nlmsghdr *n,
-				     struct netlink_skb_parms *req);
-	int		(*tb_delete)(struct fib_table *table, struct rtmsg *r,
-				     struct kern_rta *rta, struct nlmsghdr *n,
-				     struct netlink_skb_parms *req);
+	int		(*tb_insert)(struct fib_table *, struct fib_config *);
+	int		(*tb_delete)(struct fib_table *, struct fib_config *);
 	int		(*tb_dump)(struct fib_table *table, struct sk_buff *skb,
 				     struct netlink_callback *cb);
 	int		(*tb_flush)(struct fib_table *table);
@@ -228,8 +231,6 @@ struct rtentry;
 extern int ip_fib_check_default(u32 gw, struct net_device *dev);
 extern int fib_sync_down(u32 local, struct net_device *dev, int force);
 extern int fib_sync_up(struct net_device *dev);
-extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
-			       struct kern_rta *rta, struct rtentry *r);
 extern u32  __fib_res_prefsrc(struct fib_result *res);
 
 /* Exported by fib_hash.c */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index bf593eb59e1..47044da167c 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -192,6 +192,16 @@ struct nla_policy {
 	u16		minlen;
 };
 
+/**
+ * struct nl_info - netlink source information
+ * @nlh: Netlink message header of original request
+ * @pid: Netlink PID of requesting application
+ */
+struct nl_info {
+	struct nlmsghdr		*nlh;
+	u32			pid;
+};
+
 extern void		netlink_run_queue(struct sock *sk, unsigned int *qlen,
 					  int (*cb)(struct sk_buff *,
 						    struct nlmsghdr *, int *));
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
new file mode 100644
index 00000000000..3334dbfa5aa
--- /dev/null
+++ b/include/net/nexthop.h
@@ -0,0 +1,33 @@
+#ifndef __NET_NEXTHOP_H
+#define __NET_NEXTHOP_H
+
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+
+static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
+{
+	return remaining >= sizeof(*rtnh) &&
+	       rtnh->rtnh_len >= sizeof(*rtnh) &&
+	       rtnh->rtnh_len <= remaining;
+}
+
+static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
+                                         int *remaining)
+{
+	int totlen = NLA_ALIGN(rtnh->rtnh_len);
+
+	*remaining -= totlen;
+	return (struct rtnexthop *) ((char *) rtnh + totlen);
+}
+
+static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
+{
+	return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh)));
+}
+
+static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
+{
+	return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
+}
+
+#endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ad4c14f968a..acc18bdf2de 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -253,42 +253,190 @@ e_inval:
 
 #ifndef CONFIG_IP_NOSIOCRT
 
+static inline u32 sk_extract_addr(struct sockaddr *addr)
+{
+	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+}
+
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+	struct nlattr *nla;
+
+	nla = (struct nlattr *) ((char *) mx + len);
+	nla->nla_type = type;
+	nla->nla_len = nla_attr_size(4);
+	*(u32 *) nla_data(nla) = value;
+
+	return len + nla_total_size(4);
+}
+
+static int rtentry_to_fib_config(int cmd, struct rtentry *rt,
+				 struct fib_config *cfg)
+{
+	u32 addr;
+	int plen;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	if (rt->rt_dst.sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * Check mask for validity:
+	 * a) it must be contiguous.
+	 * b) destination must have all host bits clear.
+	 * c) if application forgot to set correct family (AF_INET),
+	 *    reject request unless it is absolutely clear i.e.
+	 *    both family and mask are zero.
+	 */
+	plen = 32;
+	addr = sk_extract_addr(&rt->rt_dst);
+	if (!(rt->rt_flags & RTF_HOST)) {
+		u32 mask = sk_extract_addr(&rt->rt_genmask);
+
+		if (rt->rt_genmask.sa_family != AF_INET) {
+			if (mask || rt->rt_genmask.sa_family)
+				return -EAFNOSUPPORT;
+		}
+
+		if (bad_mask(mask, addr))
+			return -EINVAL;
+
+		plen = inet_mask_len(mask);
+	}
+
+	cfg->fc_dst_len = plen;
+	cfg->fc_dst = addr;
+
+	if (cmd != SIOCDELRT) {
+		cfg->fc_nlflags = NLM_F_CREATE;
+		cfg->fc_protocol = RTPROT_BOOT;
+	}
+
+	if (rt->rt_metric)
+		cfg->fc_priority = rt->rt_metric - 1;
+
+	if (rt->rt_flags & RTF_REJECT) {
+		cfg->fc_scope = RT_SCOPE_HOST;
+		cfg->fc_type = RTN_UNREACHABLE;
+		return 0;
+	}
+
+	cfg->fc_scope = RT_SCOPE_NOWHERE;
+	cfg->fc_type = RTN_UNICAST;
+
+	if (rt->rt_dev) {
+		char *colon;
+		struct net_device *dev;
+		char devname[IFNAMSIZ];
+
+		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+			return -EFAULT;
+
+		devname[IFNAMSIZ-1] = 0;
+		colon = strchr(devname, ':');
+		if (colon)
+			*colon = 0;
+		dev = __dev_get_by_name(devname);
+		if (!dev)
+			return -ENODEV;
+		cfg->fc_oif = dev->ifindex;
+		if (colon) {
+			struct in_ifaddr *ifa;
+			struct in_device *in_dev = __in_dev_get_rtnl(dev);
+			if (!in_dev)
+				return -ENODEV;
+			*colon = ':';
+			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+				if (strcmp(ifa->ifa_label, devname) == 0)
+					break;
+			if (ifa == NULL)
+				return -ENODEV;
+			cfg->fc_prefsrc = ifa->ifa_local;
+		}
+	}
+
+	addr = sk_extract_addr(&rt->rt_gateway);
+	if (rt->rt_gateway.sa_family == AF_INET && addr) {
+		cfg->fc_gw = addr;
+		if (rt->rt_flags & RTF_GATEWAY &&
+		    inet_addr_type(addr) == RTN_UNICAST)
+			cfg->fc_scope = RT_SCOPE_UNIVERSE;
+	}
+
+	if (cmd == SIOCDELRT)
+		return 0;
+
+	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+		return -EINVAL;
+
+	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+		cfg->fc_scope = RT_SCOPE_LINK;
+
+	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+		struct nlattr *mx;
+		int len = 0;
+
+		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
+ 		if (mx == NULL)
+			return -ENOMEM;
+
+		if (rt->rt_flags & RTF_MTU)
+			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+		if (rt->rt_flags & RTF_WINDOW)
+			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+		if (rt->rt_flags & RTF_IRTT)
+			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+		cfg->fc_mx = mx;
+		cfg->fc_mx_len = len;
+	}
+
+	return 0;
+}
+
 /*
  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
  */
  
 int ip_rt_ioctl(unsigned int cmd, void __user *arg)
 {
+	struct fib_config cfg;
+	struct rtentry rt;
 	int err;
-	struct kern_rta rta;
-	struct rtentry  r;
-	struct {
-		struct nlmsghdr nlh;
-		struct rtmsg	rtm;
-	} req;
 
 	switch (cmd) {
 	case SIOCADDRT:		/* Add a route */
 	case SIOCDELRT:		/* Delete a route */
 		if (!capable(CAP_NET_ADMIN))
 			return -EPERM;
-		if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+
+		if (copy_from_user(&rt, arg, sizeof(rt)))
 			return -EFAULT;
+
 		rtnl_lock();
-		err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
+		err = rtentry_to_fib_config(cmd, &rt, &cfg);
 		if (err == 0) {
+			struct fib_table *tb;
+
 			if (cmd == SIOCDELRT) {
-				struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
-				err = -ESRCH;
+				tb = fib_get_table(cfg.fc_table);
 				if (tb)
-					err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+					err = tb->tb_delete(tb, &cfg);
+				else
+					err = -ESRCH;
 			} else {
-				struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
-				err = -ENOBUFS;
+				tb = fib_new_table(cfg.fc_table);
 				if (tb)
-					err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+					err = tb->tb_insert(tb, &cfg);
+				else
+					err = -ENOBUFS;
 			}
-			kfree(rta.rta_mx);
+
+			/* allocated by rtentry_to_fib_config() */
+			kfree(cfg.fc_mx);
 		}
 		rtnl_unlock();
 		return err;
@@ -305,51 +453,134 @@ int ip_rt_ioctl(unsigned int cmd, void *arg)
 
 #endif
 
-static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
+static struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = {
+	[RTA_DST]		= { .type = NLA_U32 },
+	[RTA_SRC]		= { .type = NLA_U32 },
+	[RTA_IIF]		= { .type = NLA_U32 },
+	[RTA_OIF]		= { .type = NLA_U32 },
+	[RTA_GATEWAY]		= { .type = NLA_U32 },
+	[RTA_PRIORITY]		= { .type = NLA_U32 },
+	[RTA_PREFSRC]		= { .type = NLA_U32 },
+	[RTA_METRICS]		= { .type = NLA_NESTED },
+	[RTA_MULTIPATH]		= { .minlen = sizeof(struct rtnexthop) },
+	[RTA_PROTOINFO]		= { .type = NLA_U32 },
+	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_MP_ALGO]		= { .type = NLA_U32 },
+};
+
+static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct fib_config *cfg)
 {
-	int i;
-
-	for (i=1; i<=RTA_MAX; i++, rta++) {
-		struct rtattr *attr = *rta;
-		if (attr) {
-			if (RTA_PAYLOAD(attr) < 4)
-				return -EINVAL;
-			if (i != RTA_MULTIPATH && i != RTA_METRICS &&
-			    i != RTA_TABLE)
-				*rta = (struct rtattr*)RTA_DATA(attr);
+	struct nlattr *attr;
+	int err, remaining;
+	struct rtmsg *rtm;
+
+	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+	if (err < 0)
+		goto errout;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	rtm = nlmsg_data(nlh);
+	cfg->fc_family = rtm->rtm_family;
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_src_len = rtm->rtm_src_len;
+	cfg->fc_tos = rtm->rtm_tos;
+	cfg->fc_table = rtm->rtm_table;
+	cfg->fc_protocol = rtm->rtm_protocol;
+	cfg->fc_scope = rtm->rtm_scope;
+	cfg->fc_type = rtm->rtm_type;
+	cfg->fc_flags = rtm->rtm_flags;
+	cfg->fc_nlflags = nlh->nlmsg_flags;
+
+	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.nlh = nlh;
+
+	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
+		switch (attr->nla_type) {
+		case RTA_DST:
+			cfg->fc_dst = nla_get_u32(attr);
+			break;
+		case RTA_SRC:
+			cfg->fc_src = nla_get_u32(attr);
+			break;
+		case RTA_OIF:
+			cfg->fc_oif = nla_get_u32(attr);
+			break;
+		case RTA_GATEWAY:
+			cfg->fc_gw = nla_get_u32(attr);
+			break;
+		case RTA_PRIORITY:
+			cfg->fc_priority = nla_get_u32(attr);
+			break;
+		case RTA_PREFSRC:
+			cfg->fc_prefsrc = nla_get_u32(attr);
+			break;
+		case RTA_METRICS:
+			cfg->fc_mx = nla_data(attr);
+			cfg->fc_mx_len = nla_len(attr);
+			break;
+		case RTA_MULTIPATH:
+			cfg->fc_mp = nla_data(attr);
+			cfg->fc_mp_len = nla_len(attr);
+			break;
+		case RTA_FLOW:
+			cfg->fc_flow = nla_get_u32(attr);
+			break;
+		case RTA_MP_ALGO:
+			cfg->fc_mp_alg = nla_get_u32(attr);
+			break;
+		case RTA_TABLE:
+			cfg->fc_table = nla_get_u32(attr);
+			break;
 		}
 	}
+
 	return 0;
+errout:
+	return err;
 }
 
 int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct fib_table * tb;
-	struct rtattr **rta = arg;
-	struct rtmsg *r = NLMSG_DATA(nlh);
+	struct fib_config cfg;
+	struct fib_table *tb;
+	int err;
 
-	if (inet_check_attr(r, rta))
-		return -EINVAL;
+	err = rtm_to_fib_config(skb, nlh, &cfg);
+	if (err < 0)
+		goto errout;
 
-	tb = fib_get_table(rtm_get_table(rta, r->rtm_table));
-	if (tb)
-		return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
-	return -ESRCH;
+	tb = fib_get_table(cfg.fc_table);
+	if (tb == NULL) {
+		err = -ESRCH;
+		goto errout;
+	}
+
+	err = tb->tb_delete(tb, &cfg);
+errout:
+	return err;
 }
 
 int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct fib_table * tb;
-	struct rtattr **rta = arg;
-	struct rtmsg *r = NLMSG_DATA(nlh);
+	struct fib_config cfg;
+	struct fib_table *tb;
+	int err;
 
-	if (inet_check_attr(r, rta))
-		return -EINVAL;
+	err = rtm_to_fib_config(skb, nlh, &cfg);
+	if (err < 0)
+		goto errout;
 
-	tb = fib_new_table(rtm_get_table(rta, r->rtm_table));
-	if (tb)
-		return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
-	return -ENOBUFS;
+	tb = fib_new_table(cfg.fc_table);
+	if (tb == NULL) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	err = tb->tb_insert(tb, &cfg);
+errout:
+	return err;
 }
 
 int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
@@ -396,17 +627,19 @@ out:
    only when netlink is already locked.
  */
 
-static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, u32 dst, int dst_len,
+		      struct in_ifaddr *ifa)
 {
-	struct fib_table * tb;
-	struct {
-		struct nlmsghdr	nlh;
-		struct rtmsg	rtm;
-	} req;
-	struct kern_rta rta;
-
-	memset(&req.rtm, 0, sizeof(req.rtm));
-	memset(&rta, 0, sizeof(rta));
+	struct fib_table *tb;
+	struct fib_config cfg = {
+		.fc_protocol = RTPROT_KERNEL,
+		.fc_type = type,
+		.fc_dst = dst,
+		.fc_dst_len = dst_len,
+		.fc_prefsrc = ifa->ifa_local,
+		.fc_oif = ifa->ifa_dev->dev->ifindex,
+		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+	};
 
 	if (type == RTN_UNICAST)
 		tb = fib_new_table(RT_TABLE_MAIN);
@@ -416,26 +649,17 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr
 	if (tb == NULL)
 		return;
 
-	req.nlh.nlmsg_len = sizeof(req);
-	req.nlh.nlmsg_type = cmd;
-	req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
-	req.nlh.nlmsg_pid = 0;
-	req.nlh.nlmsg_seq = 0;
+	cfg.fc_table = tb->tb_id;
 
-	req.rtm.rtm_dst_len = dst_len;
-	req.rtm.rtm_table = tb->tb_id;
-	req.rtm.rtm_protocol = RTPROT_KERNEL;
-	req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
-	req.rtm.rtm_type = type;
-
-	rta.rta_dst = &dst;
-	rta.rta_prefsrc = &ifa->ifa_local;
-	rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+	if (type != RTN_LOCAL)
+		cfg.fc_scope = RT_SCOPE_LINK;
+	else
+		cfg.fc_scope = RT_SCOPE_HOST;
 
 	if (cmd == RTM_NEWROUTE)
-		tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+		tb->tb_insert(tb, &cfg);
 	else
-		tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+		tb->tb_delete(tb, &cfg);
 }
 
 void fib_add_ifaddr(struct in_ifaddr *ifa)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index b5bee1a71e5..357557549ce 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -379,42 +379,39 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
 	return NULL;
 }
 
-static int
-fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fib_node *new_f, *f;
 	struct fib_alias *fa, *new_fa;
 	struct fn_zone *fz;
 	struct fib_info *fi;
-	int z = r->rtm_dst_len;
-	int type = r->rtm_type;
-	u8 tos = r->rtm_tos;
+	u8 tos = cfg->fc_tos;
 	u32 key;
 	int err;
 
-	if (z > 32)
+	if (cfg->fc_dst_len > 32)
 		return -EINVAL;
-	fz = table->fn_zones[z];
-	if (!fz && !(fz = fn_new_zone(table, z)))
+
+	fz = table->fn_zones[cfg->fc_dst_len];
+	if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
 		return -ENOBUFS;
 
 	key = 0;
-	if (rta->rta_dst) {
-		u32 dst;
-		memcpy(&dst, rta->rta_dst, 4);
-		if (dst & ~FZ_MASK(fz))
+	if (cfg->fc_dst) {
+		if (cfg->fc_dst & ~FZ_MASK(fz))
 			return -EINVAL;
-		key = fz_key(dst, fz);
+		key = fz_key(cfg->fc_dst, fz);
 	}
 
-	if  ((fi = fib_create_info(r, rta, n, &err)) == NULL)
-		return err;
+	fi = fib_create_info(cfg);
+	if (IS_ERR(fi))
+		return PTR_ERR(fi);
 
 	if (fz->fz_nent > (fz->fz_divisor<<1) &&
 	    fz->fz_divisor < FZ_MAX_DIVISOR &&
-	    (z==32 || (1<<z) > fz->fz_divisor))
+	    (cfg->fc_dst_len == 32 ||
+	     (1 << cfg->fc_dst_len) > fz->fz_divisor))
 		fn_rehash_zone(fz);
 
 	f = fib_find_node(fz, key);
@@ -440,18 +437,18 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		struct fib_alias *fa_orig;
 
 		err = -EEXIST;
-		if (n->nlmsg_flags & NLM_F_EXCL)
+		if (cfg->fc_nlflags & NLM_F_EXCL)
 			goto out;
 
-		if (n->nlmsg_flags & NLM_F_REPLACE) {
+		if (cfg->fc_nlflags & NLM_F_REPLACE) {
 			struct fib_info *fi_drop;
 			u8 state;
 
 			write_lock_bh(&fib_hash_lock);
 			fi_drop = fa->fa_info;
 			fa->fa_info = fi;
-			fa->fa_type = type;
-			fa->fa_scope = r->rtm_scope;
+			fa->fa_type = cfg->fc_type;
+			fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
 			fa->fa_state &= ~FA_S_ACCESSED;
 			fib_hash_genid++;
@@ -474,17 +471,17 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 				break;
 			if (fa->fa_info->fib_priority != fi->fib_priority)
 				break;
-			if (fa->fa_type == type &&
-			    fa->fa_scope == r->rtm_scope &&
+			if (fa->fa_type == cfg->fc_type &&
+			    fa->fa_scope == cfg->fc_scope &&
 			    fa->fa_info == fi)
 				goto out;
 		}
-		if (!(n->nlmsg_flags & NLM_F_APPEND))
+		if (!(cfg->fc_nlflags & NLM_F_APPEND))
 			fa = fa_orig;
 	}
 
 	err = -ENOENT;
-	if (!(n->nlmsg_flags&NLM_F_CREATE))
+	if (!(cfg->fc_nlflags & NLM_F_CREATE))
 		goto out;
 
 	err = -ENOBUFS;
@@ -506,8 +503,8 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	new_fa->fa_info = fi;
 	new_fa->fa_tos = tos;
-	new_fa->fa_type = type;
-	new_fa->fa_scope = r->rtm_scope;
+	new_fa->fa_type = cfg->fc_type;
+	new_fa->fa_scope = cfg->fc_scope;
 	new_fa->fa_state = 0;
 
 	/*
@@ -526,7 +523,8 @@ fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		fz->fz_nent++;
 	rt_cache_flush(-1);
 
-	rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
+	rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
+		  &cfg->fc_nlinfo);
 	return 0;
 
 out_free_new_fa:
@@ -537,30 +535,25 @@ out:
 }
 
 
-static int
-fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
 	struct fib_node *f;
 	struct fib_alias *fa, *fa_to_delete;
-	int z = r->rtm_dst_len;
 	struct fn_zone *fz;
 	u32 key;
-	u8 tos = r->rtm_tos;
 
-	if (z > 32)
+	if (cfg->fc_dst_len > 32)
 		return -EINVAL;
-	if ((fz  = table->fn_zones[z]) == NULL)
+
+	if ((fz  = table->fn_zones[cfg->fc_dst_len]) == NULL)
 		return -ESRCH;
 
 	key = 0;
-	if (rta->rta_dst) {
-		u32 dst;
-		memcpy(&dst, rta->rta_dst, 4);
-		if (dst & ~FZ_MASK(fz))
+	if (cfg->fc_dst) {
+		if (cfg->fc_dst & ~FZ_MASK(fz))
 			return -EINVAL;
-		key = fz_key(dst, fz);
+		key = fz_key(cfg->fc_dst, fz);
 	}
 
 	f = fib_find_node(fz, key);
@@ -568,7 +561,7 @@ fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	if (!f)
 		fa = NULL;
 	else
-		fa = fib_find_alias(&f->fn_alias, tos, 0);
+		fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
 	if (!fa)
 		return -ESRCH;
 
@@ -577,16 +570,16 @@ fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
 		struct fib_info *fi = fa->fa_info;
 
-		if (fa->fa_tos != tos)
+		if (fa->fa_tos != cfg->fc_tos)
 			break;
 
-		if ((!r->rtm_type ||
-		     fa->fa_type == r->rtm_type) &&
-		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
-		     fa->fa_scope == r->rtm_scope) &&
-		    (!r->rtm_protocol ||
-		     fi->fib_protocol == r->rtm_protocol) &&
-		    fib_nh_match(r, n, rta, fi) == 0) {
+		if ((!cfg->fc_type ||
+		     fa->fa_type == cfg->fc_type) &&
+		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_scope == cfg->fc_scope) &&
+		    (!cfg->fc_protocol ||
+		     fi->fib_protocol == cfg->fc_protocol) &&
+		    fib_nh_match(cfg, fi) == 0) {
 			fa_to_delete = fa;
 			break;
 		}
@@ -596,7 +589,8 @@ fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		int kill_fn;
 
 		fa = fa_to_delete;
-		rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
+		rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
+			  tb->tb_id, &cfg->fc_nlinfo);
 
 		kill_fn = 0;
 		write_lock_bh(&fib_hash_lock);
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ddd52496b45..d6d1a89e400 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -23,19 +23,14 @@ extern int fib_semantic_match(struct list_head *head,
 			      struct fib_result *res, __u32 zone, __u32 mask,
 				int prefixlen);
 extern void fib_release_info(struct fib_info *);
-extern struct fib_info *fib_create_info(const struct rtmsg *r,
-					struct kern_rta *rta,
-					const struct nlmsghdr *,
-					int *err);
-extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
-			struct kern_rta *rta, struct fib_info *fi);
+extern struct fib_info *fib_create_info(struct fib_config *cfg);
+extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
 extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 			 u32 tb_id, u8 type, u8 scope, void *dst,
 			 int dst_len, u8 tos, struct fib_info *fi,
 			 unsigned int);
 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
-		      int z, u32 tb_id,
-		      struct nlmsghdr *n, struct netlink_skb_parms *req);
+		      int dst_len, u32 tb_id, struct nl_info *info);
 extern struct fib_alias *fib_find_alias(struct list_head *fah,
 					u8 tos, u32 prio);
 extern int fib_detect_death(struct fib_info *fi, int order,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5dfdad5cbcd..340f9db389e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -44,6 +44,7 @@
 #include <net/ip_fib.h>
 #include <net/ip_mp_alg.h>
 #include <net/netlink.h>
+#include <net/nexthop.h>
 
 #include "fib_lookup.h"
 
@@ -273,27 +274,27 @@ int ip_fib_check_default(u32 gw, struct net_device *dev)
 }
 
 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
-	       int z, u32 tb_id,
-	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+	       int dst_len, u32 tb_id, struct nl_info *info)
 {
 	struct sk_buff *skb;
-	u32 pid = req ? req->pid : n->nlmsg_pid;
 	int payload = sizeof(struct rtmsg) + 256;
+	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 	int err = -ENOBUFS;
 
 	skb = nlmsg_new(nlmsg_total_size(payload), GFP_KERNEL);
 	if (skb == NULL)
 		goto errout;
 
-	err = fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
-			    fa->fa_type, fa->fa_scope, &key, z, fa->fa_tos,
-			    fa->fa_info, 0);
+	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
+			    fa->fa_type, fa->fa_scope, &key, dst_len,
+			    fa->fa_tos, fa->fa_info, 0);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto errout;
 	}
 
-	err = rtnl_notify(skb, pid, RTNLGRP_IPV4_ROUTE, n, GFP_KERNEL);
+	err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
+			  info->nlh, GFP_KERNEL);
 errout:
 	if (err < 0)
 		rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
@@ -342,102 +343,100 @@ int fib_detect_death(struct fib_info *fi, int order,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
-static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
-{
-	while (RTA_OK(attr,attrlen)) {
-		if (attr->rta_type == type)
-			return *(u32*)RTA_DATA(attr);
-		attr = RTA_NEXT(attr, attrlen);
-	}
-	return 0;
-}
-
-static int
-fib_count_nexthops(struct rtattr *rta)
+static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 {
 	int nhs = 0;
-	struct rtnexthop *nhp = RTA_DATA(rta);
-	int nhlen = RTA_PAYLOAD(rta);
 
-	while (nhlen >= (int)sizeof(struct rtnexthop)) {
-		if ((nhlen -= nhp->rtnh_len) < 0)
-			return 0;
+	while (rtnh_ok(rtnh, remaining)) {
 		nhs++;
-		nhp = RTNH_NEXT(nhp);
-	};
-	return nhs;
+		rtnh = rtnh_next(rtnh, &remaining);
+	}
+
+	/* leftover implies invalid nexthop configuration, discard it */
+	return remaining > 0 ? 0 : nhs;
 }
 
-static int
-fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+		       int remaining, struct fib_config *cfg)
 {
-	struct rtnexthop *nhp = RTA_DATA(rta);
-	int nhlen = RTA_PAYLOAD(rta);
-
 	change_nexthops(fi) {
-		int attrlen = nhlen - sizeof(struct rtnexthop);
-		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+		int attrlen;
+
+		if (!rtnh_ok(rtnh, remaining))
 			return -EINVAL;
-		nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
-		nh->nh_oif = nhp->rtnh_ifindex;
-		nh->nh_weight = nhp->rtnh_hops + 1;
-		if (attrlen) {
-			nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+
+		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+		nh->nh_oif = rtnh->rtnh_ifindex;
+		nh->nh_weight = rtnh->rtnh_hops + 1;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			nh->nh_gw = nla ? nla_get_u32(nla) : 0;
 #ifdef CONFIG_NET_CLS_ROUTE
-			nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
 #endif
 		}
-		nhp = RTNH_NEXT(nhp);
+
+		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
+
 	return 0;
 }
 
 #endif
 
-int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
-		 struct fib_info *fi)
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	struct rtnexthop *nhp;
-	int nhlen;
+	struct rtnexthop *rtnh;
+	int remaining;
 #endif
 
-	if (rta->rta_priority &&
-	    *rta->rta_priority != fi->fib_priority)
+	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 		return 1;
 
-	if (rta->rta_oif || rta->rta_gw) {
-		if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
-		    (!rta->rta_gw  || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
+	if (cfg->fc_oif || cfg->fc_gw) {
+		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
+		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
 			return 0;
 		return 1;
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (rta->rta_mp == NULL)
+	if (cfg->fc_mp == NULL)
 		return 0;
-	nhp = RTA_DATA(rta->rta_mp);
-	nhlen = RTA_PAYLOAD(rta->rta_mp);
+
+	rtnh = cfg->fc_mp;
+	remaining = cfg->fc_mp_len;
 	
 	for_nexthops(fi) {
-		int attrlen = nhlen - sizeof(struct rtnexthop);
-		u32 gw;
+		int attrlen;
 
-		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+		if (!rtnh_ok(rtnh, remaining))
 			return -EINVAL;
-		if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+
+		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
 			return 1;
-		if (attrlen) {
-			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
-			if (gw && gw != nh->nh_gw)
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen < 0) {
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+			if (nla && nla_get_u32(nla) != nh->nh_gw)
 				return 1;
 #ifdef CONFIG_NET_CLS_ROUTE
-			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
-			if (gw && gw != nh->nh_tclassid)
+			nla = nla_find(attrs, attrlen, RTA_FLOW);
+			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
 				return 1;
 #endif
 		}
-		nhp = RTNH_NEXT(nhp);
+
+		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
 #endif
 	return 0;
@@ -488,7 +487,8 @@ int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
 						|-> {local prefix} (terminal node)
  */
 
-static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
+static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
+			struct fib_nh *nh)
 {
 	int err;
 
@@ -502,7 +502,7 @@ static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_n
 		if (nh->nh_flags&RTNH_F_ONLINK) {
 			struct net_device *dev;
 
-			if (r->rtm_scope >= RT_SCOPE_LINK)
+			if (cfg->fc_scope >= RT_SCOPE_LINK)
 				return -EINVAL;
 			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
 				return -EINVAL;
@@ -516,10 +516,15 @@ static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_n
 			return 0;
 		}
 		{
-			struct flowi fl = { .nl_u = { .ip4_u =
-						      { .daddr = nh->nh_gw,
-							.scope = r->rtm_scope + 1 } },
-					    .oif = nh->nh_oif };
+			struct flowi fl = {
+				.nl_u = {
+					.ip4_u = {
+						.daddr = nh->nh_gw,
+						.scope = cfg->fc_scope + 1,
+					},
+				},
+				.oif = nh->nh_oif,
+			};
 
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl.fl4_scope < RT_SCOPE_LINK)
@@ -646,39 +651,28 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
 	fib_hash_free(old_laddrhash, bytes);
 }
 
-struct fib_info *
-fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
-		const struct nlmsghdr *nlh, int *errp)
+struct fib_info *fib_create_info(struct fib_config *cfg)
 {
 	int err;
 	struct fib_info *fi = NULL;
 	struct fib_info *ofi;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int nhs = 1;
-#else
-	const int nhs = 1;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	u32 mp_alg = IP_MP_ALG_NONE;
-#endif
 
 	/* Fast check to catch the most weird cases */
-	if (fib_props[r->rtm_type].scope > r->rtm_scope)
+	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
 		goto err_inval;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (rta->rta_mp) {
-		nhs = fib_count_nexthops(rta->rta_mp);
+	if (cfg->fc_mp) {
+		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
 		if (nhs == 0)
 			goto err_inval;
 	}
 #endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	if (rta->rta_mp_alg) {
-		mp_alg = *rta->rta_mp_alg;
-
-		if (mp_alg < IP_MP_ALG_NONE ||
-		    mp_alg > IP_MP_ALG_MAX)
+	if (cfg->fc_mp_alg) {
+		if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
+		    cfg->fc_mp_alg > IP_MP_ALG_MAX)
 			goto err_inval;
 	}
 #endif
@@ -714,43 +708,42 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
 		goto failure;
 	fib_info_cnt++;
 
-	fi->fib_protocol = r->rtm_protocol;
+	fi->fib_protocol = cfg->fc_protocol;
+	fi->fib_flags = cfg->fc_flags;
+	fi->fib_priority = cfg->fc_priority;
+	fi->fib_prefsrc = cfg->fc_prefsrc;
 
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nh->nh_parent = fi;
 	} endfor_nexthops(fi)
 
-	fi->fib_flags = r->rtm_flags;
-	if (rta->rta_priority)
-		fi->fib_priority = *rta->rta_priority;
-	if (rta->rta_mx) {
-		int attrlen = RTA_PAYLOAD(rta->rta_mx);
-		struct rtattr *attr = RTA_DATA(rta->rta_mx);
-
-		while (RTA_OK(attr, attrlen)) {
-			unsigned flavor = attr->rta_type;
-			if (flavor) {
-				if (flavor > RTAX_MAX)
+	if (cfg->fc_mx) {
+		struct nlattr *nla;
+		int remaining;
+
+		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+			int type = nla->nla_type;
+
+			if (type) {
+				if (type > RTAX_MAX)
 					goto err_inval;
-				fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
+				fi->fib_metrics[type - 1] = nla_get_u32(nla);
 			}
-			attr = RTA_NEXT(attr, attrlen);
 		}
 	}
-	if (rta->rta_prefsrc)
-		memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
 
-	if (rta->rta_mp) {
+	if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-		if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
+		if (err != 0)
 			goto failure;
-		if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
 			goto err_inval;
-		if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
+		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
 			goto err_inval;
 #ifdef CONFIG_NET_CLS_ROUTE
-		if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
+		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
 			goto err_inval;
 #endif
 #else
@@ -758,34 +751,32 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
 #endif
 	} else {
 		struct fib_nh *nh = fi->fib_nh;
-		if (rta->rta_oif)
-			nh->nh_oif = *rta->rta_oif;
-		if (rta->rta_gw)
-			memcpy(&nh->nh_gw, rta->rta_gw, 4);
+
+		nh->nh_oif = cfg->fc_oif;
+		nh->nh_gw = cfg->fc_gw;
+		nh->nh_flags = cfg->fc_flags;
 #ifdef CONFIG_NET_CLS_ROUTE
-		if (rta->rta_flow)
-			memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
+		nh->nh_tclassid = cfg->fc_flow;
 #endif
-		nh->nh_flags = r->rtm_flags;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		nh->nh_weight = 1;
 #endif
 	}
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	fi->fib_mp_alg = mp_alg;
+	fi->fib_mp_alg = cfg->fc_mp_alg;
 #endif
 
-	if (fib_props[r->rtm_type].error) {
-		if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+	if (fib_props[cfg->fc_type].error) {
+		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
 			goto err_inval;
 		goto link_it;
 	}
 
-	if (r->rtm_scope > RT_SCOPE_HOST)
+	if (cfg->fc_scope > RT_SCOPE_HOST)
 		goto err_inval;
 
-	if (r->rtm_scope == RT_SCOPE_HOST) {
+	if (cfg->fc_scope == RT_SCOPE_HOST) {
 		struct fib_nh *nh = fi->fib_nh;
 
 		/* Local address is added. */
@@ -798,14 +789,14 @@ fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
 			goto failure;
 	} else {
 		change_nexthops(fi) {
-			if ((err = fib_check_nh(r, fi, nh)) != 0)
+			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
 				goto failure;
 		} endfor_nexthops(fi)
 	}
 
 	if (fi->fib_prefsrc) {
-		if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
-		    memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
+		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
+		    fi->fib_prefsrc != cfg->fc_dst)
 			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
 				goto err_inval;
 	}
@@ -846,12 +837,12 @@ err_inval:
 	err = -EINVAL;
 
 failure:
-        *errp = err;
         if (fi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
 	}
-	return NULL;
+
+	return ERR_PTR(err);
 }
 
 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
@@ -1012,150 +1003,6 @@ rtattr_failure:
 	return -1;
 }
 
-#ifndef CONFIG_IP_NOSIOCRT
-
-int
-fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
-		    struct kern_rta *rta, struct rtentry *r)
-{
-	int    plen;
-	u32    *ptr;
-
-	memset(rtm, 0, sizeof(*rtm));
-	memset(rta, 0, sizeof(*rta));
-
-	if (r->rt_dst.sa_family != AF_INET)
-		return -EAFNOSUPPORT;
-
-	/* Check mask for validity:
-	   a) it must be contiguous.
-	   b) destination must have all host bits clear.
-	   c) if application forgot to set correct family (AF_INET),
-	      reject request unless it is absolutely clear i.e.
-	      both family and mask are zero.
-	 */
-	plen = 32;
-	ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
-	if (!(r->rt_flags&RTF_HOST)) {
-		u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
-		if (r->rt_genmask.sa_family != AF_INET) {
-			if (mask || r->rt_genmask.sa_family)
-				return -EAFNOSUPPORT;
-		}
-		if (bad_mask(mask, *ptr))
-			return -EINVAL;
-		plen = inet_mask_len(mask);
-	}
-
-	nl->nlmsg_flags = NLM_F_REQUEST;
-	nl->nlmsg_pid = 0;
-	nl->nlmsg_seq = 0;
-	nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
-	if (cmd == SIOCDELRT) {
-		nl->nlmsg_type = RTM_DELROUTE;
-		nl->nlmsg_flags = 0;
-	} else {
-		nl->nlmsg_type = RTM_NEWROUTE;
-		nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
-		rtm->rtm_protocol = RTPROT_BOOT;
-	}
-
-	rtm->rtm_dst_len = plen;
-	rta->rta_dst = ptr;
-
-	if (r->rt_metric) {
-		*(u32*)&r->rt_pad3 = r->rt_metric - 1;
-		rta->rta_priority = (u32*)&r->rt_pad3;
-	}
-	if (r->rt_flags&RTF_REJECT) {
-		rtm->rtm_scope = RT_SCOPE_HOST;
-		rtm->rtm_type = RTN_UNREACHABLE;
-		return 0;
-	}
-	rtm->rtm_scope = RT_SCOPE_NOWHERE;
-	rtm->rtm_type = RTN_UNICAST;
-
-	if (r->rt_dev) {
-		char *colon;
-		struct net_device *dev;
-		char   devname[IFNAMSIZ];
-
-		if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
-			return -EFAULT;
-		devname[IFNAMSIZ-1] = 0;
-		colon = strchr(devname, ':');
-		if (colon)
-			*colon = 0;
-		dev = __dev_get_by_name(devname);
-		if (!dev)
-			return -ENODEV;
-		rta->rta_oif = &dev->ifindex;
-		if (colon) {
-			struct in_ifaddr *ifa;
-			struct in_device *in_dev = __in_dev_get_rtnl(dev);
-			if (!in_dev)
-				return -ENODEV;
-			*colon = ':';
-			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
-				if (strcmp(ifa->ifa_label, devname) == 0)
-					break;
-			if (ifa == NULL)
-				return -ENODEV;
-			rta->rta_prefsrc = &ifa->ifa_local;
-		}
-	}
-
-	ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
-	if (r->rt_gateway.sa_family == AF_INET && *ptr) {
-		rta->rta_gw = ptr;
-		if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
-			rtm->rtm_scope = RT_SCOPE_UNIVERSE;
-	}
-
-	if (cmd == SIOCDELRT)
-		return 0;
-
-	if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
-		return -EINVAL;
-
-	if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
-		rtm->rtm_scope = RT_SCOPE_LINK;
-
-	if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
-		struct rtattr *rec;
-		struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
-		if (mx == NULL)
-			return -ENOMEM;
-		rta->rta_mx = mx;
-		mx->rta_type = RTA_METRICS;
-		mx->rta_len  = RTA_LENGTH(0);
-		if (r->rt_flags&RTF_MTU) {
-			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
-			rec->rta_type = RTAX_ADVMSS;
-			rec->rta_len = RTA_LENGTH(4);
-			mx->rta_len += RTA_LENGTH(4);
-			*(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
-		}
-		if (r->rt_flags&RTF_WINDOW) {
-			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
-			rec->rta_type = RTAX_WINDOW;
-			rec->rta_len = RTA_LENGTH(4);
-			mx->rta_len += RTA_LENGTH(4);
-			*(u32*)RTA_DATA(rec) = r->rt_window;
-		}
-		if (r->rt_flags&RTF_IRTT) {
-			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
-			rec->rta_type = RTAX_RTT;
-			rec->rta_len = RTA_LENGTH(4);
-			mx->rta_len += RTA_LENGTH(4);
-			*(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
-		}
-	}
-	return 0;
-}
-
-#endif
-
 /*
    Update FIB if:
    - local address disappeared -> we must delete all the entries
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 2a580eb2579..41bef0a88ab 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1124,17 +1124,14 @@ err:
 	return fa_head;
 }
 
-static int
-fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct fib_alias *fa, *new_fa;
 	struct list_head *fa_head = NULL;
 	struct fib_info *fi;
-	int plen = r->rtm_dst_len;
-	int type = r->rtm_type;
-	u8 tos = r->rtm_tos;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
 	u32 key, mask;
 	int err;
 	struct leaf *l;
@@ -1142,11 +1139,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 	if (plen > 32)
 		return -EINVAL;
 
-	key = 0;
-	if (rta->rta_dst)
-		memcpy(&key, rta->rta_dst, 4);
-
-	key = ntohl(key);
+	key = ntohl(cfg->fc_dst);
 
 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
 
@@ -1157,10 +1150,11 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	key = key & mask;
 
-	fi = fib_create_info(r, rta, nlhdr, &err);
-
-	if (!fi)
+	fi = fib_create_info(cfg);
+	if (IS_ERR(fi)) {
+		err = PTR_ERR(fi);
 		goto err;
+	}
 
 	l = fib_find_node(t, key);
 	fa = NULL;
@@ -1185,10 +1179,10 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		struct fib_alias *fa_orig;
 
 		err = -EEXIST;
-		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
+		if (cfg->fc_nlflags & NLM_F_EXCL)
 			goto out;
 
-		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
+		if (cfg->fc_nlflags & NLM_F_REPLACE) {
 			struct fib_info *fi_drop;
 			u8 state;
 
@@ -1200,8 +1194,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			fi_drop = fa->fa_info;
 			new_fa->fa_tos = fa->fa_tos;
 			new_fa->fa_info = fi;
-			new_fa->fa_type = type;
-			new_fa->fa_scope = r->rtm_scope;
+			new_fa->fa_type = cfg->fc_type;
+			new_fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
 			new_fa->fa_state &= ~FA_S_ACCESSED;
 
@@ -1224,17 +1218,17 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 				break;
 			if (fa->fa_info->fib_priority != fi->fib_priority)
 				break;
-			if (fa->fa_type == type &&
-			    fa->fa_scope == r->rtm_scope &&
+			if (fa->fa_type == cfg->fc_type &&
+			    fa->fa_scope == cfg->fc_scope &&
 			    fa->fa_info == fi) {
 				goto out;
 			}
 		}
-		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
+		if (!(cfg->fc_nlflags & NLM_F_APPEND))
 			fa = fa_orig;
 	}
 	err = -ENOENT;
-	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
+	if (!(cfg->fc_nlflags & NLM_F_CREATE))
 		goto out;
 
 	err = -ENOBUFS;
@@ -1244,8 +1238,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 
 	new_fa->fa_info = fi;
 	new_fa->fa_tos = tos;
-	new_fa->fa_type = type;
-	new_fa->fa_scope = r->rtm_scope;
+	new_fa->fa_type = cfg->fc_type;
+	new_fa->fa_scope = cfg->fc_scope;
 	new_fa->fa_state = 0;
 	/*
 	 * Insert new entry to the list.
@@ -1262,7 +1256,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 			  (fa ? &fa->fa_list : fa_head));
 
 	rt_cache_flush(-1);
-	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
+	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo);
 succeeded:
 	return 0;
 
@@ -1548,28 +1543,21 @@ static int trie_leaf_remove(struct trie *t, t_key key)
 	return 1;
 }
 
-static int
-fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
-		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	u32 key, mask;
-	int plen = r->rtm_dst_len;
-	u8 tos = r->rtm_tos;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
 	struct fib_alias *fa, *fa_to_delete;
 	struct list_head *fa_head;
 	struct leaf *l;
 	struct leaf_info *li;
 
-
 	if (plen > 32)
 		return -EINVAL;
 
-	key = 0;
-	if (rta->rta_dst)
-		memcpy(&key, rta->rta_dst, 4);
-
-	key = ntohl(key);
+	key = ntohl(cfg->fc_dst);
 	mask = ntohl(inet_make_mask(plen));
 
 	if (key & ~mask)
@@ -1598,13 +1586,12 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		if (fa->fa_tos != tos)
 			break;
 
-		if ((!r->rtm_type ||
-		     fa->fa_type == r->rtm_type) &&
-		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
-		     fa->fa_scope == r->rtm_scope) &&
-		    (!r->rtm_protocol ||
-		     fi->fib_protocol == r->rtm_protocol) &&
-		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
+		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_scope == cfg->fc_scope) &&
+		    (!cfg->fc_protocol ||
+		     fi->fib_protocol == cfg->fc_protocol) &&
+		    fib_nh_match(cfg, fi) == 0) {
 			fa_to_delete = fa;
 			break;
 		}
@@ -1614,7 +1601,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
 		return -ESRCH;
 
 	fa = fa_to_delete;
-	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo);
 
 	l = fib_find_node(t, key);
 	li = find_leaf_info(l, plen);
-- 
cgit v1.2.3-70-g09d2


From d889ce3b29e55b91257964b4c9aac70b91fedd91 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 17 Aug 2006 18:15:44 -0700
Subject: [IPv4]: Convert route get to new netlink api

Fixes various unvalidated netlink attributes causing memory
corruptions when left empty by userspace applications.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  1 +
 net/ipv4/fib_frontend.c |  2 +-
 net/ipv4/route.c        | 84 ++++++++++++++++++++++++++-----------------------
 3 files changed, 47 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 42ed96fab3f..fcc159a4ac1 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -216,6 +216,7 @@ extern void fib_select_default(const struct flowi *flp, struct fib_result *res);
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 /* Exported by fib_frontend.c */
+extern struct nla_policy rtm_ipv4_policy[];
 extern void		ip_fib_init(void);
 extern int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d537c933abe..d0abeab16e6 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -453,7 +453,7 @@ int ip_rt_ioctl(unsigned int cmd, void *arg)
 
 #endif
 
-static struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = {
+struct nla_policy rtm_ipv4_policy[RTA_MAX+1] __read_mostly = {
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 31b67059ac2..a4d4cb85a16 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2737,18 +2737,24 @@ nla_put_failure:
 
 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct rtattr **rta = arg;
-	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct rtmsg *rtm;
+	struct nlattr *tb[RTA_MAX+1];
 	struct rtable *rt = NULL;
-	u32 dst = 0;
-	u32 src = 0;
-	int iif = 0;
-	int err = -ENOBUFS;
+	u32 dst, src, iif;
+	int err;
 	struct sk_buff *skb;
 
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
+	if (err < 0)
+		goto errout;
+
+	rtm = nlmsg_data(nlh);
+
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb)
-		goto out;
+	if (skb == NULL) {
+		err = -ENOBUFS;
+		goto errout;
+	}
 
 	/* Reserve room for dummy headers, this skb can pass
 	   through good chunk of routing engine.
@@ -2759,61 +2765,61 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 	skb->nh.iph->protocol = IPPROTO_ICMP;
 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
 
-	if (rta[RTA_SRC - 1])
-		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
-	if (rta[RTA_DST - 1])
-		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
-	if (rta[RTA_IIF - 1])
-		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
+	src = tb[RTA_SRC] ? nla_get_u32(tb[RTA_SRC]) : 0;
+	dst = tb[RTA_DST] ? nla_get_u32(tb[RTA_DST]) : 0;
+	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
 
 	if (iif) {
-		struct net_device *dev = __dev_get_by_index(iif);
-		err = -ENODEV;
-		if (!dev)
-			goto out_free;
+		struct net_device *dev;
+
+		dev = __dev_get_by_index(iif);
+		if (dev == NULL) {
+			err = -ENODEV;
+			goto errout_free;
+		}
+
 		skb->protocol	= htons(ETH_P_IP);
 		skb->dev	= dev;
 		local_bh_disable();
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
-		rt = (struct rtable*)skb->dst;
-		if (!err && rt->u.dst.error)
+
+		rt = (struct rtable*) skb->dst;
+		if (err == 0 && rt->u.dst.error)
 			err = -rt->u.dst.error;
 	} else {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
-							 .saddr = src,
-							 .tos = rtm->rtm_tos } } };
-		int oif = 0;
-		if (rta[RTA_OIF - 1])
-			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
-		fl.oif = oif;
+		struct flowi fl = {
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = src,
+					.tos = rtm->rtm_tos,
+				},
+			},
+			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+		};
 		err = ip_route_output_key(&rt, &fl);
 	}
+
 	if (err)
-		goto out_free;
+		goto errout_free;
 
 	skb->dst = &rt->u.dst;
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
-
 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
 				RTM_NEWROUTE, 0, 0);
-	if (!err)
-		goto out_free;
-	if (err < 0) {
-		err = -EMSGSIZE;
-		goto out_free;
-	}
+	if (err <= 0)
+		goto errout_free;
 
 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
-out:
+errout:
 	return err;
 
-out_free:
+errout_free:
 	kfree_skb(skb);
-	goto out;
+	goto errout;
 }
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
-- 
cgit v1.2.3-70-g09d2


From e9ce1cd3cf6cf35b21d0ce990f2e738f35907386 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 21 Aug 2006 23:54:55 -0700
Subject: [PKT_SCHED]: Kill pkt_act.h inlining.

This was simply making templates of functions and mostly causing a lot
of code duplication in the classifier action modules.

We solve this more cleanly by having a common "struct tcf_common" that
hash worker functions contained once in act_api.c can work with.

Callers work with real action objects that have the common struct
plus their module specific struct members.  You go from a common
object to the higher level one using a "to_foo()" macro which makes
use of container_of() to do the dirty work.

This also kills off act_generic.h which was only used by act_simple.c
and keeping it around was more work than the it's value.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h          | 136 +++++++----
 include/net/act_generic.h      | 142 ------------
 include/net/pkt_act.h          | 273 ----------------------
 include/net/tc_act/tc_defact.h |  13 +-
 include/net/tc_act/tc_gact.h   |  18 +-
 include/net/tc_act/tc_ipt.h    |  15 +-
 include/net/tc_act/tc_mirred.h |  17 +-
 include/net/tc_act/tc_pedit.h  |  15 +-
 net/sched/act_api.c            | 246 ++++++++++++++++++--
 net/sched/act_gact.c           | 142 ++++++------
 net/sched/act_ipt.c            | 175 +++++++-------
 net/sched/act_mirred.c         | 159 ++++++-------
 net/sched/act_pedit.c          | 166 ++++++--------
 net/sched/act_police.c         | 508 +++++++++++++++++++++--------------------
 net/sched/act_simple.c         | 183 ++++++++++++---
 15 files changed, 1061 insertions(+), 1147 deletions(-)
 delete mode 100644 include/net/act_generic.h
 delete mode 100644 include/net/pkt_act.h

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 11e9eaf79f5..8b06c2f3657 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -8,70 +8,110 @@
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 
-#define tca_gen(name) \
-struct tcf_##name *next; \
-	u32 index; \
-	int refcnt; \
-	int bindcnt; \
-	u32 capab; \
-	int action; \
-	struct tcf_t tm; \
-	struct gnet_stats_basic bstats; \
-	struct gnet_stats_queue qstats; \
-	struct gnet_stats_rate_est rate_est; \
-	spinlock_t *stats_lock; \
-	spinlock_t lock
-
-struct tcf_police
-{
-	tca_gen(police);
-	int		result;
-	u32		ewma_rate;
-	u32		burst;
-	u32		mtu;
-	u32		toks;
-	u32		ptoks;
-	psched_time_t	t_c;
-	struct qdisc_rate_table *R_tab;
-	struct qdisc_rate_table *P_tab;
+struct tcf_common {
+	struct tcf_common		*tcfc_next;
+	u32				tcfc_index;
+	int				tcfc_refcnt;
+	int				tcfc_bindcnt;
+	u32				tcfc_capab;
+	int				tcfc_action;
+	struct tcf_t			tcfc_tm;
+	struct gnet_stats_basic		tcfc_bstats;
+	struct gnet_stats_queue		tcfc_qstats;
+	struct gnet_stats_rate_est	tcfc_rate_est;
+	spinlock_t			*tcfc_stats_lock;
+	spinlock_t			tcfc_lock;
+};
+#define tcf_next	common.tcfc_next
+#define tcf_index	common.tcfc_index
+#define tcf_refcnt	common.tcfc_refcnt
+#define tcf_bindcnt	common.tcfc_bindcnt
+#define tcf_capab	common.tcfc_capab
+#define tcf_action	common.tcfc_action
+#define tcf_tm		common.tcfc_tm
+#define tcf_bstats	common.tcfc_bstats
+#define tcf_qstats	common.tcfc_qstats
+#define tcf_rate_est	common.tcfc_rate_est
+#define tcf_stats_lock	common.tcfc_stats_lock
+#define tcf_lock	common.tcfc_lock
+
+struct tcf_police {
+	struct tcf_common	common;
+	int			tcfp_result;
+	u32			tcfp_ewma_rate;
+	u32			tcfp_burst;
+	u32			tcfp_mtu;
+	u32			tcfp_toks;
+	u32			tcfp_ptoks;
+	psched_time_t		tcfp_t_c;
+	struct qdisc_rate_table	*tcfp_R_tab;
+	struct qdisc_rate_table	*tcfp_P_tab;
 };
+#define to_police(pc)	\
+	container_of(pc, struct tcf_police, common)
+
+struct tcf_hashinfo {
+	struct tcf_common	**htab;
+	unsigned int		hmask;
+	rwlock_t		*lock;
+};
+
+static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
+{
+	return index & hmask;
+}
 
 #ifdef CONFIG_NET_CLS_ACT
 
 #define ACT_P_CREATED 1
 #define ACT_P_DELETED 1
 
-struct tcf_act_hdr
-{
-	tca_gen(act_hdr);
+struct tcf_act_hdr {
+	struct tcf_common	common;
 };
 
-struct tc_action
-{
-	void *priv;
-	struct tc_action_ops *ops;
-	__u32   type;   /* for backward compat(TCA_OLD_COMPAT) */
-	__u32   order; 
-	struct tc_action *next;
+struct tc_action {
+	void			*priv;
+	struct tc_action_ops	*ops;
+	__u32			type; /* for backward compat(TCA_OLD_COMPAT) */
+	__u32			order;
+	struct tc_action	*next;
 };
 
 #define TCA_CAP_NONE 0
-struct tc_action_ops
-{
+struct tc_action_ops {
 	struct tc_action_ops *next;
+	struct tcf_hashinfo *hinfo;
 	char    kind[IFNAMSIZ];
 	__u32   type; /* TBD to match kind */
 	__u32 	capab;  /* capabilities includes 4 bit version */
 	struct module		*owner;
 	int     (*act)(struct sk_buff *, struct tc_action *, struct tcf_result *);
 	int     (*get_stats)(struct sk_buff *, struct tc_action *);
-	int     (*dump)(struct sk_buff *, struct tc_action *,int , int);
+	int     (*dump)(struct sk_buff *, struct tc_action *, int, int);
 	int     (*cleanup)(struct tc_action *, int bind);
-	int     (*lookup)(struct tc_action *, u32 );
-	int     (*init)(struct rtattr *,struct rtattr *,struct tc_action *, int , int );
-	int     (*walk)(struct sk_buff *, struct netlink_callback *, int , struct tc_action *);
+	int     (*lookup)(struct tc_action *, u32);
+	int     (*init)(struct rtattr *, struct rtattr *, struct tc_action *, int , int);
+	int     (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *);
 };
 
+extern struct tcf_common *tcf_hash_lookup(u32 index,
+					  struct tcf_hashinfo *hinfo);
+extern void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo);
+extern int tcf_hash_release(struct tcf_common *p, int bind,
+			    struct tcf_hashinfo *hinfo);
+extern int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+			      int type, struct tc_action *a);
+extern u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo);
+extern int tcf_hash_search(struct tc_action *a, u32 index);
+extern struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a,
+					 int bind, struct tcf_hashinfo *hinfo);
+extern struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est,
+					  struct tc_action *a, int size,
+					  int bind, u32 *idx_gen,
+					  struct tcf_hashinfo *hinfo);
+extern void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo);
+
 extern int tcf_register_action(struct tc_action_ops *a);
 extern int tcf_unregister_action(struct tc_action_ops *a);
 extern void tcf_action_destroy(struct tc_action *a, int bind);
@@ -96,17 +136,17 @@ tcf_police_release(struct tcf_police *p, int bind)
 	int ret = 0;
 #ifdef CONFIG_NET_CLS_ACT
 	if (p) {
-		if (bind) {
-			 p->bindcnt--;
-		}
-		p->refcnt--;
-		if (p->refcnt <= 0 && !p->bindcnt) {
+		if (bind)
+			p->tcf_bindcnt--;
+
+		p->tcf_refcnt--;
+		if (p->tcf_refcnt <= 0 && !p->tcf_bindcnt) {
 			tcf_police_destroy(p);
 			ret = 1;
 		}
 	}
 #else
-	if (p && --p->refcnt == 0)
+	if (p && --p->tcf_refcnt == 0)
 		tcf_police_destroy(p);
 
 #endif /* CONFIG_NET_CLS_ACT */
diff --git a/include/net/act_generic.h b/include/net/act_generic.h
deleted file mode 100644
index c9daa7e5230..00000000000
--- a/include/net/act_generic.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * include/net/act_generic.h
- *
-*/
-#ifndef _NET_ACT_GENERIC_H
-#define _NET_ACT_GENERIC_H
-static inline int tcf_defact_release(struct tcf_defact *p, int bind)
-{
-	int ret = 0;
-	if (p) {
-		if (bind) {
-			p->bindcnt--;
-		}
-		p->refcnt--;
-		if (p->bindcnt <= 0 && p->refcnt <= 0) {
-			kfree(p->defdata);
-			tcf_hash_destroy(p);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
-static inline int
-alloc_defdata(struct tcf_defact *p, u32 datalen, void *defdata)
-{
-	p->defdata = kmalloc(datalen, GFP_KERNEL);
-	if (p->defdata == NULL)
-		return -ENOMEM;
-	p->datalen = datalen;
-	memcpy(p->defdata, defdata, datalen);
-	return 0;
-}
-
-static inline int
-realloc_defdata(struct tcf_defact *p, u32 datalen, void *defdata)
-{
-	/* safer to be just brute force for now */
-	kfree(p->defdata);
-	return alloc_defdata(p, datalen, defdata);
-}
-
-static inline int
-tcf_defact_init(struct rtattr *rta, struct rtattr *est,
-		struct tc_action *a, int ovr, int bind)
-{
-	struct rtattr *tb[TCA_DEF_MAX];
-	struct tc_defact *parm;
-	struct tcf_defact *p;
-	void *defdata;
-	u32 datalen = 0;
-	int ret = 0;
-
-	if (rta == NULL || rtattr_parse_nested(tb, TCA_DEF_MAX, rta) < 0)
-		return -EINVAL;
-
-	if (tb[TCA_DEF_PARMS - 1] == NULL || 
-	    RTA_PAYLOAD(tb[TCA_DEF_PARMS - 1]) < sizeof(*parm))
-		return -EINVAL;
-
-	parm = RTA_DATA(tb[TCA_DEF_PARMS - 1]);
-	defdata = RTA_DATA(tb[TCA_DEF_DATA - 1]);
-	if (defdata == NULL)
-		return -EINVAL;
-
-	datalen = RTA_PAYLOAD(tb[TCA_DEF_DATA - 1]);
-	if (datalen <= 0)
-		return -EINVAL;
-
-	p = tcf_hash_check(parm->index, a, ovr, bind);
-	if (p == NULL) {
-		p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
-		if (p == NULL)
-			return -ENOMEM;
-
-		ret = alloc_defdata(p, datalen, defdata);
-		if (ret < 0) {
-			kfree(p);
-			return ret;
-		}
-		ret = ACT_P_CREATED;
-	} else {
-		if (!ovr) {
-			tcf_defact_release(p, bind);
-			return -EEXIST;
-		}
-		realloc_defdata(p, datalen, defdata);
-	}
-
-	spin_lock_bh(&p->lock);
-	p->action = parm->action;
-	spin_unlock_bh(&p->lock);
-	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(p);
-	return ret;
-}
-
-static inline int tcf_defact_cleanup(struct tc_action *a, int bind)
-{
-	struct tcf_defact *p = PRIV(a, defact);
-
-	if (p != NULL)
-		return tcf_defact_release(p, bind);
-	return 0;
-}
-
-static inline int
-tcf_defact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
-{
-	unsigned char *b = skb->tail;
-	struct tc_defact opt;
-	struct tcf_defact *p = PRIV(a, defact);
-	struct tcf_t t;
-
-	opt.index = p->index;
-	opt.refcnt = p->refcnt - ref;
-	opt.bindcnt = p->bindcnt - bind;
-	opt.action = p->action;
-	RTA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
-	RTA_PUT(skb, TCA_DEF_DATA, p->datalen, p->defdata);
-	t.install = jiffies_to_clock_t(jiffies - p->tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
-	t.expires = jiffies_to_clock_t(p->tm.expires);
-	RTA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
-	return skb->len;
-
-rtattr_failure:
-	skb_trim(skb, b - skb->data);
-	return -1;
-}
-
-#define tca_use_default_ops \
-	.dump           =       tcf_defact_dump, \
-	.cleanup        =       tcf_defact_cleanup, \
-	.init           =       tcf_defact_init, \
-	.walk           =       tcf_generic_walker, \
-
-#define tca_use_default_defines(name) \
-	static u32 idx_gen; \
-	static struct tcf_defact *tcf_##name_ht[MY_TAB_SIZE]; \
-	static DEFINE_RWLOCK(##name_lock);
-#endif /* _NET_ACT_GENERIC_H */
diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h
deleted file mode 100644
index cf5e4d2e4c2..00000000000
--- a/include/net/pkt_act.h
+++ /dev/null
@@ -1,273 +0,0 @@
-#ifndef __NET_PKT_ACT_H
-#define __NET_PKT_ACT_H
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <net/sock.h>
-#include <net/pkt_sched.h>
-
-#define tca_st(val) (struct tcf_##val *)
-#define PRIV(a,name) ( tca_st(name) (a)->priv)
-
-#if 0 /* control */
-#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define DPRINTK(format,args...)
-#endif
-
-#if 0 /* data */
-#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args)
-#else
-#define D2PRINTK(format,args...)
-#endif
-
-static __inline__ unsigned
-tcf_hash(u32 index)
-{
-	return index & MY_TAB_MASK;
-}
-
-/* probably move this from being inline
- * and put into act_generic
-*/
-static inline void
-tcf_hash_destroy(struct tcf_st *p)
-{
-	unsigned h = tcf_hash(p->index);
-	struct tcf_st **p1p;
-
-	for (p1p = &tcf_ht[h]; *p1p; p1p = &(*p1p)->next) {
-		if (*p1p == p) {
-			write_lock_bh(&tcf_t_lock);
-			*p1p = p->next;
-			write_unlock_bh(&tcf_t_lock);
-#ifdef CONFIG_NET_ESTIMATOR
-			gen_kill_estimator(&p->bstats, &p->rate_est);
-#endif
-			kfree(p);
-			return;
-		}
-	}
-	BUG_TRAP(0);
-}
-
-static inline int
-tcf_hash_release(struct tcf_st *p, int bind )
-{
-	int ret = 0;
-	if (p) {
-		if (bind) {
-			p->bindcnt--;
-		}
-		p->refcnt--;
-	       	if(p->bindcnt <=0 && p->refcnt <= 0) {
-			tcf_hash_destroy(p);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
-static __inline__ int
-tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
-		struct tc_action *a)
-{
-	struct tcf_st *p;
-	int err =0, index =  -1,i= 0, s_i = 0, n_i = 0;
-	struct rtattr *r ;
-
-	read_lock(&tcf_t_lock);
-
-	s_i = cb->args[0];
-
-	for (i = 0; i < MY_TAB_SIZE; i++) {
-		p = tcf_ht[tcf_hash(i)];
-
-		for (; p; p = p->next) {
-			index++;
-			if (index < s_i)
-				continue;
-			a->priv = p;
-			a->order = n_i;
-			r = (struct rtattr*) skb->tail;
-			RTA_PUT(skb, a->order, 0, NULL);
-			err = tcf_action_dump_1(skb, a, 0, 0);
-			if (0 > err) {
-				index--;
-				skb_trim(skb, (u8*)r - skb->data);
-				goto done;
-			}
-			r->rta_len = skb->tail - (u8*)r;
-			n_i++;
-			if (n_i >= TCA_ACT_MAX_PRIO) {
-				goto done;
-			}
-		}
-	}
-done:
-	read_unlock(&tcf_t_lock);
-	if (n_i)
-		cb->args[0] += n_i;
-	return n_i;
-
-rtattr_failure:
-	skb_trim(skb, (u8*)r - skb->data);
-	goto done;
-}
-
-static __inline__ int
-tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
-{
-	struct tcf_st *p, *s_p;
-	struct rtattr *r ;
-	int i= 0, n_i = 0;
-
-	r = (struct rtattr*) skb->tail;
-	RTA_PUT(skb, a->order, 0, NULL);
-	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind);
-	for (i = 0; i < MY_TAB_SIZE; i++) {
-		p = tcf_ht[tcf_hash(i)];
-
-		while (p != NULL) {
-			s_p = p->next;
-			if (ACT_P_DELETED == tcf_hash_release(p, 0)) {
-				 module_put(a->ops->owner);
-			}
-			n_i++;
-			p = s_p;
-		}
-	}
-	RTA_PUT(skb, TCA_FCNT, 4, &n_i);
-	r->rta_len = skb->tail - (u8*)r;
-
-	return n_i;
-rtattr_failure:
-	skb_trim(skb, (u8*)r - skb->data);
-	return -EINVAL;
-}
-
-static __inline__ int
-tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, int type,
-		struct tc_action *a)
-{
-		if (type == RTM_DELACTION) {
-			return tcf_del_walker(skb,a);
-		} else if (type == RTM_GETACTION) {
-			return tcf_dump_walker(skb,cb,a);
-		} else {
-			printk("tcf_generic_walker: unknown action %d\n",type);
-			return -EINVAL;
-		}
-}
-
-static __inline__ struct tcf_st *
-tcf_hash_lookup(u32 index)
-{
-	struct tcf_st *p;
-
-	read_lock(&tcf_t_lock);
-	for (p = tcf_ht[tcf_hash(index)]; p; p = p->next) {
-		if (p->index == index)
-			break;
-	}
-	read_unlock(&tcf_t_lock);
-	return p;
-}
-
-static __inline__ u32
-tcf_hash_new_index(void)
-{
-	do {
-		if (++idx_gen == 0)
-			idx_gen = 1;
-	} while (tcf_hash_lookup(idx_gen));
-
-	return idx_gen;
-}
-
-
-static inline int
-tcf_hash_search(struct tc_action *a, u32 index)
-{
-	struct tcf_st *p = tcf_hash_lookup(index);
-
-	if (p != NULL) {
-		a->priv = p;
-		return 1;
-	}
-	return 0;
-}
-
-#ifdef CONFIG_NET_ACT_INIT
-static inline struct tcf_st *
-tcf_hash_check(u32 index, struct tc_action *a, int ovr, int bind)
-{
-	struct tcf_st *p = NULL;
-	if (index && (p = tcf_hash_lookup(index)) != NULL) {
-		if (bind) {
-			p->bindcnt++;
-			p->refcnt++;
-		}
-		a->priv = p;
-	}
-	return p;
-}
-
-static inline struct tcf_st *
-tcf_hash_create(u32 index, struct rtattr *est, struct tc_action *a, int size, int ovr, int bind)
-{
-	struct tcf_st *p = NULL;
-
-	p = kmalloc(size, GFP_KERNEL);
-	if (p == NULL)
-		return p;
-
-	memset(p, 0, size);
-	p->refcnt = 1;
-
-	if (bind) {
-		p->bindcnt = 1;
-	}
-
-	spin_lock_init(&p->lock);
-	p->stats_lock = &p->lock;
-	p->index = index ? : tcf_hash_new_index();
-	p->tm.install = jiffies;
-	p->tm.lastuse = jiffies;
-#ifdef CONFIG_NET_ESTIMATOR
-	if (est)
-		gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
-#endif
-	a->priv = (void *) p;
-	return p;
-}
-
-static inline void tcf_hash_insert(struct tcf_st *p)
-{
-	unsigned h = tcf_hash(p->index);
-
-	write_lock_bh(&tcf_t_lock);
-	p->next = tcf_ht[h];
-	tcf_ht[h] = p;
-	write_unlock_bh(&tcf_t_lock);
-}
-
-#endif
-
-#endif
diff --git a/include/net/tc_act/tc_defact.h b/include/net/tc_act/tc_defact.h
index 463aa671f95..65f024b8095 100644
--- a/include/net/tc_act/tc_defact.h
+++ b/include/net/tc_act/tc_defact.h
@@ -3,11 +3,12 @@
 
 #include <net/act_api.h>
 
-struct tcf_defact
-{
-	tca_gen(defact);
-	u32     datalen;
-	void    *defdata;
+struct tcf_defact {
+	struct tcf_common	common;
+	u32     		tcfd_datalen;
+	void    		*tcfd_defdata;
 };
+#define to_defact(pc) \
+	container_of(pc, struct tcf_defact, common)
 
-#endif
+#endif /* __NET_TC_DEF_H */
diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
index 59f0d9628ad..9e3f6767b80 100644
--- a/include/net/tc_act/tc_gact.h
+++ b/include/net/tc_act/tc_gact.h
@@ -3,15 +3,15 @@
 
 #include <net/act_api.h>
 
-struct tcf_gact
-{
-        tca_gen(gact);
+struct tcf_gact {
+	struct tcf_common	common;
 #ifdef CONFIG_GACT_PROB
-        u16                 ptype;
-        u16                 pval;
-        int                 paction;
+        u16			tcfg_ptype;
+        u16			tcfg_pval;
+        int			tcfg_paction;
 #endif
-                                                                                
 };
-                                                                                
-#endif
+#define to_gact(pc) \
+	container_of(pc, struct tcf_gact, common)
+
+#endif /* __NET_TC_GACT_H */
diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h
index cb37ad08427..f7d25dfcc4b 100644
--- a/include/net/tc_act/tc_ipt.h
+++ b/include/net/tc_act/tc_ipt.h
@@ -5,12 +5,13 @@
 
 struct xt_entry_target;
 
-struct tcf_ipt
-{
-	tca_gen(ipt);
-	u32 hook;
-	char *tname;
-	struct xt_entry_target *t;
+struct tcf_ipt {
+	struct tcf_common	common;
+	u32			tcfi_hook;
+	char			*tcfi_tname;
+	struct xt_entry_target	*tcfi_t;
 };
+#define to_ipt(pc) \
+	container_of(pc, struct tcf_ipt, common)
 
-#endif
+#endif /* __NET_TC_IPT_H */
diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index b5c32f65c12..ceac661cdfd 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -3,13 +3,14 @@
 
 #include <net/act_api.h>
 
-struct tcf_mirred
-{
-	tca_gen(mirred);
-	int eaction;
-	int ifindex;
-	int ok_push;
-	struct net_device *dev;
+struct tcf_mirred {
+	struct tcf_common	common;
+	int			tcfm_eaction;
+	int			tcfm_ifindex;
+	int			tcfm_ok_push;
+	struct net_device	*tcfm_dev;
 };
+#define to_mirred(pc) \
+	container_of(pc, struct tcf_mirred, common)
 
-#endif
+#endif /* __NET_TC_MIR_H */
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index eb21689d759..e6f6e15956f 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -3,12 +3,13 @@
 
 #include <net/act_api.h>
 
-struct tcf_pedit
-{
-	tca_gen(pedit);
-	unsigned char           nkeys;
-	unsigned char           flags;
-	struct tc_pedit_key     *keys;
+struct tcf_pedit {
+	struct tcf_common	common;
+	unsigned char		tcfp_nkeys;
+	unsigned char		tcfp_flags;
+	struct tc_pedit_key	*tcfp_keys;
 };
+#define to_pedit(pc) \
+	container_of(pc, struct tcf_pedit, common)
 
-#endif
+#endif /* __NET_TC_PED_H */
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 6990747d6d5..835070e9169 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -33,16 +33,230 @@
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 
-#if 0 /* control */
-#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args)
-#else
-#define DPRINTK(format, args...)
+void tcf_hash_destroy(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+{
+	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
+	struct tcf_common **p1p;
+
+	for (p1p = &hinfo->htab[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
+		if (*p1p == p) {
+			write_lock_bh(hinfo->lock);
+			*p1p = p->tcfc_next;
+			write_unlock_bh(hinfo->lock);
+#ifdef CONFIG_NET_ESTIMATOR
+			gen_kill_estimator(&p->tcfc_bstats,
+					   &p->tcfc_rate_est);
 #endif
-#if 0 /* data */
-#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args)
-#else
-#define D2PRINTK(format, args...)
+			kfree(p);
+			return;
+		}
+	}
+	BUG_TRAP(0);
+}
+EXPORT_SYMBOL(tcf_hash_destroy);
+
+int tcf_hash_release(struct tcf_common *p, int bind,
+		     struct tcf_hashinfo *hinfo)
+{
+	int ret = 0;
+
+	if (p) {
+		if (bind)
+			p->tcfc_bindcnt--;
+
+		p->tcfc_refcnt--;
+	       	if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
+			tcf_hash_destroy(p, hinfo);
+			ret = 1;
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL(tcf_hash_release);
+
+static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
+			   struct tc_action *a, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p;
+	int err = 0, index = -1,i = 0, s_i = 0, n_i = 0;
+	struct rtattr *r ;
+
+	read_lock(hinfo->lock);
+
+	s_i = cb->args[0];
+
+	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+
+		for (; p; p = p->tcfc_next) {
+			index++;
+			if (index < s_i)
+				continue;
+			a->priv = p;
+			a->order = n_i;
+			r = (struct rtattr*) skb->tail;
+			RTA_PUT(skb, a->order, 0, NULL);
+			err = tcf_action_dump_1(skb, a, 0, 0);
+			if (err < 0) {
+				index--;
+				skb_trim(skb, (u8*)r - skb->data);
+				goto done;
+			}
+			r->rta_len = skb->tail - (u8*)r;
+			n_i++;
+			if (n_i >= TCA_ACT_MAX_PRIO)
+				goto done;
+		}
+	}
+done:
+	read_unlock(hinfo->lock);
+	if (n_i)
+		cb->args[0] += n_i;
+	return n_i;
+
+rtattr_failure:
+	skb_trim(skb, (u8*)r - skb->data);
+	goto done;
+}
+
+static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
+			  struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p, *s_p;
+	struct rtattr *r ;
+	int i= 0, n_i = 0;
+
+	r = (struct rtattr*) skb->tail;
+	RTA_PUT(skb, a->order, 0, NULL);
+	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind);
+	for (i = 0; i < (hinfo->hmask + 1); i++) {
+		p = hinfo->htab[tcf_hash(i, hinfo->hmask)];
+
+		while (p != NULL) {
+			s_p = p->tcfc_next;
+			if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
+				 module_put(a->ops->owner);
+			n_i++;
+			p = s_p;
+		}
+	}
+	RTA_PUT(skb, TCA_FCNT, 4, &n_i);
+	r->rta_len = skb->tail - (u8*)r;
+
+	return n_i;
+rtattr_failure:
+	skb_trim(skb, (u8*)r - skb->data);
+	return -EINVAL;
+}
+
+int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
+		       int type, struct tc_action *a)
+{
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+
+	if (type == RTM_DELACTION) {
+		return tcf_del_walker(skb, a, hinfo);
+	} else if (type == RTM_GETACTION) {
+		return tcf_dump_walker(skb, cb, a, hinfo);
+	} else {
+		printk("tcf_generic_walker: unknown action %d\n", type);
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(tcf_generic_walker);
+
+struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p;
+
+	read_lock(hinfo->lock);
+	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p;
+	     p = p->tcfc_next) {
+		if (p->tcfc_index == index)
+			break;
+	}
+	read_unlock(hinfo->lock);
+
+	return p;
+}
+EXPORT_SYMBOL(tcf_hash_lookup);
+
+u32 tcf_hash_new_index(u32 *idx_gen, struct tcf_hashinfo *hinfo)
+{
+	u32 val = *idx_gen;
+
+	do {
+		if (++val == 0)
+			val = 1;
+	} while (tcf_hash_lookup(val, hinfo));
+
+	return (*idx_gen = val);
+}
+EXPORT_SYMBOL(tcf_hash_new_index);
+
+int tcf_hash_search(struct tc_action *a, u32 index)
+{
+	struct tcf_hashinfo *hinfo = a->ops->hinfo;
+	struct tcf_common *p = tcf_hash_lookup(index, hinfo);
+
+	if (p) {
+		a->priv = p;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(tcf_hash_search);
+
+struct tcf_common *tcf_hash_check(u32 index, struct tc_action *a, int bind,
+				  struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p = NULL;
+	if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
+		if (bind) {
+			p->tcfc_bindcnt++;
+			p->tcfc_refcnt++;
+		}
+		a->priv = p;
+	}
+	return p;
+}
+EXPORT_SYMBOL(tcf_hash_check);
+
+struct tcf_common *tcf_hash_create(u32 index, struct rtattr *est, struct tc_action *a, int size, int bind, u32 *idx_gen, struct tcf_hashinfo *hinfo)
+{
+	struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+
+	if (unlikely(!p))
+		return p;
+	p->tcfc_refcnt = 1;
+	if (bind)
+		p->tcfc_bindcnt = 1;
+
+	spin_lock_init(&p->tcfc_lock);
+	p->tcfc_stats_lock = &p->tcfc_lock;
+	p->tcfc_index = index ? index : tcf_hash_new_index(idx_gen, hinfo);
+	p->tcfc_tm.install = jiffies;
+	p->tcfc_tm.lastuse = jiffies;
+#ifdef CONFIG_NET_ESTIMATOR
+	if (est)
+		gen_new_estimator(&p->tcfc_bstats, &p->tcfc_rate_est,
+				  p->tcfc_stats_lock, est);
 #endif
+	a->priv = (void *) p;
+	return p;
+}
+EXPORT_SYMBOL(tcf_hash_create);
+
+void tcf_hash_insert(struct tcf_common *p, struct tcf_hashinfo *hinfo)
+{
+	unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
+
+	write_lock_bh(hinfo->lock);
+	p->tcfc_next = hinfo->htab[h];
+	hinfo->htab[h] = p;
+	write_unlock_bh(hinfo->lock);
+}
+EXPORT_SYMBOL(tcf_hash_insert);
 
 static struct tc_action_ops *act_base = NULL;
 static DEFINE_RWLOCK(act_mod_lock);
@@ -155,9 +369,6 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action *act,
 
 	if (skb->tc_verd & TC_NCLS) {
 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-		D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n",
-		         skb, skb->input_dev ? skb->input_dev->name : "xxx",
-		         skb->dev->name);
 		ret = TC_ACT_OK;
 		goto exec_done;
 	}
@@ -187,8 +398,6 @@ void tcf_action_destroy(struct tc_action *act, int bind)
 
 	for (a = act; a; a = act) {
 		if (a->ops && a->ops->cleanup) {
-			DPRINTK("tcf_action_destroy destroying %p next %p\n",
-			        a, a->next);
 			if (a->ops->cleanup(a, bind) == ACT_P_DELETED)
 				module_put(a->ops->owner);
 			act = act->next;
@@ -331,7 +540,6 @@ struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est,
 	if (*err != ACT_P_CREATED)
 		module_put(a_o->owner);
 	a->ops = a_o;
-	DPRINTK("tcf_action_init_1: successfull %s\n", act_name);
 
 	*err = 0;
 	return a;
@@ -392,12 +600,12 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 	if (compat_mode) {
 		if (a->type == TCA_OLD_COMPAT)
 			err = gnet_stats_start_copy_compat(skb, 0,
-				TCA_STATS, TCA_XSTATS, h->stats_lock, &d);
+				TCA_STATS, TCA_XSTATS, h->tcf_stats_lock, &d);
 		else
 			return 0;
 	} else
 		err = gnet_stats_start_copy(skb, TCA_ACT_STATS,
-			h->stats_lock, &d);
+			h->tcf_stats_lock, &d);
 
 	if (err < 0)
 		goto errout;
@@ -406,11 +614,11 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a,
 		if (a->ops->get_stats(skb, a) < 0)
 			goto errout;
 
-	if (gnet_stats_copy_basic(&d, &h->bstats) < 0 ||
+	if (gnet_stats_copy_basic(&d, &h->tcf_bstats) < 0 ||
 #ifdef CONFIG_NET_ESTIMATOR
-	    gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &h->tcf_rate_est) < 0 ||
 #endif
-	    gnet_stats_copy_queue(&d, &h->qstats) < 0)
+	    gnet_stats_copy_queue(&d, &h->tcf_qstats) < 0)
 		goto errout;
 
 	if (gnet_stats_finish_copy(&d) < 0)
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e75a147ad60..6cff56696a8 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -34,48 +34,43 @@
 #include <linux/tc_act/tc_gact.h>
 #include <net/tc_act/tc_gact.h>
 
-/* use generic hash table */
-#define MY_TAB_SIZE	16
-#define MY_TAB_MASK	15
-
-static u32 idx_gen;
-static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE];
+#define GACT_TAB_MASK	15
+static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1];
+static u32 gact_idx_gen;
 static DEFINE_RWLOCK(gact_lock);
 
-/* ovewrride the defaults */
-#define tcf_st		tcf_gact
-#define tc_st		tc_gact
-#define tcf_t_lock	gact_lock
-#define tcf_ht		tcf_gact_ht
-
-#define CONFIG_NET_ACT_INIT 1
-#include <net/pkt_act.h>
+static struct tcf_hashinfo gact_hash_info = {
+	.htab	=	tcf_gact_ht,
+	.hmask	=	GACT_TAB_MASK,
+	.lock	=	&gact_lock,
+};
 
 #ifdef CONFIG_GACT_PROB
-static int gact_net_rand(struct tcf_gact *p)
+static int gact_net_rand(struct tcf_gact *gact)
 {
-	if (net_random()%p->pval)
-		return p->action;
-	return p->paction;
+	if (net_random() % gact->tcfg_pval)
+		return gact->tcf_action;
+	return gact->tcfg_paction;
 }
 
-static int gact_determ(struct tcf_gact *p)
+static int gact_determ(struct tcf_gact *gact)
 {
-	if (p->bstats.packets%p->pval)
-		return p->action;
-	return p->paction;
+	if (gact->tcf_bstats.packets % gact->tcfg_pval)
+		return gact->tcf_action;
+	return gact->tcfg_paction;
 }
 
-typedef int (*g_rand)(struct tcf_gact *p);
+typedef int (*g_rand)(struct tcf_gact *gact);
 static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
-#endif
+#endif /* CONFIG_GACT_PROB */
 
 static int tcf_gact_init(struct rtattr *rta, struct rtattr *est,
                          struct tc_action *a, int ovr, int bind)
 {
 	struct rtattr *tb[TCA_GACT_MAX];
 	struct tc_gact *parm;
-	struct tcf_gact *p;
+	struct tcf_gact *gact;
+	struct tcf_common *pc;
 	int ret = 0;
 
 	if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0)
@@ -94,105 +89,106 @@ static int tcf_gact_init(struct rtattr *rta, struct rtattr *est,
 		return -EOPNOTSUPP;
 #endif
 
-	p = tcf_hash_check(parm->index, a, ovr, bind);
-	if (p == NULL) {
-		p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
-		if (p == NULL)
+	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
+				     bind, &gact_idx_gen, &gact_hash_info);
+		if (unlikely(!pc))
 			return -ENOMEM;
 		ret = ACT_P_CREATED;
 	} else {
 		if (!ovr) {
-			tcf_hash_release(p, bind);
+			tcf_hash_release(pc, bind, &gact_hash_info);
 			return -EEXIST;
 		}
 	}
 
-	spin_lock_bh(&p->lock);
-	p->action = parm->action;
+	gact = to_gact(pc);
+
+	spin_lock_bh(&gact->tcf_lock);
+	gact->tcf_action = parm->action;
 #ifdef CONFIG_GACT_PROB
 	if (tb[TCA_GACT_PROB-1] != NULL) {
 		struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]);
-		p->paction = p_parm->paction;
-		p->pval    = p_parm->pval;
-		p->ptype   = p_parm->ptype;
+		gact->tcfg_paction = p_parm->paction;
+		gact->tcfg_pval    = p_parm->pval;
+		gact->tcfg_ptype   = p_parm->ptype;
 	}
 #endif
-	spin_unlock_bh(&p->lock);
+	spin_unlock_bh(&gact->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(p);
+		tcf_hash_insert(pc, &gact_hash_info);
 	return ret;
 }
 
-static int
-tcf_gact_cleanup(struct tc_action *a, int bind)
+static int tcf_gact_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_gact *p = PRIV(a, gact);
+	struct tcf_gact *gact = a->priv;
 
-	if (p != NULL)
-		return tcf_hash_release(p, bind);
+	if (gact)
+		return tcf_hash_release(&gact->common, bind, &gact_hash_info);
 	return 0;
 }
 
-static int
-tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 {
-	struct tcf_gact *p = PRIV(a, gact);
+	struct tcf_gact *gact = a->priv;
 	int action = TC_ACT_SHOT;
 
-	spin_lock(&p->lock);
+	spin_lock(&gact->tcf_lock);
 #ifdef CONFIG_GACT_PROB
-	if (p->ptype && gact_rand[p->ptype] != NULL)
-		action = gact_rand[p->ptype](p);
+	if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL)
+		action = gact_rand[gact->tcfg_ptype](gact);
 	else
-		action = p->action;
+		action = gact->tcf_action;
 #else
-	action = p->action;
+	action = gact->tcf_action;
 #endif
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
+	gact->tcf_bstats.bytes += skb->len;
+	gact->tcf_bstats.packets++;
 	if (action == TC_ACT_SHOT)
-		p->qstats.drops++;
-	p->tm.lastuse = jiffies;
-	spin_unlock(&p->lock);
+		gact->tcf_qstats.drops++;
+	gact->tcf_tm.lastuse = jiffies;
+	spin_unlock(&gact->tcf_lock);
 
 	return action;
 }
 
-static int
-tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb->tail;
 	struct tc_gact opt;
-	struct tcf_gact *p = PRIV(a, gact);
+	struct tcf_gact *gact = a->priv;
 	struct tcf_t t;
 
-	opt.index = p->index;
-	opt.refcnt = p->refcnt - ref;
-	opt.bindcnt = p->bindcnt - bind;
-	opt.action = p->action;
+	opt.index = gact->tcf_index;
+	opt.refcnt = gact->tcf_refcnt - ref;
+	opt.bindcnt = gact->tcf_bindcnt - bind;
+	opt.action = gact->tcf_action;
 	RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
 #ifdef CONFIG_GACT_PROB
-	if (p->ptype) {
+	if (gact->tcfg_ptype) {
 		struct tc_gact_p p_opt;
-		p_opt.paction = p->paction;
-		p_opt.pval = p->pval;
-		p_opt.ptype = p->ptype;
+		p_opt.paction = gact->tcfg_paction;
+		p_opt.pval = gact->tcfg_pval;
+		p_opt.ptype = gact->tcfg_ptype;
 		RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
 	}
 #endif
-	t.install = jiffies_to_clock_t(jiffies - p->tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
-	t.expires = jiffies_to_clock_t(p->tm.expires);
+	t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
 	RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
 	return skb->len;
 
-      rtattr_failure:
+rtattr_failure:
 	skb_trim(skb, b - skb->data);
 	return -1;
 }
 
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
+	.hinfo		=	&gact_hash_info,
 	.type		=	TCA_ACT_GACT,
 	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
@@ -208,8 +204,7 @@ MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
 MODULE_DESCRIPTION("Generic Classifier actions");
 MODULE_LICENSE("GPL");
 
-static int __init
-gact_init_module(void)
+static int __init gact_init_module(void)
 {
 #ifdef CONFIG_GACT_PROB
 	printk("GACT probability on\n");
@@ -219,8 +214,7 @@ gact_init_module(void)
 	return tcf_register_action(&act_gact_ops);
 }
 
-static void __exit
-gact_cleanup_module(void)
+static void __exit gact_cleanup_module(void)
 {
 	tcf_unregister_action(&act_gact_ops);
 }
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d799e01248c..224c078a398 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -38,25 +38,19 @@
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 
-/* use generic hash table */
-#define MY_TAB_SIZE     16
-#define MY_TAB_MASK     15
 
-static u32 idx_gen;
-static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE];
-/* ipt hash table lock */
+#define IPT_TAB_MASK     15
+static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1];
+static u32 ipt_idx_gen;
 static DEFINE_RWLOCK(ipt_lock);
 
-/* ovewrride the defaults */
-#define tcf_st		tcf_ipt
-#define tcf_t_lock	ipt_lock
-#define tcf_ht		tcf_ipt_ht
-
-#define CONFIG_NET_ACT_INIT
-#include <net/pkt_act.h>
+static struct tcf_hashinfo ipt_hash_info = {
+	.htab	=	tcf_ipt_ht,
+	.hmask	=	IPT_TAB_MASK,
+	.lock	=	&ipt_lock,
+};
 
-static int
-ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 {
 	struct ipt_target *target;
 	int ret = 0;
@@ -65,7 +59,6 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 	if (!target)
 		return -ENOENT;
 
-	DPRINTK("ipt_init_target: found %s\n", target->name);
 	t->u.kernel.target = target;
 
 	ret = xt_check_target(target, AF_INET, t->u.target_size - sizeof(*t),
@@ -78,8 +71,6 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 		    			       t->u.kernel.target, t->data,
 					       t->u.target_size - sizeof(*t),
 					       hook)) {
-		DPRINTK("ipt_init_target: check failed for `%s'.\n",
-			t->u.kernel.target->name);
 		module_put(t->u.kernel.target->me);
 		ret = -EINVAL;
 	}
@@ -87,8 +78,7 @@ ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
 	return ret;
 }
 
-static void
-ipt_destroy_target(struct ipt_entry_target *t)
+static void ipt_destroy_target(struct ipt_entry_target *t)
 {
 	if (t->u.kernel.target->destroy)
 		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
@@ -96,31 +86,30 @@ ipt_destroy_target(struct ipt_entry_target *t)
         module_put(t->u.kernel.target->me);
 }
 
-static int
-tcf_ipt_release(struct tcf_ipt *p, int bind)
+static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
 {
 	int ret = 0;
-	if (p) {
+	if (ipt) {
 		if (bind)
-			p->bindcnt--;
-		p->refcnt--;
-		if (p->bindcnt <= 0 && p->refcnt <= 0) {
-			ipt_destroy_target(p->t);
-			kfree(p->tname);
-			kfree(p->t);
-			tcf_hash_destroy(p);
+			ipt->tcf_bindcnt--;
+		ipt->tcf_refcnt--;
+		if (ipt->tcf_bindcnt <= 0 && ipt->tcf_refcnt <= 0) {
+			ipt_destroy_target(ipt->tcfi_t);
+			kfree(ipt->tcfi_tname);
+			kfree(ipt->tcfi_t);
+			tcf_hash_destroy(&ipt->common, &ipt_hash_info);
 			ret = ACT_P_DELETED;
 		}
 	}
 	return ret;
 }
 
-static int
-tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
-             int ovr, int bind)
+static int tcf_ipt_init(struct rtattr *rta, struct rtattr *est,
+			struct tc_action *a, int ovr, int bind)
 {
 	struct rtattr *tb[TCA_IPT_MAX];
-	struct tcf_ipt *p;
+	struct tcf_ipt *ipt;
+	struct tcf_common *pc;
 	struct ipt_entry_target *td, *t;
 	char *tname;
 	int ret = 0, err;
@@ -144,49 +133,51 @@ tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
 	    RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32))
 		index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]);
 
-	p = tcf_hash_check(index, a, ovr, bind);
-	if (p == NULL) {
-		p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind);
-		if (p == NULL)
+	pc = tcf_hash_check(index, a, bind, &ipt_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
+				     &ipt_idx_gen, &ipt_hash_info);
+		if (unlikely(!pc))
 			return -ENOMEM;
 		ret = ACT_P_CREATED;
 	} else {
 		if (!ovr) {
-			tcf_ipt_release(p, bind);
+			tcf_ipt_release(to_ipt(pc), bind);
 			return -EEXIST;
 		}
 	}
+	ipt = to_ipt(pc);
 
 	hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]);
 
 	err = -ENOMEM;
 	tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
-	if (tname == NULL)
+	if (unlikely(!tname))
 		goto err1;
 	if (tb[TCA_IPT_TABLE - 1] == NULL ||
 	    rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ)
 		strcpy(tname, "mangle");
 
 	t = kmalloc(td->u.target_size, GFP_KERNEL);
-	if (t == NULL)
+	if (unlikely(!t))
 		goto err2;
 	memcpy(t, td, td->u.target_size);
 
 	if ((err = ipt_init_target(t, tname, hook)) < 0)
 		goto err3;
 
-	spin_lock_bh(&p->lock);
+	spin_lock_bh(&ipt->tcf_lock);
 	if (ret != ACT_P_CREATED) {
-		ipt_destroy_target(p->t);
-		kfree(p->tname);
-		kfree(p->t);
+		ipt_destroy_target(ipt->tcfi_t);
+		kfree(ipt->tcfi_tname);
+		kfree(ipt->tcfi_t);
 	}
-	p->tname = tname;
-	p->t     = t;
-	p->hook  = hook;
-	spin_unlock_bh(&p->lock);
+	ipt->tcfi_tname = tname;
+	ipt->tcfi_t     = t;
+	ipt->tcfi_hook  = hook;
+	spin_unlock_bh(&ipt->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(p);
+		tcf_hash_insert(pc, &ipt_hash_info);
 	return ret;
 
 err3:
@@ -194,33 +185,32 @@ err3:
 err2:
 	kfree(tname);
 err1:
-	kfree(p);
+	kfree(pc);
 	return err;
 }
 
-static int
-tcf_ipt_cleanup(struct tc_action *a, int bind)
+static int tcf_ipt_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_ipt *p = PRIV(a, ipt);
-	return tcf_ipt_release(p, bind);
+	struct tcf_ipt *ipt = a->priv;
+	return tcf_ipt_release(ipt, bind);
 }
 
-static int
-tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
+		   struct tcf_result *res)
 {
 	int ret = 0, result = 0;
-	struct tcf_ipt *p = PRIV(a, ipt);
+	struct tcf_ipt *ipt = a->priv;
 
 	if (skb_cloned(skb)) {
 		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
 			return TC_ACT_UNSPEC;
 	}
 
-	spin_lock(&p->lock);
+	spin_lock(&ipt->tcf_lock);
 
-	p->tm.lastuse = jiffies;
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
+	ipt->tcf_tm.lastuse = jiffies;
+	ipt->tcf_bstats.bytes += skb->len;
+	ipt->tcf_bstats.packets++;
 
 	/* yes, we have to worry about both in and out dev
 	 worry later - danger - this API seems to have changed
@@ -229,16 +219,17 @@ tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 	/* iptables targets take a double skb pointer in case the skb
 	 * needs to be replaced. We don't own the skb, so this must not
 	 * happen. The pskb_expand_head above should make sure of this */
-	ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, p->hook,
-					    p->t->u.kernel.target, p->t->data,
-					    NULL);
+	ret = ipt->tcfi_t->u.kernel.target->target(&skb, skb->dev, NULL,
+						   ipt->tcfi_hook,
+						   ipt->tcfi_t->u.kernel.target,
+						   ipt->tcfi_t->data, NULL);
 	switch (ret) {
 	case NF_ACCEPT:
 		result = TC_ACT_OK;
 		break;
 	case NF_DROP:
 		result = TC_ACT_SHOT;
-		p->qstats.drops++;
+		ipt->tcf_qstats.drops++;
 		break;
 	case IPT_CONTINUE:
 		result = TC_ACT_PIPE;
@@ -249,53 +240,46 @@ tcf_ipt(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 		result = TC_POLICE_OK;
 		break;
 	}
-	spin_unlock(&p->lock);
+	spin_unlock(&ipt->tcf_lock);
 	return result;
 
 }
 
-static int
-tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
+	unsigned char *b = skb->tail;
+	struct tcf_ipt *ipt = a->priv;
 	struct ipt_entry_target *t;
 	struct tcf_t tm;
 	struct tc_cnt c;
-	unsigned char *b = skb->tail;
-	struct tcf_ipt *p = PRIV(a, ipt);
 
 	/* for simple targets kernel size == user size
 	** user name = target name
 	** for foolproof you need to not assume this
 	*/
 
-	t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC);
-	if (t == NULL)
+	t = kmalloc(ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
+	if (unlikely(!t))
 		goto rtattr_failure;
 
-	c.bindcnt = p->bindcnt - bind;
-	c.refcnt = p->refcnt - ref;
-	memcpy(t, p->t, p->t->u.user.target_size);
-	strcpy(t->u.user.name, p->t->u.kernel.target->name);
-
-	DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname,
-		strlen(p->tname));
-	DPRINTK("\tdump target name %s size %d size user %d "
-	        "data[0] %x data[1] %x\n", p->t->u.kernel.target->name,
-	        p->t->u.target_size, p->t->u.user.target_size,
-	        p->t->data[0], p->t->data[1]);
-	RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t);
-	RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index);
-	RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook);
+	c.bindcnt = ipt->tcf_bindcnt - bind;
+	c.refcnt = ipt->tcf_refcnt - ref;
+	memcpy(t, ipt->tcfi_t, ipt->tcfi_t->u.user.target_size);
+	strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
+
+	RTA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t);
+	RTA_PUT(skb, TCA_IPT_INDEX, 4, &ipt->tcf_index);
+	RTA_PUT(skb, TCA_IPT_HOOK, 4, &ipt->tcfi_hook);
 	RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
-	RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname);
-	tm.install = jiffies_to_clock_t(jiffies - p->tm.install);
-	tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
-	tm.expires = jiffies_to_clock_t(p->tm.expires);
+	RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, ipt->tcfi_tname);
+	tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
+	tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
+	tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
 	RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
 	kfree(t);
 	return skb->len;
 
-      rtattr_failure:
+rtattr_failure:
 	skb_trim(skb, b - skb->data);
 	kfree(t);
 	return -1;
@@ -303,6 +287,7 @@ tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 
 static struct tc_action_ops act_ipt_ops = {
 	.kind		=	"ipt",
+	.hinfo		=	&ipt_hash_info,
 	.type		=	TCA_ACT_IPT,
 	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
@@ -318,14 +303,12 @@ MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
 MODULE_DESCRIPTION("Iptables target actions");
 MODULE_LICENSE("GPL");
 
-static int __init
-ipt_init_module(void)
+static int __init ipt_init_module(void)
 {
 	return tcf_register_action(&act_ipt_ops);
 }
 
-static void __exit
-ipt_cleanup_module(void)
+static void __exit ipt_cleanup_module(void)
 {
 	tcf_unregister_action(&act_ipt_ops);
 }
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fc562047ecc..483897271f1 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -39,46 +39,39 @@
 #include <linux/etherdevice.h>
 #include <linux/if_arp.h>
 
-
-/* use generic hash table */
-#define MY_TAB_SIZE     8
-#define MY_TAB_MASK     (MY_TAB_SIZE - 1)
-static u32 idx_gen;
-static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE];
+#define MIRRED_TAB_MASK     7
+static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1];
+static u32 mirred_idx_gen;
 static DEFINE_RWLOCK(mirred_lock);
 
-/* ovewrride the defaults */
-#define tcf_st		tcf_mirred
-#define tc_st		tc_mirred
-#define tcf_t_lock	mirred_lock
-#define tcf_ht		tcf_mirred_ht
-
-#define CONFIG_NET_ACT_INIT 1
-#include <net/pkt_act.h>
+static struct tcf_hashinfo mirred_hash_info = {
+	.htab	=	tcf_mirred_ht,
+	.hmask	=	MIRRED_TAB_MASK,
+	.lock	=	&mirred_lock,
+};
 
-static inline int
-tcf_mirred_release(struct tcf_mirred *p, int bind)
+static inline int tcf_mirred_release(struct tcf_mirred *m, int bind)
 {
-	if (p) {
+	if (m) {
 		if (bind)
-			p->bindcnt--;
-		p->refcnt--;
-		if(!p->bindcnt && p->refcnt <= 0) {
-			dev_put(p->dev);
-			tcf_hash_destroy(p);
+			m->tcf_bindcnt--;
+		m->tcf_refcnt--;
+		if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
+			dev_put(m->tcfm_dev);
+			tcf_hash_destroy(&m->common, &mirred_hash_info);
 			return 1;
 		}
 	}
 	return 0;
 }
 
-static int
-tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
-                int ovr, int bind)
+static int tcf_mirred_init(struct rtattr *rta, struct rtattr *est,
+			   struct tc_action *a, int ovr, int bind)
 {
 	struct rtattr *tb[TCA_MIRRED_MAX];
 	struct tc_mirred *parm;
-	struct tcf_mirred *p;
+	struct tcf_mirred *m;
+	struct tcf_common *pc;
 	struct net_device *dev = NULL;
 	int ret = 0;
 	int ok_push = 0;
@@ -110,64 +103,62 @@ tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
 		}
 	}
 
-	p = tcf_hash_check(parm->index, a, ovr, bind);
-	if (p == NULL) {
+	pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info);
+	if (!pc) {
 		if (!parm->ifindex)
 			return -EINVAL;
-		p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
-		if (p == NULL)
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind,
+				     &mirred_idx_gen, &mirred_hash_info);
+		if (unlikely(!pc))
 			return -ENOMEM;
 		ret = ACT_P_CREATED;
 	} else {
 		if (!ovr) {
-			tcf_mirred_release(p, bind);
+			tcf_mirred_release(to_mirred(pc), bind);
 			return -EEXIST;
 		}
 	}
+	m = to_mirred(pc);
 
-	spin_lock_bh(&p->lock);
-	p->action = parm->action;
-	p->eaction = parm->eaction;
+	spin_lock_bh(&m->tcf_lock);
+	m->tcf_action = parm->action;
+	m->tcfm_eaction = parm->eaction;
 	if (parm->ifindex) {
-		p->ifindex = parm->ifindex;
+		m->tcfm_ifindex = parm->ifindex;
 		if (ret != ACT_P_CREATED)
-			dev_put(p->dev);
-		p->dev = dev;
+			dev_put(m->tcfm_dev);
+		m->tcfm_dev = dev;
 		dev_hold(dev);
-		p->ok_push = ok_push;
+		m->tcfm_ok_push = ok_push;
 	}
-	spin_unlock_bh(&p->lock);
+	spin_unlock_bh(&m->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(p);
+		tcf_hash_insert(pc, &mirred_hash_info);
 
-	DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s "
-	        "ifindex %d\n", parm->index, parm->action, parm->eaction,
-	        dev->name, parm->ifindex);
 	return ret;
 }
 
-static int
-tcf_mirred_cleanup(struct tc_action *a, int bind)
+static int tcf_mirred_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_mirred *p = PRIV(a, mirred);
+	struct tcf_mirred *m = a->priv;
 
-	if (p != NULL)
-		return tcf_mirred_release(p, bind);
+	if (m)
+		return tcf_mirred_release(m, bind);
 	return 0;
 }
 
-static int
-tcf_mirred(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
+		      struct tcf_result *res)
 {
-	struct tcf_mirred *p = PRIV(a, mirred);
+	struct tcf_mirred *m = a->priv;
 	struct net_device *dev;
 	struct sk_buff *skb2 = NULL;
 	u32 at = G_TC_AT(skb->tc_verd);
 
-	spin_lock(&p->lock);
+	spin_lock(&m->tcf_lock);
 
-	dev = p->dev;
-	p->tm.lastuse = jiffies;
+	dev = m->tcfm_dev;
+	m->tcf_tm.lastuse = jiffies;
 
 	if (!(dev->flags&IFF_UP) ) {
 		if (net_ratelimit())
@@ -176,10 +167,10 @@ tcf_mirred(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 bad_mirred:
 		if (skb2 != NULL)
 			kfree_skb(skb2);
-		p->qstats.overlimits++;
-		p->bstats.bytes += skb->len;
-		p->bstats.packets++;
-		spin_unlock(&p->lock);
+		m->tcf_qstats.overlimits++;
+		m->tcf_bstats.bytes += skb->len;
+		m->tcf_bstats.packets++;
+		spin_unlock(&m->tcf_lock);
 		/* should we be asking for packet to be dropped?
 		 * may make sense for redirect case only
 		*/
@@ -189,59 +180,59 @@ bad_mirred:
 	skb2 = skb_clone(skb, GFP_ATOMIC);
 	if (skb2 == NULL)
 		goto bad_mirred;
-	if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) {
+	if (m->tcfm_eaction != TCA_EGRESS_MIRROR &&
+	    m->tcfm_eaction != TCA_EGRESS_REDIR) {
 		if (net_ratelimit())
-			printk("tcf_mirred unknown action %d\n", p->eaction);
+			printk("tcf_mirred unknown action %d\n",
+			       m->tcfm_eaction);
 		goto bad_mirred;
 	}
 
-	p->bstats.bytes += skb2->len;
-	p->bstats.packets++;
+	m->tcf_bstats.bytes += skb2->len;
+	m->tcf_bstats.packets++;
 	if (!(at & AT_EGRESS))
-		if (p->ok_push)
+		if (m->tcfm_ok_push)
 			skb_push(skb2, skb2->dev->hard_header_len);
 
 	/* mirror is always swallowed */
-	if (p->eaction != TCA_EGRESS_MIRROR)
+	if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
 		skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
 
 	skb2->dev = dev;
 	skb2->input_dev = skb->dev;
 	dev_queue_xmit(skb2);
-	spin_unlock(&p->lock);
-	return p->action;
+	spin_unlock(&m->tcf_lock);
+	return m->tcf_action;
 }
 
-static int
-tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
+static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char *b = skb->tail;
+	struct tcf_mirred *m = a->priv;
 	struct tc_mirred opt;
-	struct tcf_mirred *p = PRIV(a, mirred);
 	struct tcf_t t;
 
-	opt.index = p->index;
-	opt.action = p->action;
-	opt.refcnt = p->refcnt - ref;
-	opt.bindcnt = p->bindcnt - bind;
-	opt.eaction = p->eaction;
-	opt.ifindex = p->ifindex;
-	DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n",
-	         p->index, p->action, p->eaction, p->ifindex);
+	opt.index = m->tcf_index;
+	opt.action = m->tcf_action;
+	opt.refcnt = m->tcf_refcnt - ref;
+	opt.bindcnt = m->tcf_bindcnt - bind;
+	opt.eaction = m->tcfm_eaction;
+	opt.ifindex = m->tcfm_ifindex;
 	RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
-	t.install = jiffies_to_clock_t(jiffies - p->tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
-	t.expires = jiffies_to_clock_t(p->tm.expires);
+	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
 	RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
 	return skb->len;
 
-      rtattr_failure:
+rtattr_failure:
 	skb_trim(skb, b - skb->data);
 	return -1;
 }
 
 static struct tc_action_ops act_mirred_ops = {
 	.kind		=	"mirred",
+	.hinfo		=	&mirred_hash_info,
 	.type		=	TCA_ACT_MIRRED,
 	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
@@ -257,15 +248,13 @@ MODULE_AUTHOR("Jamal Hadi Salim(2002)");
 MODULE_DESCRIPTION("Device Mirror/redirect actions");
 MODULE_LICENSE("GPL");
 
-static int __init
-mirred_init_module(void)
+static int __init mirred_init_module(void)
 {
 	printk("Mirror/redirect action on\n");
 	return tcf_register_action(&act_mirred_ops);
 }
 
-static void __exit
-mirred_cleanup_module(void)
+static void __exit mirred_cleanup_module(void)
 {
 	tcf_unregister_action(&act_mirred_ops);
 }
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index f257475e0e0..8ac65c219b9 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -33,32 +33,25 @@
 #include <linux/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_pedit.h>
 
-
-#define PEDIT_DEB 1
-
-/* use generic hash table */
-#define MY_TAB_SIZE     16
-#define MY_TAB_MASK     15
-static u32 idx_gen;
-static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE];
+#define PEDIT_TAB_MASK	15
+static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1];
+static u32 pedit_idx_gen;
 static DEFINE_RWLOCK(pedit_lock);
 
-#define tcf_st		tcf_pedit
-#define tc_st		tc_pedit
-#define tcf_t_lock	pedit_lock
-#define tcf_ht		tcf_pedit_ht
-
-#define CONFIG_NET_ACT_INIT 1
-#include <net/pkt_act.h>
+static struct tcf_hashinfo pedit_hash_info = {
+	.htab	=	tcf_pedit_ht,
+	.hmask	=	PEDIT_TAB_MASK,
+	.lock	=	&pedit_lock,
+};
 
-static int
-tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
-               int ovr, int bind)
+static int tcf_pedit_init(struct rtattr *rta, struct rtattr *est,
+			  struct tc_action *a, int ovr, int bind)
 {
 	struct rtattr *tb[TCA_PEDIT_MAX];
 	struct tc_pedit *parm;
 	int ret = 0;
 	struct tcf_pedit *p;
+	struct tcf_common *pc;
 	struct tc_pedit_key *keys = NULL;
 	int ksize;
 
@@ -73,54 +66,56 @@ tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a,
 	if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize)
 		return -EINVAL;
 
-	p = tcf_hash_check(parm->index, a, ovr, bind);
-	if (p == NULL) {
+	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info);
+	if (!pc) {
 		if (!parm->nkeys)
 			return -EINVAL;
-		p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind);
-		if (p == NULL)
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
+				     &pedit_idx_gen, &pedit_hash_info);
+		if (unlikely(!pc))
 			return -ENOMEM;
+		p = to_pedit(pc);
 		keys = kmalloc(ksize, GFP_KERNEL);
 		if (keys == NULL) {
-			kfree(p);
+			kfree(pc);
 			return -ENOMEM;
 		}
 		ret = ACT_P_CREATED;
 	} else {
+		p = to_pedit(pc);
 		if (!ovr) {
-			tcf_hash_release(p, bind);
+			tcf_hash_release(pc, bind, &pedit_hash_info);
 			return -EEXIST;
 		}
-		if (p->nkeys && p->nkeys != parm->nkeys) {
+		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
 			if (keys == NULL)
 				return -ENOMEM;
 		}
 	}
 
-	spin_lock_bh(&p->lock);
-	p->flags = parm->flags;
-	p->action = parm->action;
+	spin_lock_bh(&p->tcf_lock);
+	p->tcfp_flags = parm->flags;
+	p->tcf_action = parm->action;
 	if (keys) {
-		kfree(p->keys);
-		p->keys = keys;
-		p->nkeys = parm->nkeys;
+		kfree(p->tcfp_keys);
+		p->tcfp_keys = keys;
+		p->tcfp_nkeys = parm->nkeys;
 	}
-	memcpy(p->keys, parm->keys, ksize);
-	spin_unlock_bh(&p->lock);
+	memcpy(p->tcfp_keys, parm->keys, ksize);
+	spin_unlock_bh(&p->tcf_lock);
 	if (ret == ACT_P_CREATED)
-		tcf_hash_insert(p);
+		tcf_hash_insert(pc, &pedit_hash_info);
 	return ret;
 }
 
-static int
-tcf_pedit_cleanup(struct tc_action *a, int bind)
+static int tcf_pedit_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_pedit *p = PRIV(a, pedit);
+	struct tcf_pedit *p = a->priv;
 
-	if (p != NULL) {
-		struct tc_pedit_key *keys = p->keys;
-		if (tcf_hash_release(p, bind)) {
+	if (p) {
+		struct tc_pedit_key *keys = p->tcfp_keys;
+		if (tcf_hash_release(&p->common, bind, &pedit_hash_info)) {
 			kfree(keys);
 			return 1;
 		}
@@ -128,30 +123,30 @@ tcf_pedit_cleanup(struct tc_action *a, int bind)
 	return 0;
 }
 
-static int
-tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
+static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
+		     struct tcf_result *res)
 {
-	struct tcf_pedit *p = PRIV(a, pedit);
+	struct tcf_pedit *p = a->priv;
 	int i, munged = 0;
 	u8 *pptr;
 
 	if (!(skb->tc_verd & TC_OK2MUNGE)) {
 		/* should we set skb->cloned? */
 		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
-			return p->action;
+			return p->tcf_action;
 		}
 	}
 
 	pptr = skb->nh.raw;
 
-	spin_lock(&p->lock);
+	spin_lock(&p->tcf_lock);
 
-	p->tm.lastuse = jiffies;
+	p->tcf_tm.lastuse = jiffies;
 
-	if (p->nkeys > 0) {
-		struct tc_pedit_key *tkey = p->keys;
+	if (p->tcfp_nkeys > 0) {
+		struct tc_pedit_key *tkey = p->tcfp_keys;
 
-		for (i = p->nkeys; i > 0; i--, tkey++) {
+		for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
 			u32 *ptr;
 			int offset = tkey->off;
 
@@ -169,7 +164,8 @@ tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 				printk("offset must be on 32 bit boundaries\n");
 				goto bad;
 			}
-			if (skb->len < 0 || (offset > 0 && offset > skb->len)) {
+			if (skb->len < 0 ||
+			    (offset > 0 && offset > skb->len)) {
 				printk("offset %d cant exceed pkt length %d\n",
 				       offset, skb->len);
 				goto bad;
@@ -185,63 +181,47 @@ tcf_pedit(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 			skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
 		goto done;
 	} else {
-		printk("pedit BUG: index %d\n",p->index);
+		printk("pedit BUG: index %d\n", p->tcf_index);
 	}
 
 bad:
-	p->qstats.overlimits++;
+	p->tcf_qstats.overlimits++;
 done:
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
-	spin_unlock(&p->lock);
-	return p->action;
+	p->tcf_bstats.bytes += skb->len;
+	p->tcf_bstats.packets++;
+	spin_unlock(&p->tcf_lock);
+	return p->tcf_action;
 }
 
-static int
-tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref)
+static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
+			  int bind, int ref)
 {
 	unsigned char *b = skb->tail;
+	struct tcf_pedit *p = a->priv;
 	struct tc_pedit *opt;
-	struct tcf_pedit *p = PRIV(a, pedit);
 	struct tcf_t t;
 	int s; 
 		
-	s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key);
+	s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
 
 	/* netlink spinlocks held above us - must use ATOMIC */
 	opt = kzalloc(s, GFP_ATOMIC);
-	if (opt == NULL)
+	if (unlikely(!opt))
 		return -ENOBUFS;
 
-	memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key));
-	opt->index = p->index;
-	opt->nkeys = p->nkeys;
-	opt->flags = p->flags;
-	opt->action = p->action;
-	opt->refcnt = p->refcnt - ref;
-	opt->bindcnt = p->bindcnt - bind;
-
-
-#ifdef PEDIT_DEB
-	{                
-		/* Debug - get rid of later */
-		int i;
-		struct tc_pedit_key *key = opt->keys;
-
-		for (i=0; i<opt->nkeys; i++, key++) {
-			printk( "\n key #%d",i);
-			printk( "  at %d: val %08x mask %08x",
-			(unsigned int)key->off,
-			(unsigned int)key->val,
-			(unsigned int)key->mask);
-		}
-	}
-#endif
+	memcpy(opt->keys, p->tcfp_keys,
+	       p->tcfp_nkeys * sizeof(struct tc_pedit_key));
+	opt->index = p->tcf_index;
+	opt->nkeys = p->tcfp_nkeys;
+	opt->flags = p->tcfp_flags;
+	opt->action = p->tcf_action;
+	opt->refcnt = p->tcf_refcnt - ref;
+	opt->bindcnt = p->tcf_bindcnt - bind;
 
 	RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
-	t.install = jiffies_to_clock_t(jiffies - p->tm.install);
-	t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse);
-	t.expires = jiffies_to_clock_t(p->tm.expires);
+	t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
 	RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
 	kfree(opt);
 	return skb->len;
@@ -252,9 +232,9 @@ rtattr_failure:
 	return -1;
 }
 
-static
-struct tc_action_ops act_pedit_ops = {
+static struct tc_action_ops act_pedit_ops = {
 	.kind		=	"pedit",
+	.hinfo		=	&pedit_hash_info,
 	.type		=	TCA_ACT_PEDIT,
 	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
@@ -270,14 +250,12 @@ MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
 MODULE_DESCRIPTION("Generic Packet Editor actions");
 MODULE_LICENSE("GPL");
 
-static int __init
-pedit_init_module(void)
+static int __init pedit_init_module(void)
 {
 	return tcf_register_action(&act_pedit_ops);
 }
 
-static void __exit
-pedit_cleanup_module(void)
+static void __exit pedit_cleanup_module(void)
 {
 	tcf_unregister_action(&act_pedit_ops);
 }
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index da905d7b4b4..fed47b65883 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -32,43 +32,27 @@
 #include <net/sock.h>
 #include <net/act_api.h>
 
-#define L2T(p,L)   ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log])
-#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log])
-#define PRIV(a) ((struct tcf_police *) (a)->priv)
-
-/* use generic hash table */
-#define MY_TAB_SIZE     16
-#define MY_TAB_MASK     15
-static u32 idx_gen;
-static struct tcf_police *tcf_police_ht[MY_TAB_SIZE];
-/* Policer hash table lock */
-static DEFINE_RWLOCK(police_lock);
-
-/* Each policer is serialized by its individual spinlock */
+#define L2T(p,L)   ((p)->tcfp_R_tab->data[(L)>>(p)->tcfp_R_tab->rate.cell_log])
+#define L2T_P(p,L) ((p)->tcfp_P_tab->data[(L)>>(p)->tcfp_P_tab->rate.cell_log])
 
-static __inline__ unsigned tcf_police_hash(u32 index)
-{
-	return index&0xF;
-}
+#define POL_TAB_MASK     15
+static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
+static u32 police_idx_gen;
+static DEFINE_RWLOCK(police_lock);
 
-static __inline__ struct tcf_police * tcf_police_lookup(u32 index)
-{
-	struct tcf_police *p;
+static struct tcf_hashinfo police_hash_info = {
+	.htab	=	tcf_police_ht,
+	.hmask	=	POL_TAB_MASK,
+	.lock	=	&police_lock,
+};
 
-	read_lock(&police_lock);
-	for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) {
-		if (p->index == index)
-			break;
-	}
-	read_unlock(&police_lock);
-	return p;
-}
+/* Each policer is serialized by its individual spinlock */
 
 #ifdef CONFIG_NET_CLS_ACT
 static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
                               int type, struct tc_action *a)
 {
-	struct tcf_police *p;
+	struct tcf_common *p;
 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
 	struct rtattr *r;
 
@@ -76,10 +60,10 @@ static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *c
 
 	s_i = cb->args[0];
 
-	for (i = 0; i < MY_TAB_SIZE; i++) {
-		p = tcf_police_ht[tcf_police_hash(i)];
+	for (i = 0; i < (POL_TAB_MASK + 1); i++) {
+		p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];
 
-		for (; p; p = p->next) {
+		for (; p; p = p->tcfc_next) {
 			index++;
 			if (index < s_i)
 				continue;
@@ -110,48 +94,26 @@ rtattr_failure:
 	skb_trim(skb, (u8*)r - skb->data);
 	goto done;
 }
-
-static inline int
-tcf_act_police_hash_search(struct tc_action *a, u32 index)
-{
-	struct tcf_police *p = tcf_police_lookup(index);
-
-	if (p != NULL) {
-		a->priv = p;
-		return 1;
-	} else {
-		return 0;
-	}
-}
 #endif
 
-static inline u32 tcf_police_new_index(void)
-{
-	do {
-		if (++idx_gen == 0)
-			idx_gen = 1;
-	} while (tcf_police_lookup(idx_gen));
-
-	return idx_gen;
-}
-
 void tcf_police_destroy(struct tcf_police *p)
 {
-	unsigned h = tcf_police_hash(p->index);
-	struct tcf_police **p1p;
+	unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
+	struct tcf_common **p1p;
 	
-	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) {
-		if (*p1p == p) {
+	for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
+		if (*p1p == &p->common) {
 			write_lock_bh(&police_lock);
-			*p1p = p->next;
+			*p1p = p->tcf_next;
 			write_unlock_bh(&police_lock);
 #ifdef CONFIG_NET_ESTIMATOR
-			gen_kill_estimator(&p->bstats, &p->rate_est);
+			gen_kill_estimator(&p->tcf_bstats,
+					   &p->tcf_rate_est);
 #endif
-			if (p->R_tab)
-				qdisc_put_rtab(p->R_tab);
-			if (p->P_tab)
-				qdisc_put_rtab(p->P_tab);
+			if (p->tcfp_R_tab)
+				qdisc_put_rtab(p->tcfp_R_tab);
+			if (p->tcfp_P_tab)
+				qdisc_put_rtab(p->tcfp_P_tab);
 			kfree(p);
 			return;
 		}
@@ -167,7 +129,7 @@ static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est,
 	int ret = 0, err;
 	struct rtattr *tb[TCA_POLICE_MAX];
 	struct tc_police *parm;
-	struct tcf_police *p;
+	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
 
 	if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0)
@@ -185,27 +147,32 @@ static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est,
 	    RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
 		return -EINVAL;
 
-	if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
-		a->priv = p;
-		if (bind) {
-			p->bindcnt += 1;
-			p->refcnt += 1;
+	if (parm->index) {
+		struct tcf_common *pc;
+
+		pc = tcf_hash_lookup(parm->index, &police_hash_info);
+		if (pc != NULL) {
+			a->priv = pc;
+			police = to_police(pc);
+			if (bind) {
+				police->tcf_bindcnt += 1;
+				police->tcf_refcnt += 1;
+			}
+			if (ovr)
+				goto override;
+			return ret;
 		}
-		if (ovr)
-			goto override;
-		return ret;
 	}
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
-	if (p == NULL)
+	police = kzalloc(sizeof(*police), GFP_KERNEL);
+	if (police == NULL)
 		return -ENOMEM;
-
 	ret = ACT_P_CREATED;
-	p->refcnt = 1;
-	spin_lock_init(&p->lock);
-	p->stats_lock = &p->lock;
+	police->tcf_refcnt = 1;
+	spin_lock_init(&police->tcf_lock);
+	police->tcf_stats_lock = &police->tcf_lock;
 	if (bind)
-		p->bindcnt = 1;
+		police->tcf_bindcnt = 1;
 override:
 	if (parm->rate.rate) {
 		err = -ENOMEM;
@@ -215,67 +182,71 @@ override:
 		if (parm->peakrate.rate) {
 			P_tab = qdisc_get_rtab(&parm->peakrate,
 					       tb[TCA_POLICE_PEAKRATE-1]);
-			if (p->P_tab == NULL) {
+			if (P_tab == NULL) {
 				qdisc_put_rtab(R_tab);
 				goto failure;
 			}
 		}
 	}
 	/* No failure allowed after this point */
-	spin_lock_bh(&p->lock);
+	spin_lock_bh(&police->tcf_lock);
 	if (R_tab != NULL) {
-		qdisc_put_rtab(p->R_tab);
-		p->R_tab = R_tab;
+		qdisc_put_rtab(police->tcfp_R_tab);
+		police->tcfp_R_tab = R_tab;
 	}
 	if (P_tab != NULL) {
-		qdisc_put_rtab(p->P_tab);
-		p->P_tab = P_tab;
+		qdisc_put_rtab(police->tcfp_P_tab);
+		police->tcfp_P_tab = P_tab;
 	}
 
 	if (tb[TCA_POLICE_RESULT-1])
-		p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
-	p->toks = p->burst = parm->burst;
-	p->mtu = parm->mtu;
-	if (p->mtu == 0) {
-		p->mtu = ~0;
-		if (p->R_tab)
-			p->mtu = 255<<p->R_tab->rate.cell_log;
+		police->tcfp_result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
+	police->tcfp_toks = police->tcfp_burst = parm->burst;
+	police->tcfp_mtu = parm->mtu;
+	if (police->tcfp_mtu == 0) {
+		police->tcfp_mtu = ~0;
+		if (police->tcfp_R_tab)
+			police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
 	}
-	if (p->P_tab)
-		p->ptoks = L2T_P(p, p->mtu);
-	p->action = parm->action;
+	if (police->tcfp_P_tab)
+		police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
+	police->tcf_action = parm->action;
 
 #ifdef CONFIG_NET_ESTIMATOR
 	if (tb[TCA_POLICE_AVRATE-1])
-		p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
+		police->tcfp_ewma_rate =
+			*(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
 	if (est)
-		gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
+		gen_replace_estimator(&police->tcf_bstats,
+				      &police->tcf_rate_est,
+				      police->tcf_stats_lock, est);
 #endif
 
-	spin_unlock_bh(&p->lock);
+	spin_unlock_bh(&police->tcf_lock);
 	if (ret != ACT_P_CREATED)
 		return ret;
 
-	PSCHED_GET_TIME(p->t_c);
-	p->index = parm->index ? : tcf_police_new_index();
-	h = tcf_police_hash(p->index);
+	PSCHED_GET_TIME(police->tcfp_t_c);
+	police->tcf_index = parm->index ? parm->index :
+		tcf_hash_new_index(&police_idx_gen, &police_hash_info);
+	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	write_lock_bh(&police_lock);
-	p->next = tcf_police_ht[h];
-	tcf_police_ht[h] = p;
+	police->tcf_next = tcf_police_ht[h];
+	tcf_police_ht[h] = &police->common;
 	write_unlock_bh(&police_lock);
 
-	a->priv = p;
+	a->priv = police;
 	return ret;
 
 failure:
 	if (ret == ACT_P_CREATED)
-		kfree(p);
+		kfree(police);
 	return err;
 }
 
 static int tcf_act_police_cleanup(struct tc_action *a, int bind)
 {
-	struct tcf_police *p = PRIV(a);
+	struct tcf_police *p = a->priv;
 
 	if (p != NULL)
 		return tcf_police_release(p, bind);
@@ -285,86 +256,87 @@ static int tcf_act_police_cleanup(struct tc_action *a, int bind)
 static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
                           struct tcf_result *res)
 {
+	struct tcf_police *police = a->priv;
 	psched_time_t now;
-	struct tcf_police *p = PRIV(a);
 	long toks;
 	long ptoks = 0;
 
-	spin_lock(&p->lock);
+	spin_lock(&police->tcf_lock);
 
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
+	police->tcf_bstats.bytes += skb->len;
+	police->tcf_bstats.packets++;
 
 #ifdef CONFIG_NET_ESTIMATOR
-	if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
-		p->qstats.overlimits++;
-		spin_unlock(&p->lock);
-		return p->action;
+	if (police->tcfp_ewma_rate &&
+	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+		police->tcf_qstats.overlimits++;
+		spin_unlock(&police->tcf_lock);
+		return police->tcf_action;
 	}
 #endif
 
-	if (skb->len <= p->mtu) {
-		if (p->R_tab == NULL) {
-			spin_unlock(&p->lock);
-			return p->result;
+	if (skb->len <= police->tcfp_mtu) {
+		if (police->tcfp_R_tab == NULL) {
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
 		}
 
 		PSCHED_GET_TIME(now);
 
-		toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
-
-		if (p->P_tab) {
-			ptoks = toks + p->ptoks;
-			if (ptoks > (long)L2T_P(p, p->mtu))
-				ptoks = (long)L2T_P(p, p->mtu);
-			ptoks -= L2T_P(p, skb->len);
+		toks = PSCHED_TDIFF_SAFE(now, police->tcfp_t_c,
+					 police->tcfp_burst);
+		if (police->tcfp_P_tab) {
+			ptoks = toks + police->tcfp_ptoks;
+			if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
+				ptoks = (long)L2T_P(police, police->tcfp_mtu);
+			ptoks -= L2T_P(police, skb->len);
 		}
-		toks += p->toks;
-		if (toks > (long)p->burst)
-			toks = p->burst;
-		toks -= L2T(p, skb->len);
-
+		toks += police->tcfp_toks;
+		if (toks > (long)police->tcfp_burst)
+			toks = police->tcfp_burst;
+		toks -= L2T(police, skb->len);
 		if ((toks|ptoks) >= 0) {
-			p->t_c = now;
-			p->toks = toks;
-			p->ptoks = ptoks;
-			spin_unlock(&p->lock);
-			return p->result;
+			police->tcfp_t_c = now;
+			police->tcfp_toks = toks;
+			police->tcfp_ptoks = ptoks;
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
 		}
 	}
 
-	p->qstats.overlimits++;
-	spin_unlock(&p->lock);
-	return p->action;
+	police->tcf_qstats.overlimits++;
+	spin_unlock(&police->tcf_lock);
+	return police->tcf_action;
 }
 
 static int
 tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 {
 	unsigned char	 *b = skb->tail;
+	struct tcf_police *police = a->priv;
 	struct tc_police opt;
-	struct tcf_police *p = PRIV(a);
-
-	opt.index = p->index;
-	opt.action = p->action;
-	opt.mtu = p->mtu;
-	opt.burst = p->burst;
-	opt.refcnt = p->refcnt - ref;
-	opt.bindcnt = p->bindcnt - bind;
-	if (p->R_tab)
-		opt.rate = p->R_tab->rate;
+
+	opt.index = police->tcf_index;
+	opt.action = police->tcf_action;
+	opt.mtu = police->tcfp_mtu;
+	opt.burst = police->tcfp_burst;
+	opt.refcnt = police->tcf_refcnt - ref;
+	opt.bindcnt = police->tcf_bindcnt - bind;
+	if (police->tcfp_R_tab)
+		opt.rate = police->tcfp_R_tab->rate;
 	else
 		memset(&opt.rate, 0, sizeof(opt.rate));
-	if (p->P_tab)
-		opt.peakrate = p->P_tab->rate;
+	if (police->tcfp_P_tab)
+		opt.peakrate = police->tcfp_P_tab->rate;
 	else
 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 	RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
-	if (p->result)
-		RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
+	if (police->tcfp_result)
+		RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int),
+			&police->tcfp_result);
 #ifdef CONFIG_NET_ESTIMATOR
-	if (p->ewma_rate)
-		RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
+	if (police->tcfp_ewma_rate)
+		RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &police->tcfp_ewma_rate);
 #endif
 	return skb->len;
 
@@ -379,13 +351,14 @@ MODULE_LICENSE("GPL");
 
 static struct tc_action_ops act_police_ops = {
 	.kind		=	"police",
+	.hinfo		=	&police_hash_info,
 	.type		=	TCA_ID_POLICE,
 	.capab		=	TCA_CAP_NONE,
 	.owner		=	THIS_MODULE,
 	.act		=	tcf_act_police,
 	.dump		=	tcf_act_police_dump,
 	.cleanup	=	tcf_act_police_cleanup,
-	.lookup		=	tcf_act_police_hash_search,
+	.lookup		=	tcf_hash_search,
 	.init		=	tcf_act_police_locate,
 	.walk		=	tcf_act_police_walker
 };
@@ -407,10 +380,39 @@ module_exit(police_cleanup_module);
 
 #else /* CONFIG_NET_CLS_ACT */
 
-struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
+static struct tcf_common *tcf_police_lookup(u32 index)
 {
-	unsigned h;
-	struct tcf_police *p;
+	struct tcf_hashinfo *hinfo = &police_hash_info;
+	struct tcf_common *p;
+
+	read_lock(hinfo->lock);
+	for (p = hinfo->htab[tcf_hash(index, hinfo->hmask)]; p;
+	     p = p->tcfc_next) {
+		if (p->tcfc_index == index)
+			break;
+	}
+	read_unlock(hinfo->lock);
+
+	return p;
+}
+
+static u32 tcf_police_new_index(void)
+{
+	u32 *idx_gen = &police_idx_gen;
+	u32 val = *idx_gen;
+
+	do {
+		if (++val == 0)
+			val = 1;
+	} while (tcf_police_lookup(val));
+
+	return (*idx_gen = val);
+}
+
+struct tcf_police *tcf_police_locate(struct rtattr *rta, struct rtattr *est)
+{
+	unsigned int h;
+	struct tcf_police *police;
 	struct rtattr *tb[TCA_POLICE_MAX];
 	struct tc_police *parm;
 
@@ -423,149 +425,158 @@ struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est)
 
 	parm = RTA_DATA(tb[TCA_POLICE_TBF-1]);
 
-	if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) {
-		p->refcnt++;
-		return p;
-	}
+	if (parm->index) {
+		struct tcf_common *pc;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
-	if (p == NULL)
+		pc = tcf_police_lookup(parm->index);
+		if (pc) {
+			police = to_police(pc);
+			police->tcf_refcnt++;
+			return police;
+		}
+	}
+	police = kzalloc(sizeof(*police), GFP_KERNEL);
+	if (unlikely(!police))
 		return NULL;
 
-	p->refcnt = 1;
-	spin_lock_init(&p->lock);
-	p->stats_lock = &p->lock;
+	police->tcf_refcnt = 1;
+	spin_lock_init(&police->tcf_lock);
+	police->tcf_stats_lock = &police->tcf_lock;
 	if (parm->rate.rate) {
-		p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
-		if (p->R_tab == NULL)
+		police->tcfp_R_tab =
+			qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]);
+		if (police->tcfp_R_tab == NULL)
 			goto failure;
 		if (parm->peakrate.rate) {
-			p->P_tab = qdisc_get_rtab(&parm->peakrate,
-			                          tb[TCA_POLICE_PEAKRATE-1]);
-			if (p->P_tab == NULL)
+			police->tcfp_P_tab =
+				qdisc_get_rtab(&parm->peakrate,
+					       tb[TCA_POLICE_PEAKRATE-1]);
+			if (police->tcfp_P_tab == NULL)
 				goto failure;
 		}
 	}
 	if (tb[TCA_POLICE_RESULT-1]) {
 		if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32))
 			goto failure;
-		p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
+		police->tcfp_result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]);
 	}
 #ifdef CONFIG_NET_ESTIMATOR
 	if (tb[TCA_POLICE_AVRATE-1]) {
 		if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32))
 			goto failure;
-		p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
+		police->tcfp_ewma_rate =
+			*(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]);
 	}
 #endif
-	p->toks = p->burst = parm->burst;
-	p->mtu = parm->mtu;
-	if (p->mtu == 0) {
-		p->mtu = ~0;
-		if (p->R_tab)
-			p->mtu = 255<<p->R_tab->rate.cell_log;
+	police->tcfp_toks = police->tcfp_burst = parm->burst;
+	police->tcfp_mtu = parm->mtu;
+	if (police->tcfp_mtu == 0) {
+		police->tcfp_mtu = ~0;
+		if (police->tcfp_R_tab)
+			police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
 	}
-	if (p->P_tab)
-		p->ptoks = L2T_P(p, p->mtu);
-	PSCHED_GET_TIME(p->t_c);
-	p->index = parm->index ? : tcf_police_new_index();
-	p->action = parm->action;
+	if (police->tcfp_P_tab)
+		police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
+	PSCHED_GET_TIME(police->tcfp_t_c);
+	police->tcf_index = parm->index ? parm->index :
+		tcf_police_new_index();
+	police->tcf_action = parm->action;
 #ifdef CONFIG_NET_ESTIMATOR
 	if (est)
-		gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est);
+		gen_new_estimator(&police->tcf_bstats, &police->tcf_rate_est,
+				  police->tcf_stats_lock, est);
 #endif
-	h = tcf_police_hash(p->index);
+	h = tcf_hash(police->tcf_index, POL_TAB_MASK);
 	write_lock_bh(&police_lock);
-	p->next = tcf_police_ht[h];
-	tcf_police_ht[h] = p;
+	police->tcf_next = tcf_police_ht[h];
+	tcf_police_ht[h] = &police->common;
 	write_unlock_bh(&police_lock);
-	return p;
+	return police;
 
 failure:
-	if (p->R_tab)
-		qdisc_put_rtab(p->R_tab);
-	kfree(p);
+	if (police->tcfp_R_tab)
+		qdisc_put_rtab(police->tcfp_R_tab);
+	kfree(police);
 	return NULL;
 }
 
-int tcf_police(struct sk_buff *skb, struct tcf_police *p)
+int tcf_police(struct sk_buff *skb, struct tcf_police *police)
 {
 	psched_time_t now;
 	long toks;
 	long ptoks = 0;
 
-	spin_lock(&p->lock);
+	spin_lock(&police->tcf_lock);
 
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
+	police->tcf_bstats.bytes += skb->len;
+	police->tcf_bstats.packets++;
 
 #ifdef CONFIG_NET_ESTIMATOR
-	if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) {
-		p->qstats.overlimits++;
-		spin_unlock(&p->lock);
-		return p->action;
+	if (police->tcfp_ewma_rate &&
+	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
+		police->tcf_qstats.overlimits++;
+		spin_unlock(&police->tcf_lock);
+		return police->tcf_action;
 	}
 #endif
-
-	if (skb->len <= p->mtu) {
-		if (p->R_tab == NULL) {
-			spin_unlock(&p->lock);
-			return p->result;
+	if (skb->len <= police->tcfp_mtu) {
+		if (police->tcfp_R_tab == NULL) {
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
 		}
 
 		PSCHED_GET_TIME(now);
-
-		toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst);
-
-		if (p->P_tab) {
-			ptoks = toks + p->ptoks;
-			if (ptoks > (long)L2T_P(p, p->mtu))
-				ptoks = (long)L2T_P(p, p->mtu);
-			ptoks -= L2T_P(p, skb->len);
+		toks = PSCHED_TDIFF_SAFE(now, police->tcfp_t_c,
+					 police->tcfp_burst);
+		if (police->tcfp_P_tab) {
+			ptoks = toks + police->tcfp_ptoks;
+			if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
+				ptoks = (long)L2T_P(police, police->tcfp_mtu);
+			ptoks -= L2T_P(police, skb->len);
 		}
-		toks += p->toks;
-		if (toks > (long)p->burst)
-			toks = p->burst;
-		toks -= L2T(p, skb->len);
-
+		toks += police->tcfp_toks;
+		if (toks > (long)police->tcfp_burst)
+			toks = police->tcfp_burst;
+		toks -= L2T(police, skb->len);
 		if ((toks|ptoks) >= 0) {
-			p->t_c = now;
-			p->toks = toks;
-			p->ptoks = ptoks;
-			spin_unlock(&p->lock);
-			return p->result;
+			police->tcfp_t_c = now;
+			police->tcfp_toks = toks;
+			police->tcfp_ptoks = ptoks;
+			spin_unlock(&police->tcf_lock);
+			return police->tcfp_result;
 		}
 	}
 
-	p->qstats.overlimits++;
-	spin_unlock(&p->lock);
-	return p->action;
+	police->tcf_qstats.overlimits++;
+	spin_unlock(&police->tcf_lock);
+	return police->tcf_action;
 }
 EXPORT_SYMBOL(tcf_police);
 
-int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p)
+int tcf_police_dump(struct sk_buff *skb, struct tcf_police *police)
 {
-	unsigned char	 *b = skb->tail;
+	unsigned char *b = skb->tail;
 	struct tc_police opt;
 
-	opt.index = p->index;
-	opt.action = p->action;
-	opt.mtu = p->mtu;
-	opt.burst = p->burst;
-	if (p->R_tab)
-		opt.rate = p->R_tab->rate;
+	opt.index = police->tcf_index;
+	opt.action = police->tcf_action;
+	opt.mtu = police->tcfp_mtu;
+	opt.burst = police->tcfp_burst;
+	if (police->tcfp_R_tab)
+		opt.rate = police->tcfp_R_tab->rate;
 	else
 		memset(&opt.rate, 0, sizeof(opt.rate));
-	if (p->P_tab)
-		opt.peakrate = p->P_tab->rate;
+	if (police->tcfp_P_tab)
+		opt.peakrate = police->tcfp_P_tab->rate;
 	else
 		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
 	RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
-	if (p->result)
-		RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result);
+	if (police->tcfp_result)
+		RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int),
+			&police->tcfp_result);
 #ifdef CONFIG_NET_ESTIMATOR
-	if (p->ewma_rate)
-		RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate);
+	if (police->tcfp_ewma_rate)
+		RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &police->tcfp_ewma_rate);
 #endif
 	return skb->len;
 
@@ -574,19 +585,20 @@ rtattr_failure:
 	return -1;
 }
 
-int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p)
+int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *police)
 {
 	struct gnet_dump d;
 	
 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
-			TCA_XSTATS, p->stats_lock, &d) < 0)
+					 TCA_XSTATS, police->tcf_stats_lock,
+					 &d) < 0)
 		goto errout;
 	
-	if (gnet_stats_copy_basic(&d, &p->bstats) < 0 ||
+	if (gnet_stats_copy_basic(&d, &police->tcf_bstats) < 0 ||
 #ifdef CONFIG_NET_ESTIMATOR
-	    gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 ||
+	    gnet_stats_copy_rate_est(&d, &police->tcf_rate_est) < 0 ||
 #endif
-	    gnet_stats_copy_queue(&d, &p->qstats) < 0)
+	    gnet_stats_copy_queue(&d, &police->tcf_qstats) < 0)
 		goto errout;
 
 	if (gnet_stats_finish_copy(&d) < 0)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 17105c82537..8c1ab8ad8fa 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -20,54 +20,175 @@
 
 #define TCA_ACT_SIMP 22
 
-/* XXX: Hide all these common elements under some macro 
- * probably
-*/
 #include <linux/tc_act/tc_defact.h>
 #include <net/tc_act/tc_defact.h>
 
-/* use generic hash table with 8 buckets */
-#define MY_TAB_SIZE     8
-#define MY_TAB_MASK     (MY_TAB_SIZE - 1)
-static u32 idx_gen;
-static struct tcf_defact *tcf_simp_ht[MY_TAB_SIZE];
+#define SIMP_TAB_MASK     7
+static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1];
+static u32 simp_idx_gen;
 static DEFINE_RWLOCK(simp_lock);
 
-/* override the defaults */
-#define tcf_st		tcf_defact
-#define tc_st		tc_defact
-#define tcf_t_lock	simp_lock
-#define tcf_ht		tcf_simp_ht
-
-#define CONFIG_NET_ACT_INIT 1
-#include <net/pkt_act.h>
-#include <net/act_generic.h>
+struct tcf_hashinfo simp_hash_info = {
+	.htab	=	tcf_simp_ht,
+	.hmask	=	SIMP_TAB_MASK,
+	.lock	=	&simp_lock,
+};
 
 static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
 {
-	struct tcf_defact *p = PRIV(a, defact);
+	struct tcf_defact *d = a->priv;
 
-	spin_lock(&p->lock);
-	p->tm.lastuse = jiffies;
-	p->bstats.bytes += skb->len;
-	p->bstats.packets++;
+	spin_lock(&d->tcf_lock);
+	d->tcf_tm.lastuse = jiffies;
+	d->tcf_bstats.bytes += skb->len;
+	d->tcf_bstats.packets++;
 
 	/* print policy string followed by _ then packet count 
 	 * Example if this was the 3rd packet and the string was "hello" 
 	 * then it would look like "hello_3" (without quotes) 
 	 **/
-	printk("simple: %s_%d\n", (char *)p->defdata, p->bstats.packets);
-	spin_unlock(&p->lock);
-	return p->action;
+	printk("simple: %s_%d\n",
+	       (char *)d->tcfd_defdata, d->tcf_bstats.packets);
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+static int tcf_simp_release(struct tcf_defact *d, int bind)
+{
+	int ret = 0;
+	if (d) {
+		if (bind)
+			d->tcf_bindcnt--;
+		d->tcf_refcnt--;
+		if (d->tcf_bindcnt <= 0 && d->tcf_refcnt <= 0) {
+			kfree(d->tcfd_defdata);
+			tcf_hash_destroy(&d->common, &simp_hash_info);
+			ret = 1;
+		}
+	}
+	return ret;
+}
+
+static int alloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata)
+{
+	d->tcfd_defdata = kmalloc(datalen, GFP_KERNEL);
+	if (unlikely(!d->tcfd_defdata))
+		return -ENOMEM;
+	d->tcfd_datalen = datalen;
+	memcpy(d->tcfd_defdata, defdata, datalen);
+	return 0;
+}
+
+static int realloc_defdata(struct tcf_defact *d, u32 datalen, void *defdata)
+{
+	kfree(d->tcfd_defdata);
+	return alloc_defdata(d, datalen, defdata);
+}
+
+static int tcf_simp_init(struct rtattr *rta, struct rtattr *est,
+			 struct tc_action *a, int ovr, int bind)
+{
+	struct rtattr *tb[TCA_DEF_MAX];
+	struct tc_defact *parm;
+	struct tcf_defact *d;
+	struct tcf_common *pc;
+	void *defdata;
+	u32 datalen = 0;
+	int ret = 0;
+
+	if (rta == NULL || rtattr_parse_nested(tb, TCA_DEF_MAX, rta) < 0)
+		return -EINVAL;
+
+	if (tb[TCA_DEF_PARMS - 1] == NULL ||
+	    RTA_PAYLOAD(tb[TCA_DEF_PARMS - 1]) < sizeof(*parm))
+		return -EINVAL;
+
+	parm = RTA_DATA(tb[TCA_DEF_PARMS - 1]);
+	defdata = RTA_DATA(tb[TCA_DEF_DATA - 1]);
+	if (defdata == NULL)
+		return -EINVAL;
+
+	datalen = RTA_PAYLOAD(tb[TCA_DEF_DATA - 1]);
+	if (datalen <= 0)
+		return -EINVAL;
+
+	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
+	if (!pc) {
+		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
+				     &simp_idx_gen, &simp_hash_info);
+		if (unlikely(!pc))
+			return -ENOMEM;
+
+		d = to_defact(pc);
+		ret = alloc_defdata(d, datalen, defdata);
+		if (ret < 0) {
+			kfree(pc);
+			return ret;
+		}
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_defact(pc);
+		if (!ovr) {
+			tcf_simp_release(d, bind);
+			return -EEXIST;
+		}
+		realloc_defdata(d, datalen, defdata);
+	}
+
+	spin_lock_bh(&d->tcf_lock);
+	d->tcf_action = parm->action;
+	spin_unlock_bh(&d->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(pc, &simp_hash_info);
+	return ret;
+}
+
+static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_defact *d = a->priv;
+
+	if (d)
+		return tcf_simp_release(d, bind);
+	return 0;
+}
+
+static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
+				int bind, int ref)
+{
+	unsigned char *b = skb->tail;
+	struct tcf_defact *d = a->priv;
+	struct tc_defact opt;
+	struct tcf_t t;
+
+	opt.index = d->tcf_index;
+	opt.refcnt = d->tcf_refcnt - ref;
+	opt.bindcnt = d->tcf_bindcnt - bind;
+	opt.action = d->tcf_action;
+	RTA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
+	RTA_PUT(skb, TCA_DEF_DATA, d->tcfd_datalen, d->tcfd_defdata);
+	t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
+	RTA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
+	return skb->len;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
 }
 
 static struct tc_action_ops act_simp_ops = {
-	.kind = "simple",
-	.type = TCA_ACT_SIMP,
-	.capab = TCA_CAP_NONE,
-	.owner = THIS_MODULE,
-	.act = tcf_simp,
-	tca_use_default_ops
+	.kind		=	"simple",
+	.hinfo		=	&simp_hash_info,
+	.type		=	TCA_ACT_SIMP,
+	.capab		=	TCA_CAP_NONE,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_simp,
+	.dump		=	tcf_simp_dump,
+	.cleanup	=	tcf_simp_cleanup,
+	.init		=	tcf_simp_init,
+	.walk		=	tcf_generic_walker,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
-- 
cgit v1.2.3-70-g09d2


From e0a1ad73d34fd6dfdb630479400511e9879069c0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 22 Aug 2006 00:00:21 -0700
Subject: [IPv6] route: Simplify ip6_del_rt()

Provide a simple ip6_del_rt() for the majority of users and
an alternative for the exception via netlink. Avoids code
obfuscation.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  5 +----
 net/ipv6/addrconf.c     |  6 +++---
 net/ipv6/ndisc.c        |  4 ++--
 net/ipv6/route.c        | 18 ++++++++++++------
 4 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 01bfe404784..a7e6086a2bd 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -69,10 +69,7 @@ extern int			ip6_ins_rt(struct rt6_info *,
 					   struct nlmsghdr *,
 					   void *rtattr,
 					   struct netlink_skb_parms *req);
-extern int			ip6_del_rt(struct rt6_info *,
-					   struct nlmsghdr *,
-					   void *rtattr,
-					   struct netlink_skb_parms *req);
+extern int			ip6_del_rt(struct rt6_info *);
 
 extern int			ip6_rt_addr_add(struct in6_addr *addr,
 						struct net_device *dev,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f1ede900488..27f2e330959 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -736,7 +736,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 		if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 			if (onlink == 0) {
-				ip6_del_rt(rt, NULL, NULL, NULL);
+				ip6_del_rt(rt);
 				rt = NULL;
 			} else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
 				rt->rt6i_expires = expires;
@@ -1662,7 +1662,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 		if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 			if (rt->rt6i_flags&RTF_EXPIRES) {
 				if (valid_lft == 0) {
-					ip6_del_rt(rt, NULL, NULL, NULL);
+					ip6_del_rt(rt);
 					rt = NULL;
 				} else {
 					rt->rt6i_expires = jiffies + rt_expires;
@@ -3557,7 +3557,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
+		if (ip6_del_rt(ifp->rt))
 			dst_free(&ifp->rt->u.dst);
 		break;
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 5743e8bffef..419d6516381 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -961,7 +961,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 			struct rt6_info *rt;
 			rt = rt6_get_dflt_router(saddr, dev);
 			if (rt)
-				ip6_del_rt(rt, NULL, NULL, NULL);
+				ip6_del_rt(rt);
 		}
 
 out:
@@ -1114,7 +1114,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	if (rt && lifetime == 0) {
 		neigh_clone(neigh);
-		ip6_del_rt(rt, NULL, NULL, NULL);
+		ip6_del_rt(rt);
 		rt = NULL;
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1aca787ead8..8d511de0db1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -457,7 +457,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
 
 	if (rt && !lifetime) {
-		ip6_del_rt(rt, NULL, NULL, NULL);
+		ip6_del_rt(rt);
 		rt = NULL;
 	}
 
@@ -813,7 +813,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE)
-			ip6_del_rt(rt, NULL, NULL, NULL);
+			ip6_del_rt(rt);
 		else
 			dst_release(dst);
 	}
@@ -1218,7 +1218,8 @@ out:
 	return err;
 }
 
-int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
+static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
+			void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct fib6_table *table;
@@ -1237,6 +1238,11 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct
 	return err;
 }
 
+int ip6_del_rt(struct rt6_info *rt)
+{
+	return __ip6_del_rt(rt, NULL, NULL, NULL);
+}
+
 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 			 void *_rtattr, struct netlink_skb_parms *req,
 			 u32 table_id)
@@ -1271,7 +1277,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 			dst_hold(&rt->u.dst);
 			read_unlock_bh(&table->tb6_lock);
 
-			return ip6_del_rt(rt, nlh, _rtattr, req);
+			return __ip6_del_rt(rt, nlh, _rtattr, req);
 		}
 	}
 	read_unlock_bh(&table->tb6_lock);
@@ -1395,7 +1401,7 @@ restart:
 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
 	if (rt->rt6i_flags&RTF_CACHE) {
-		ip6_del_rt(rt, NULL, NULL, NULL);
+		ip6_del_rt(rt);
 		return;
 	}
 
@@ -1631,7 +1637,7 @@ restart:
 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
 			dst_hold(&rt->u.dst);
 			read_unlock_bh(&table->tb6_lock);
-			ip6_del_rt(rt, NULL, NULL, NULL);
+			ip6_del_rt(rt);
 			goto restart;
 		}
 	}
-- 
cgit v1.2.3-70-g09d2


From 40e22e8f3d4d4f1ff68fb03683f007c53ee8b348 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 22 Aug 2006 00:00:45 -0700
Subject: [IPv6] route: Simplify ip6_ins_rt()

Provide a simple ip6_ins_rt() for the majority of users and
an alternative for the exception via netlink. Avoids code
obfuscation.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  5 +----
 net/ipv6/addrconf.c     |  2 +-
 net/ipv6/anycast.c      |  2 +-
 net/ipv6/route.c        | 19 ++++++++++++-------
 4 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index a7e6086a2bd..172c4761e2b 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -65,10 +65,7 @@ extern int			ip6_route_add(struct in6_rtmsg *rtmsg,
 					      void *rtattr,
 					      struct netlink_skb_parms *req,
 					      u32 table_id);
-extern int			ip6_ins_rt(struct rt6_info *,
-					   struct nlmsghdr *,
-					   void *rtattr,
-					   struct netlink_skb_parms *req);
+extern int			ip6_ins_rt(struct rt6_info *);
 extern int			ip6_del_rt(struct rt6_info *);
 
 extern int			ip6_rt_addr_add(struct in6_addr *addr,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27f2e330959..aafba9ea9cb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3548,7 +3548,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 
 	switch (event) {
 	case RTM_NEWADDR:
-		ip6_ins_rt(ifp->rt, NULL, NULL, NULL);
+		ip6_ins_rt(ifp->rt);
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
 		break;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index f6881d7a038..abbc35a13e0 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -335,7 +335,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 
 	dst_hold(&rt->u.dst);
-	if (ip6_ins_rt(rt, NULL, NULL, NULL))
+	if (ip6_ins_rt(rt))
 		dst_release(&rt->u.dst);
 
 	addrconf_join_solict(dev, &aca->aca_addr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8d511de0db1..9ec348a72a9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -546,8 +546,8 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
    be destroyed.
  */
 
-int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
-		void *_rtattr, struct netlink_skb_parms *req)
+static int __ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
+			void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct fib6_table *table;
@@ -560,6 +560,11 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
 	return err;
 }
 
+int ip6_ins_rt(struct rt6_info *rt)
+{
+	return __ip6_ins_rt(rt, NULL, NULL, NULL);
+}
+
 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
 				      struct in6_addr *saddr)
 {
@@ -657,7 +662,7 @@ restart:
 
 	dst_hold(&rt->u.dst);
 	if (nrt) {
-		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
+		err = ip6_ins_rt(nrt);
 		if (!err)
 			goto out2;
 	}
@@ -752,7 +757,7 @@ restart:
 
 	dst_hold(&rt->u.dst);
 	if (nrt) {
-		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
+		err = ip6_ins_rt(nrt);
 		if (!err)
 			goto out2;
 	}
@@ -1206,7 +1211,7 @@ install_route:
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
-	return ip6_ins_rt(rt, nlh, _rtattr, req);
+	return __ip6_ins_rt(rt, nlh, _rtattr, req);
 
 out:
 	if (dev)
@@ -1393,7 +1398,7 @@ restart:
 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
 
-	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
+	if (ip6_ins_rt(nrt))
 		goto out;
 
 	netevent.old = &rt->u.dst;
@@ -1483,7 +1488,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
 
-		ip6_ins_rt(nrt, NULL, NULL, NULL);
+		ip6_ins_rt(nrt);
 	}
 out:
 	dst_release(&rt->u.dst);
-- 
cgit v1.2.3-70-g09d2


From 86872cb57925c46a6499887d77afb880a892c0ec Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 22 Aug 2006 00:01:08 -0700
Subject: [IPv6] route: FIB6 configuration using struct fib6_config

Replaces the struct in6_rtmsg based interface orignating from
the ioctl interface with a struct fib6_config based on. Allows
changing the interface without breaking the ioctl interface
and avoids passing on tons of parameters.

The recently introduced struct nl_info is used to pass on
netlink authorship information for notifications.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |  38 ++++--
 include/net/ip6_route.h |   6 +-
 net/ipv6/addrconf.c     |  65 +++++-----
 net/ipv6/ip6_fib.c      |  19 ++-
 net/ipv6/route.c        | 331 ++++++++++++++++++++++++++++--------------------
 5 files changed, 259 insertions(+), 200 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 69c44420978..9610b887ffb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -16,14 +16,35 @@
 #ifdef __KERNEL__
 
 #include <linux/ipv6_route.h>
-
-#include <net/dst.h>
-#include <net/flow.h>
 #include <linux/rtnetlink.h>
 #include <linux/spinlock.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/netlink.h>
 
 struct rt6_info;
 
+struct fib6_config
+{
+	u32		fc_table;
+	u32		fc_metric;
+	int		fc_dst_len;
+	int		fc_src_len;
+	int		fc_ifindex;
+	u32		fc_flags;
+	u32		fc_protocol;
+
+	struct in6_addr	fc_dst;
+	struct in6_addr	fc_src;
+	struct in6_addr	fc_gateway;
+
+	unsigned long	fc_expires;
+	struct nlattr	*fc_mx;
+	int		fc_mx_len;
+
+	struct nl_info	fc_nlinfo;
+};
+
 struct fib6_node
 {
 	struct fib6_node	*parent;
@@ -175,18 +196,13 @@ extern void			fib6_clean_all(int (*func)(struct rt6_info *, void *arg),
 
 extern int			fib6_add(struct fib6_node *root,
 					 struct rt6_info *rt,
-					 struct nlmsghdr *nlh,
-					 void *rtattr,
-					 struct netlink_skb_parms *req);
+					 struct nl_info *info);
 
 extern int			fib6_del(struct rt6_info *rt,
-					 struct nlmsghdr *nlh,
-					 void *rtattr,
-					 struct netlink_skb_parms *req);
+					 struct nl_info *info);
 
 extern void			inet6_rt_notify(int event, struct rt6_info *rt,
-						struct nlmsghdr *nlh,
-						struct netlink_skb_parms *req);
+						struct nl_info *info);
 
 extern void			fib6_run_gc(unsigned long dummy);
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 172c4761e2b..3f170f667c7 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -60,11 +60,7 @@ extern void			ip6_route_cleanup(void);
 
 extern int			ipv6_route_ioctl(unsigned int cmd, void __user *arg);
 
-extern int			ip6_route_add(struct in6_rtmsg *rtmsg,
-					      struct nlmsghdr *,
-					      void *rtattr,
-					      struct netlink_skb_parms *req,
-					      u32 table_id);
+extern int			ip6_route_add(struct fib6_config *cfg);
 extern int			ip6_ins_rt(struct rt6_info *);
 extern int			ip6_del_rt(struct rt6_info *);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index aafba9ea9cb..fc9cff3426c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1509,59 +1509,56 @@ static void
 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 		      unsigned long expires, u32 flags)
 {
-	struct in6_rtmsg rtmsg;
+	struct fib6_config cfg = {
+		.fc_table = RT6_TABLE_PREFIX,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_expires = expires,
+		.fc_dst_len = plen,
+		.fc_flags = RTF_UP | flags,
+	};
 
-	memset(&rtmsg, 0, sizeof(rtmsg));
-	ipv6_addr_copy(&rtmsg.rtmsg_dst, pfx);
-	rtmsg.rtmsg_dst_len = plen;
-	rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF;
-	rtmsg.rtmsg_ifindex = dev->ifindex;
-	rtmsg.rtmsg_info = expires;
-	rtmsg.rtmsg_flags = RTF_UP|flags;
-	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
+	ipv6_addr_copy(&cfg.fc_dst, pfx);
 
 	/* Prevent useless cloning on PtP SIT.
 	   This thing is done here expecting that the whole
 	   class of non-broadcast devices need not cloning.
 	 */
-	if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
-		rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
+	if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT))
+		cfg.fc_flags |= RTF_NONEXTHOP;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_PREFIX);
+	ip6_route_add(&cfg);
 }
 
 /* Create "default" multicast route to the interface */
 
 static void addrconf_add_mroute(struct net_device *dev)
 {
-	struct in6_rtmsg rtmsg;
-
-	memset(&rtmsg, 0, sizeof(rtmsg));
-	ipv6_addr_set(&rtmsg.rtmsg_dst,
-		      htonl(0xFF000000), 0, 0, 0);
-	rtmsg.rtmsg_dst_len = 8;
-	rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF;
-	rtmsg.rtmsg_ifindex = dev->ifindex;
-	rtmsg.rtmsg_flags = RTF_UP;
-	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_LOCAL);
+	struct fib6_config cfg = {
+		.fc_table = RT6_TABLE_LOCAL,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_dst_len = 8,
+		.fc_flags = RTF_UP,
+	};
+
+	ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
+
+	ip6_route_add(&cfg);
 }
 
 static void sit_route_add(struct net_device *dev)
 {
-	struct in6_rtmsg rtmsg;
-
-	memset(&rtmsg, 0, sizeof(rtmsg));
-
-	rtmsg.rtmsg_type	= RTMSG_NEWROUTE;
-	rtmsg.rtmsg_metric	= IP6_RT_PRIO_ADDRCONF;
+	struct fib6_config cfg = {
+		.fc_table = RT6_TABLE_MAIN,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_dst_len = 96,
+		.fc_flags = RTF_UP | RTF_NONEXTHOP,
+	};
 
 	/* prefix length - 96 bits "::d.d.d.d" */
-	rtmsg.rtmsg_dst_len	= 96;
-	rtmsg.rtmsg_flags	= RTF_UP|RTF_NONEXTHOP;
-	rtmsg.rtmsg_ifindex	= dev->ifindex;
-
-	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_MAIN);
+	ip6_route_add(&cfg);
 }
 
 static void addrconf_add_lroute(struct net_device *dev)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index be36f4acda9..667b1b1ea25 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -610,7 +610,7 @@ insert_above:
  */
 
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-		struct nlmsghdr *nlh,  struct netlink_skb_parms *req)
+			    struct nl_info *info)
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
@@ -665,7 +665,7 @@ out:
 	*ins = rt;
 	rt->rt6i_node = fn;
 	atomic_inc(&rt->rt6i_ref);
-	inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
+	inet6_rt_notify(RTM_NEWROUTE, rt, info);
 	rt6_stats.fib_rt_entries++;
 
 	if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -695,8 +695,7 @@ void fib6_force_start_gc(void)
  *	with source addr info in sub-trees
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, 
-		struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info)
 {
 	struct fib6_node *fn;
 	int err = -ENOMEM;
@@ -769,7 +768,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, nlh, req);
+	err = fib6_add_rt2node(fn, rt, info);
 
 	if (err == 0) {
 		fib6_start_gc(rt);
@@ -1076,7 +1075,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 }
 
 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
-    struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
+			   struct nl_info *info)
 {
 	struct fib6_walker_t *w;
 	struct rt6_info *rt = *rtp;
@@ -1132,11 +1131,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 		if (atomic_read(&rt->rt6i_ref) != 1) BUG();
 	}
 
-	inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
+	inet6_rt_notify(RTM_DELROUTE, rt, info);
 	rt6_release(rt);
 }
 
-int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
+int fib6_del(struct rt6_info *rt, struct nl_info *info)
 {
 	struct fib6_node *fn = rt->rt6i_node;
 	struct rt6_info **rtp;
@@ -1161,7 +1160,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct ne
 
 	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
 		if (*rtp == rt) {
-			fib6_del_route(fn, rtp, nlh, _rtattr, req);
+			fib6_del_route(fn, rtp, info);
 			return 0;
 		}
 	}
@@ -1290,7 +1289,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
 		res = c->func(rt, c->arg);
 		if (res < 0) {
 			w->leaf = rt;
-			res = fib6_del(rt, NULL, NULL, NULL);
+			res = fib6_del(rt, NULL);
 			if (res) {
 #if RT6_DEBUG >= 2
 				printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9ec348a72a9..7bcffa6ddba 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -546,15 +546,14 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
    be destroyed.
  */
 
-static int __ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
-			void *_rtattr, struct netlink_skb_parms *req)
+static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 {
 	int err;
 	struct fib6_table *table;
 
 	table = rt->rt6i_table;
 	write_lock_bh(&table->tb6_lock);
-	err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
+	err = fib6_add(&table->tb6_root, rt, info);
 	write_unlock_bh(&table->tb6_lock);
 
 	return err;
@@ -562,7 +561,7 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
 
 int ip6_ins_rt(struct rt6_info *rt)
 {
-	return __ip6_ins_rt(rt, NULL, NULL, NULL);
+	return __ip6_ins_rt(rt, NULL);
 }
 
 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
@@ -1014,30 +1013,24 @@ int ipv6_get_hoplimit(struct net_device *dev)
  *
  */
 
-int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
-		  void *_rtattr, struct netlink_skb_parms *req,
-		  u32 table_id)
+int ip6_route_add(struct fib6_config *cfg)
 {
 	int err;
-	struct rtmsg *r;
-	struct rtattr **rta;
 	struct rt6_info *rt = NULL;
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev = NULL;
 	struct fib6_table *table;
 	int addr_type;
 
-	rta = (struct rtattr **) _rtattr;
-
-	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
+	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
 		return -EINVAL;
 #ifndef CONFIG_IPV6_SUBTREES
-	if (rtmsg->rtmsg_src_len)
+	if (cfg->fc_src_len)
 		return -EINVAL;
 #endif
-	if (rtmsg->rtmsg_ifindex) {
+	if (cfg->fc_ifindex) {
 		err = -ENODEV;
-		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
+		dev = dev_get_by_index(cfg->fc_ifindex);
 		if (!dev)
 			goto out;
 		idev = in6_dev_get(dev);
@@ -1045,10 +1038,10 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 			goto out;
 	}
 
-	if (rtmsg->rtmsg_metric == 0)
-		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
+	if (cfg->fc_metric == 0)
+		cfg->fc_metric = IP6_RT_PRIO_USER;
 
-	table = fib6_new_table(table_id);
+	table = fib6_new_table(cfg->fc_table);
 	if (table == NULL) {
 		err = -ENOBUFS;
 		goto out;
@@ -1062,14 +1055,13 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 	}
 
 	rt->u.dst.obsolete = -1;
-	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
-	if (nlh && (r = NLMSG_DATA(nlh))) {
-		rt->rt6i_protocol = r->rtm_protocol;
-	} else {
-		rt->rt6i_protocol = RTPROT_BOOT;
-	}
+	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
+
+	if (cfg->fc_protocol == RTPROT_UNSPEC)
+		cfg->fc_protocol = RTPROT_BOOT;
+	rt->rt6i_protocol = cfg->fc_protocol;
 
-	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
+	addr_type = ipv6_addr_type(&cfg->fc_dst);
 
 	if (addr_type & IPV6_ADDR_MULTICAST)
 		rt->u.dst.input = ip6_mc_input;
@@ -1078,24 +1070,22 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 
 	rt->u.dst.output = ip6_output;
 
-	ipv6_addr_prefix(&rt->rt6i_dst.addr, 
-			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
-	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
+	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
+	rt->rt6i_dst.plen = cfg->fc_dst_len;
 	if (rt->rt6i_dst.plen == 128)
 	       rt->u.dst.flags = DST_HOST;
 
 #ifdef CONFIG_IPV6_SUBTREES
-	ipv6_addr_prefix(&rt->rt6i_src.addr, 
-			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
-	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
+	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
+	rt->rt6i_src.plen = cfg->fc_src_len;
 #endif
 
-	rt->rt6i_metric = rtmsg->rtmsg_metric;
+	rt->rt6i_metric = cfg->fc_metric;
 
 	/* We cannot add true routes via loopback here,
 	   they would result in kernel looping; promote them to reject routes
 	 */
-	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
+	if ((cfg->fc_flags & RTF_REJECT) ||
 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
 		/* hold loopback dev/idev if we haven't done so. */
 		if (dev != &loopback_dev) {
@@ -1118,12 +1108,12 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 		goto install_route;
 	}
 
-	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
+	if (cfg->fc_flags & RTF_GATEWAY) {
 		struct in6_addr *gw_addr;
 		int gwa_type;
 
-		gw_addr = &rtmsg->rtmsg_gateway;
-		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
+		gw_addr = &cfg->fc_gateway;
+		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
 		gwa_type = ipv6_addr_type(gw_addr);
 
 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
@@ -1140,7 +1130,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 			if (!(gwa_type&IPV6_ADDR_UNICAST))
 				goto out;
 
-			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
+			grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
 
 			err = -EHOSTUNREACH;
 			if (grt == NULL)
@@ -1172,7 +1162,7 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 	if (dev == NULL)
 		goto out;
 
-	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
+	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
 		if (IS_ERR(rt->rt6i_nexthop)) {
 			err = PTR_ERR(rt->rt6i_nexthop);
@@ -1181,24 +1171,24 @@ int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
 		}
 	}
 
-	rt->rt6i_flags = rtmsg->rtmsg_flags;
+	rt->rt6i_flags = cfg->fc_flags;
 
 install_route:
-	if (rta && rta[RTA_METRICS-1]) {
-		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
-		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
-
-		while (RTA_OK(attr, attrlen)) {
-			unsigned flavor = attr->rta_type;
-			if (flavor) {
-				if (flavor > RTAX_MAX) {
+	if (cfg->fc_mx) {
+		struct nlattr *nla;
+		int remaining;
+
+		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+			int type = nla->nla_type;
+
+			if (type) {
+				if (type > RTAX_MAX) {
 					err = -EINVAL;
 					goto out;
 				}
-				rt->u.dst.metrics[flavor-1] =
-					*(u32 *)RTA_DATA(attr);
+
+				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
 			}
-			attr = RTA_NEXT(attr, attrlen);
 		}
 	}
 
@@ -1211,7 +1201,7 @@ install_route:
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
 	rt->rt6i_table = table;
-	return __ip6_ins_rt(rt, nlh, _rtattr, req);
+	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
 
 out:
 	if (dev)
@@ -1223,8 +1213,7 @@ out:
 	return err;
 }
 
-static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
-			void *_rtattr, struct netlink_skb_parms *req)
+static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
 {
 	int err;
 	struct fib6_table *table;
@@ -1235,7 +1224,7 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
 	table = rt->rt6i_table;
 	write_lock_bh(&table->tb6_lock);
 
-	err = fib6_del(rt, nlh, _rtattr, req);
+	err = fib6_del(rt, info);
 	dst_release(&rt->u.dst);
 
 	write_unlock_bh(&table->tb6_lock);
@@ -1245,44 +1234,41 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
 
 int ip6_del_rt(struct rt6_info *rt)
 {
-	return __ip6_del_rt(rt, NULL, NULL, NULL);
+	return __ip6_del_rt(rt, NULL);
 }
 
-static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
-			 void *_rtattr, struct netlink_skb_parms *req,
-			 u32 table_id)
+static int ip6_route_del(struct fib6_config *cfg)
 {
 	struct fib6_table *table;
 	struct fib6_node *fn;
 	struct rt6_info *rt;
 	int err = -ESRCH;
 
-	table = fib6_get_table(table_id);
+	table = fib6_get_table(cfg->fc_table);
 	if (table == NULL)
 		return err;
 
 	read_lock_bh(&table->tb6_lock);
 
 	fn = fib6_locate(&table->tb6_root,
-			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
-			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
+			 &cfg->fc_dst, cfg->fc_dst_len,
+			 &cfg->fc_src, cfg->fc_src_len);
 	
 	if (fn) {
 		for (rt = fn->leaf; rt; rt = rt->u.next) {
-			if (rtmsg->rtmsg_ifindex &&
+			if (cfg->fc_ifindex &&
 			    (rt->rt6i_dev == NULL ||
-			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
+			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
 				continue;
-			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
-			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
+			if (cfg->fc_flags & RTF_GATEWAY &&
+			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
 				continue;
-			if (rtmsg->rtmsg_metric &&
-			    rtmsg->rtmsg_metric != rt->rt6i_metric)
+			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
 				continue;
 			dst_hold(&rt->u.dst);
 			read_unlock_bh(&table->tb6_lock);
 
-			return __ip6_del_rt(rt, nlh, _rtattr, req);
+			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
 		}
 	}
 	read_unlock_bh(&table->tb6_lock);
@@ -1565,21 +1551,23 @@ static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixle
 					   struct in6_addr *gwaddr, int ifindex,
 					   unsigned pref)
 {
-	struct in6_rtmsg rtmsg;
+	struct fib6_config cfg = {
+		.fc_table	= RT6_TABLE_INFO,
+		.fc_metric	= 1024,
+		.fc_ifindex	= ifindex,
+		.fc_dst_len	= prefixlen,
+		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
+				  RTF_UP | RTF_PREF(pref),
+	};
+
+	ipv6_addr_copy(&cfg.fc_dst, prefix);
+	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
 
-	memset(&rtmsg, 0, sizeof(rtmsg));
-	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
-	rtmsg.rtmsg_dst_len = prefixlen;
-	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
-	rtmsg.rtmsg_metric = 1024;
-	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
 	/* We should treat it as a default route if prefix length is 0. */
 	if (!prefixlen)
-		rtmsg.rtmsg_flags |= RTF_DEFAULT;
-	rtmsg.rtmsg_ifindex = ifindex;
+		cfg.fc_flags |= RTF_DEFAULT;
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
+	ip6_route_add(&cfg);
 
 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
 }
@@ -1611,18 +1599,18 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 				     struct net_device *dev,
 				     unsigned int pref)
 {
-	struct in6_rtmsg rtmsg;
+	struct fib6_config cfg = {
+		.fc_table	= RT6_TABLE_DFLT,
+		.fc_metric	= 1024,
+		.fc_ifindex	= dev->ifindex,
+		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
+				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
+	};
 
-	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
-	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
-	rtmsg.rtmsg_metric = 1024;
-	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
-			    RTF_PREF(pref);
+	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
 
-	rtmsg.rtmsg_ifindex = dev->ifindex;
+	ip6_route_add(&cfg);
 
-	ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
 	return rt6_get_dflt_router(gwaddr, dev);
 }
 
@@ -1649,8 +1637,27 @@ restart:
 	read_unlock_bh(&table->tb6_lock);
 }
 
+static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
+				 struct fib6_config *cfg)
+{
+	memset(cfg, 0, sizeof(*cfg));
+
+	cfg->fc_table = RT6_TABLE_MAIN;
+	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
+	cfg->fc_metric = rtmsg->rtmsg_metric;
+	cfg->fc_expires = rtmsg->rtmsg_info;
+	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
+	cfg->fc_src_len = rtmsg->rtmsg_src_len;
+	cfg->fc_flags = rtmsg->rtmsg_flags;
+
+	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
+	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
+	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
+}
+
 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
 {
+	struct fib6_config cfg;
 	struct in6_rtmsg rtmsg;
 	int err;
 
@@ -1663,16 +1670,16 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
 				     sizeof(struct in6_rtmsg));
 		if (err)
 			return -EFAULT;
-			
+
+		rtmsg_to_fib6_config(&rtmsg, &cfg);
+
 		rtnl_lock();
 		switch (cmd) {
 		case SIOCADDRT:
-			err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
-					    RT6_TABLE_MAIN);
+			err = ip6_route_add(&cfg);
 			break;
 		case SIOCDELRT:
-			err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
-					    RT6_TABLE_MAIN);
+			err = ip6_route_del(&cfg);
 			break;
 		default:
 			err = -EINVAL;
@@ -1823,66 +1830,104 @@ void rt6_mtu_change(struct net_device *dev, unsigned mtu)
 	fib6_clean_all(rt6_mtu_change_route, 0, &arg);
 }
 
-static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
-			      struct in6_rtmsg *rtmsg)
+static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
+	[RTA_GATEWAY]           = { .minlen = sizeof(struct in6_addr) },
+	[RTA_OIF]               = { .type = NLA_U32 },
+	[RTA_PRIORITY]          = { .type = NLA_U32 },
+	[RTA_METRICS]           = { .type = NLA_NESTED },
+};
+
+static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
+			      struct fib6_config *cfg)
 {
-	memset(rtmsg, 0, sizeof(*rtmsg));
+	struct rtmsg *rtm;
+	struct nlattr *tb[RTA_MAX+1];
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
+	if (err < 0)
+		goto errout;
 
-	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
-	rtmsg->rtmsg_src_len = r->rtm_src_len;
-	rtmsg->rtmsg_flags = RTF_UP;
-	if (r->rtm_type == RTN_UNREACHABLE)
-		rtmsg->rtmsg_flags |= RTF_REJECT;
+	err = -EINVAL;
+	rtm = nlmsg_data(nlh);
+	memset(cfg, 0, sizeof(*cfg));
 
-	if (rta[RTA_GATEWAY-1]) {
-		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
-			return -EINVAL;
-		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
-		rtmsg->rtmsg_flags |= RTF_GATEWAY;
-	}
-	if (rta[RTA_DST-1]) {
-		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
-			return -EINVAL;
-		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
+	cfg->fc_table = rtm->rtm_table;
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_src_len = rtm->rtm_src_len;
+	cfg->fc_flags = RTF_UP;
+	cfg->fc_protocol = rtm->rtm_protocol;
+
+	if (rtm->rtm_type == RTN_UNREACHABLE)
+		cfg->fc_flags |= RTF_REJECT;
+
+	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.nlh = nlh;
+
+	if (tb[RTA_GATEWAY]) {
+		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
+		cfg->fc_flags |= RTF_GATEWAY;
 	}
-	if (rta[RTA_SRC-1]) {
-		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
-			return -EINVAL;
-		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
+
+	if (tb[RTA_DST]) {
+		int plen = (rtm->rtm_dst_len + 7) >> 3;
+
+		if (nla_len(tb[RTA_DST]) < plen)
+			goto errout;
+
+		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
 	}
-	if (rta[RTA_OIF-1]) {
-		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
-			return -EINVAL;
-		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
+
+	if (tb[RTA_SRC]) {
+		int plen = (rtm->rtm_src_len + 7) >> 3;
+
+		if (nla_len(tb[RTA_SRC]) < plen)
+			goto errout;
+
+		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
 	}
-	if (rta[RTA_PRIORITY-1]) {
-		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
-			return -EINVAL;
-		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
+
+	if (tb[RTA_OIF])
+		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
+
+	if (tb[RTA_PRIORITY])
+		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
+
+	if (tb[RTA_METRICS]) {
+		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
+		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
 	}
-	return 0;
+
+	if (tb[RTA_TABLE])
+		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
+
+	err = 0;
+errout:
+	return err;
 }
 
 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct rtmsg *r = NLMSG_DATA(nlh);
-	struct in6_rtmsg rtmsg;
+	struct fib6_config cfg;
+	int err;
 
-	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
-		return -EINVAL;
-	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb),
-			     rtm_get_table(arg, r->rtm_table));
+	err = rtm_to_fib6_config(skb, nlh, &cfg);
+	if (err < 0)
+		return err;
+
+	return ip6_route_del(&cfg);
 }
 
 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct rtmsg *r = NLMSG_DATA(nlh);
-	struct in6_rtmsg rtmsg;
+	struct fib6_config cfg;
+	int err;
 
-	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
-		return -EINVAL;
-	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb),
-			     rtm_get_table(arg, r->rtm_table));
+	err = rtm_to_fib6_config(skb, nlh, &cfg);
+	if (err < 0)
+		return err;
+
+	return ip6_route_add(&cfg);
 }
 
 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
@@ -2063,15 +2108,21 @@ out_free:
 	goto out;	
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
-			struct netlink_skb_parms *req)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
 {
 	struct sk_buff *skb;
-	u32 pid = req ? req->pid : 0;
-	u32 seq = nlh ? nlh->nlmsg_seq : 0;
+	u32 pid = 0, seq = 0;
+	struct nlmsghdr *nlh = NULL;
 	int payload = sizeof(struct rtmsg) + 256;
 	int err = -ENOBUFS;
 
+	if (info) {
+		pid = info->pid;
+		nlh = info->nlh;
+		if (nlh)
+			seq = nlh->nlmsg_seq;
+	}
+
 	skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
 	if (skb == NULL)
 		goto errout;
-- 
cgit v1.2.3-70-g09d2


From ac0b04627269ff16c3c7ab854a65fe6780c6e3e5 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Tue, 22 Aug 2006 00:15:33 -0700
Subject: [SCTP]: Extend /proc/net/sctp/snmp to provide more statistics.

This patch adds more statistics info under /proc/net/sctp/snmp
that should be useful for debugging. The additional events that
are counted now include timer expirations, retransmits, packet
and data chunk discards.

The Data chunk discards include all the cases where a data chunk
is discarded including high tsn, bad stream, dup tsn and the most
useful one(out of receive buffer/rwnd).

Also moved the SCTP MIB data structures from the generic include
directories to include/sctp/sctp.h.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/snmp.h    | 33 ---------------------------------
 include/net/sctp/sctp.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/net/snmp.h      |  6 ------
 net/sctp/input.c        |  8 ++++++--
 net/sctp/inqueue.c      |  4 ++--
 net/sctp/outqueue.c     |  6 +++++-
 net/sctp/proc.c         | 17 ++++++++++++++++-
 net/sctp/sm_statefuns.c | 15 +++++++++++++++
 8 files changed, 88 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/include/linux/snmp.h b/include/linux/snmp.h
index 30156556f78..854aa6b543f 100644
--- a/include/linux/snmp.h
+++ b/include/linux/snmp.h
@@ -160,39 +160,6 @@ enum
 	__UDP_MIB_MAX
 };
 
-/* sctp mib definitions */
-/*
- * draft-ietf-sigtran-sctp-mib-07.txt
- */
-enum
-{
-	SCTP_MIB_NUM = 0,
-	SCTP_MIB_CURRESTAB,			/* CurrEstab */
-	SCTP_MIB_ACTIVEESTABS,			/* ActiveEstabs */
-	SCTP_MIB_PASSIVEESTABS,			/* PassiveEstabs */
-	SCTP_MIB_ABORTEDS,			/* Aborteds */
-	SCTP_MIB_SHUTDOWNS,			/* Shutdowns */
-	SCTP_MIB_OUTOFBLUES,			/* OutOfBlues */
-	SCTP_MIB_CHECKSUMERRORS,		/* ChecksumErrors */
-	SCTP_MIB_OUTCTRLCHUNKS,			/* OutCtrlChunks */
-	SCTP_MIB_OUTORDERCHUNKS,		/* OutOrderChunks */
-	SCTP_MIB_OUTUNORDERCHUNKS,		/* OutUnorderChunks */
-	SCTP_MIB_INCTRLCHUNKS,			/* InCtrlChunks */
-	SCTP_MIB_INORDERCHUNKS,			/* InOrderChunks */
-	SCTP_MIB_INUNORDERCHUNKS,		/* InUnorderChunks */
-	SCTP_MIB_FRAGUSRMSGS,			/* FragUsrMsgs */
-	SCTP_MIB_REASMUSRMSGS,			/* ReasmUsrMsgs */
-	SCTP_MIB_OUTSCTPPACKS,			/* OutSCTPPacks */
-	SCTP_MIB_INSCTPPACKS,			/* InSCTPPacks */
-	SCTP_MIB_RTOALGORITHM,			/* RtoAlgorithm */
-	SCTP_MIB_RTOMIN,			/* RtoMin */
-	SCTP_MIB_RTOMAX,			/* RtoMax */
-	SCTP_MIB_RTOINITIAL,			/* RtoInitial */
-	SCTP_MIB_VALCOOKIELIFE,			/* ValCookieLife */
-	SCTP_MIB_MAXINITRETR,			/* MaxInitRetr */
-	__SCTP_MIB_MAX
-};
-
 /* linux mib definitions */
 enum
 {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 1c1abce5f6b..e274fd47999 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -216,6 +216,50 @@ DECLARE_SNMP_STAT(struct sctp_mib, sctp_statistics);
 
 #endif /* !TEST_FRAME */
 
+/* sctp mib definitions */
+enum
+{
+	SCTP_MIB_NUM = 0,
+	SCTP_MIB_CURRESTAB,			/* CurrEstab */
+	SCTP_MIB_ACTIVEESTABS,			/* ActiveEstabs */
+	SCTP_MIB_PASSIVEESTABS,			/* PassiveEstabs */
+	SCTP_MIB_ABORTEDS,			/* Aborteds */
+	SCTP_MIB_SHUTDOWNS,			/* Shutdowns */
+	SCTP_MIB_OUTOFBLUES,			/* OutOfBlues */
+	SCTP_MIB_CHECKSUMERRORS,		/* ChecksumErrors */
+	SCTP_MIB_OUTCTRLCHUNKS,			/* OutCtrlChunks */
+	SCTP_MIB_OUTORDERCHUNKS,		/* OutOrderChunks */
+	SCTP_MIB_OUTUNORDERCHUNKS,		/* OutUnorderChunks */
+	SCTP_MIB_INCTRLCHUNKS,			/* InCtrlChunks */
+	SCTP_MIB_INORDERCHUNKS,			/* InOrderChunks */
+	SCTP_MIB_INUNORDERCHUNKS,		/* InUnorderChunks */
+	SCTP_MIB_FRAGUSRMSGS,			/* FragUsrMsgs */
+	SCTP_MIB_REASMUSRMSGS,			/* ReasmUsrMsgs */
+	SCTP_MIB_OUTSCTPPACKS,			/* OutSCTPPacks */
+	SCTP_MIB_INSCTPPACKS,			/* InSCTPPacks */
+	SCTP_MIB_T1_INIT_EXPIREDS,
+	SCTP_MIB_T1_COOKIE_EXPIREDS,
+	SCTP_MIB_T2_SHUTDOWN_EXPIREDS,
+	SCTP_MIB_T3_RTX_EXPIREDS,
+	SCTP_MIB_T4_RTO_EXPIREDS,
+	SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS,
+	SCTP_MIB_DELAY_SACK_EXPIREDS,
+	SCTP_MIB_AUTOCLOSE_EXPIREDS,
+	SCTP_MIB_T3_RETRANSMITS,
+	SCTP_MIB_PMTUD_RETRANSMITS,
+	SCTP_MIB_FAST_RETRANSMITS,
+	SCTP_MIB_IN_PKT_SOFTIRQ,
+	SCTP_MIB_IN_PKT_BACKLOG,
+	SCTP_MIB_IN_PKT_DISCARDS,
+	SCTP_MIB_IN_DATA_CHUNK_DISCARDS,
+	__SCTP_MIB_MAX
+};
+
+#define SCTP_MIB_MAX    __SCTP_MIB_MAX
+struct sctp_mib {
+        unsigned long   mibs[SCTP_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
 
 /* Print debugging messages.  */
 #if SCTP_DEBUG
diff --git a/include/net/snmp.h b/include/net/snmp.h
index a36bed8ea21..464970e39ec 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -100,12 +100,6 @@ struct udp_mib {
 	unsigned long	mibs[UDP_MIB_MAX];
 } __SNMP_MIB_ALIGN__;
 
-/* SCTP */
-#define SCTP_MIB_MAX	__SCTP_MIB_MAX
-struct sctp_mib {
-	unsigned long	mibs[SCTP_MIB_MAX];
-} __SNMP_MIB_ALIGN__;
-
 /* Linux */
 #define LINUX_MIB_MAX	__LINUX_MIB_MAX
 struct linux_mib {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 42b66e74bbb..8a34d95602c 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -255,10 +255,13 @@ int sctp_rcv(struct sk_buff *skb)
 	 */
 	sctp_bh_lock_sock(sk);
 
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk)) {
+		SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_BACKLOG);
 		sctp_add_backlog(sk, skb);
-	else
+	} else {
+		SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_SOFTIRQ);
 		sctp_inq_push(&chunk->rcvr->inqueue, chunk);
+	}
 
 	sctp_bh_unlock_sock(sk);
 
@@ -271,6 +274,7 @@ int sctp_rcv(struct sk_buff *skb)
 	return 0;
 
 discard_it:
+	SCTP_INC_STATS_BH(SCTP_MIB_IN_PKT_DISCARDS);
 	kfree_skb(skb);
 	return 0;
 
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cf0c767d43a..cf6deed7e84 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -87,7 +87,7 @@ void sctp_inq_free(struct sctp_inq *queue)
 /* Put a new packet in an SCTP inqueue.
  * We assume that packet->sctp_hdr is set and in host byte order.
  */
-void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
+void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk)
 {
 	/* Directly call the packet handling routine. */
 
@@ -96,7 +96,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
 	 * Eventually, we should clean up inqueue to not rely
 	 * on the BH related data structures.
 	 */
-	list_add_tail(&packet->list, &q->in_chunk_list);
+	list_add_tail(&chunk->list, &q->in_chunk_list);
 	q->immediate.func(q->immediate.data);
 }
 
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 30b710c54e6..37074a39ecb 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -467,6 +467,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 
 	switch(reason) {
 	case SCTP_RTXR_T3_RTX:
+		SCTP_INC_STATS(SCTP_MIB_T3_RETRANSMITS);
 		sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX);
 		/* Update the retran path if the T3-rtx timer has expired for
 		 * the current retran path.
@@ -475,12 +476,15 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
 			sctp_assoc_update_retran_path(transport->asoc);
 		break;
 	case SCTP_RTXR_FAST_RTX:
+		SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS);
 		sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX);
 		fast_retransmit = 1;
 		break;
 	case SCTP_RTXR_PMTUD:
-	default:
+		SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS);
 		break;
+	default:
+		BUG();
 	}
 
 	sctp_retransmit_mark(q, transport, fast_retransmit);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 5b3b0e0ae7e..a356d8d310a 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -57,6 +57,21 @@ static struct snmp_mib sctp_snmp_list[] = {
 	SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS),
 	SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS),
 	SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS),
+	SNMP_MIB_ITEM("SctpT1InitExpireds", SCTP_MIB_T1_INIT_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT1CookieExpireds", SCTP_MIB_T1_COOKIE_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT2ShutdownExpireds", SCTP_MIB_T2_SHUTDOWN_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT3RtxExpireds", SCTP_MIB_T3_RTX_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT4RtoExpireds", SCTP_MIB_T4_RTO_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT5ShutdownGuardExpireds", SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS),
+	SNMP_MIB_ITEM("SctpDelaySackExpireds", SCTP_MIB_DELAY_SACK_EXPIREDS),
+	SNMP_MIB_ITEM("SctpAutocloseExpireds", SCTP_MIB_AUTOCLOSE_EXPIREDS),
+	SNMP_MIB_ITEM("SctpT3Retransmits", SCTP_MIB_T3_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpPmtudRetransmits", SCTP_MIB_PMTUD_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpFastRetransmits", SCTP_MIB_FAST_RETRANSMITS),
+	SNMP_MIB_ITEM("SctpInPktSoftirq", SCTP_MIB_IN_PKT_SOFTIRQ),
+	SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG),
+	SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
+	SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS),
 	SNMP_MIB_SENTINEL
 };
 
@@ -328,8 +343,8 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 			   "%8p %8p %-3d %-3d %-2d %-4d %4d %8d %8d %7d %5lu %-5d %5d ",
 			   assoc, sk, sctp_sk(sk)->type, sk->sk_state,
 			   assoc->state, hash, assoc->assoc_id,
-			   (sk->sk_rcvbuf - assoc->rwnd),
 			   assoc->sndbuf_used,
+			   (sk->sk_rcvbuf - assoc->rwnd),
 			   sock_i_uid(sk), sock_i_ino(sk),
 			   epb->bind_addr.port,
 			   assoc->peer.port);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 5b5ae795832..32f57f42af9 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2663,9 +2663,11 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep,
 		break;
 	case SCTP_IERROR_HIGH_TSN:
 	case SCTP_IERROR_BAD_STREAM:
+		SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
 		goto discard_noforce;
 	case SCTP_IERROR_DUP_TSN:
 	case SCTP_IERROR_IGNORE_TSN:
+		SCTP_INC_STATS(SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
 		goto discard_force;
 	case SCTP_IERROR_NO_DATA:
 		goto consume;
@@ -3652,6 +3654,7 @@ sctp_disposition_t sctp_sf_pdiscard(const struct sctp_endpoint *ep,
 				    void *arg,
 				    sctp_cmd_seq_t *commands)
 {
+	SCTP_INC_STATS(SCTP_MIB_IN_PKT_DISCARDS);
 	sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL());
 
 	return SCTP_DISPOSITION_CONSUME;
@@ -4548,6 +4551,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(const struct sctp_endpoint *ep,
 {
 	struct sctp_transport *transport = arg;
 
+	SCTP_INC_STATS(SCTP_MIB_T3_RTX_EXPIREDS);
+
 	if (asoc->overall_error_count >= asoc->max_retrans) {
 		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
 				SCTP_ERROR(ETIMEDOUT));
@@ -4616,6 +4621,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
 				       void *arg,
 				       sctp_cmd_seq_t *commands)
 {
+	SCTP_INC_STATS(SCTP_MIB_DELAY_SACK_EXPIREDS);
 	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE());
 	return SCTP_DISPOSITION_CONSUME;
 }
@@ -4650,6 +4656,7 @@ sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
 	int attempts = asoc->init_err_counter + 1;
 
 	SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
+	SCTP_INC_STATS(SCTP_MIB_T1_INIT_EXPIREDS);
 
 	if (attempts <= asoc->max_init_attempts) {
 		bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
@@ -4709,6 +4716,7 @@ sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep
 	int attempts = asoc->init_err_counter + 1;
 
 	SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
+	SCTP_INC_STATS(SCTP_MIB_T1_COOKIE_EXPIREDS);
 
 	if (attempts <= asoc->max_init_attempts) {
 		repl = sctp_make_cookie_echo(asoc, NULL);
@@ -4753,6 +4761,8 @@ sctp_disposition_t sctp_sf_t2_timer_expire(const struct sctp_endpoint *ep,
 	struct sctp_chunk *reply = NULL;
 
 	SCTP_DEBUG_PRINTK("Timer T2 expired.\n");
+	SCTP_INC_STATS(SCTP_MIB_T2_SHUTDOWN_EXPIREDS);
+
 	if (asoc->overall_error_count >= asoc->max_retrans) {
 		sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR,
 				SCTP_ERROR(ETIMEDOUT));
@@ -4814,6 +4824,8 @@ sctp_disposition_t sctp_sf_t4_timer_expire(
 	struct sctp_chunk *chunk = asoc->addip_last_asconf;
 	struct sctp_transport *transport = chunk->transport;
 
+	SCTP_INC_STATS(SCTP_MIB_T4_RTO_EXPIREDS);
+
 	/* ADDIP 4.1 B1) Increment the error counters and perform path failure
 	 * detection on the appropriate destination address as defined in
 	 * RFC2960 [5] section 8.1 and 8.2.
@@ -4880,6 +4892,7 @@ sctp_disposition_t sctp_sf_t5_timer_expire(const struct sctp_endpoint *ep,
 	struct sctp_chunk *reply = NULL;
 
 	SCTP_DEBUG_PRINTK("Timer T5 expired.\n");
+	SCTP_INC_STATS(SCTP_MIB_T5_SHUTDOWN_GUARD_EXPIREDS);
 
 	reply = sctp_make_abort(asoc, NULL, 0);
 	if (!reply)
@@ -4910,6 +4923,8 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire(
 {
 	int disposition;
 
+	SCTP_INC_STATS(SCTP_MIB_AUTOCLOSE_EXPIREDS);
+
 	/* From 9.2 Shutdown of an Association
 	 * Upon receipt of the SHUTDOWN primitive from its upper
 	 * layer, the endpoint enters SHUTDOWN-PENDING state and
-- 
cgit v1.2.3-70-g09d2


From 9ba1627617d396135a4d679542a3623d5819e628 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 22 Aug 2006 00:29:37 -0700
Subject: [NETFILTER]: x_tables: replace IPv4 dscp match by address family
 independent version

This replaces IPv4 dscp match by address family independent version.
This also
	- utilizes dsfield.h to get the DS field in IPv4/IPv6 header, and
	- checks for the DSCP value from user space.
	- fixes Kconfig help text.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_dscp.h       |  23 +++++++
 include/linux/netfilter_ipv4/ipt_dscp.h |  14 ++--
 net/ipv4/netfilter/Kconfig              |  11 ----
 net/ipv4/netfilter/Makefile             |   1 -
 net/ipv4/netfilter/ipt_dscp.c           |  54 ---------------
 net/netfilter/Kconfig                   |  11 ++++
 net/netfilter/Makefile                  |   1 +
 net/netfilter/xt_dscp.c                 | 113 ++++++++++++++++++++++++++++++++
 8 files changed, 154 insertions(+), 74 deletions(-)
 create mode 100644 include/linux/netfilter/xt_dscp.h
 delete mode 100644 net/ipv4/netfilter/ipt_dscp.c
 create mode 100644 net/netfilter/xt_dscp.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_dscp.h b/include/linux/netfilter/xt_dscp.h
new file mode 100644
index 00000000000..1da61e6acaf
--- /dev/null
+++ b/include/linux/netfilter/xt_dscp.h
@@ -0,0 +1,23 @@
+/* x_tables module for matching the IPv4/IPv6 DSCP field
+ *
+ * (C) 2002 Harald Welte <laforge@gnumonks.org>
+ * This software is distributed under GNU GPL v2, 1991
+ *
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+ *
+ * xt_dscp.h,v 1.3 2002/08/05 19:00:21 laforge Exp
+*/
+#ifndef _XT_DSCP_H
+#define _XT_DSCP_H
+
+#define XT_DSCP_MASK	0xfc	/* 11111100 */
+#define XT_DSCP_SHIFT	2
+#define XT_DSCP_MAX	0x3f	/* 00111111 */
+
+/* match info */
+struct xt_dscp_info {
+	u_int8_t dscp;
+	u_int8_t invert;
+};
+
+#endif /* _XT_DSCP_H */
diff --git a/include/linux/netfilter_ipv4/ipt_dscp.h b/include/linux/netfilter_ipv4/ipt_dscp.h
index 2fa6dfe9289..4b82ca912b0 100644
--- a/include/linux/netfilter_ipv4/ipt_dscp.h
+++ b/include/linux/netfilter_ipv4/ipt_dscp.h
@@ -10,14 +10,12 @@
 #ifndef _IPT_DSCP_H
 #define _IPT_DSCP_H
 
-#define IPT_DSCP_MASK	0xfc	/* 11111100 */
-#define IPT_DSCP_SHIFT	2
-#define IPT_DSCP_MAX	0x3f	/* 00111111 */
+#include <linux/netfilter/xt_dscp.h>
 
-/* match info */
-struct ipt_dscp_info {
-	u_int8_t dscp;
-	u_int8_t invert;
-};
+#define IPT_DSCP_MASK	XT_DSCP_MASK
+#define IPT_DSCP_SHIFT	XT_DSCP_SHIFT
+#define IPT_DSCP_MAX	XT_DSCP_MAX
+
+#define ipt_dscp_info	xt_dscp_info
 
 #endif /* _IPT_DSCP_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ef0b5aac583..d88d71d1ce0 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -278,17 +278,6 @@ config IP_NF_MATCH_ECN
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_MATCH_DSCP
-	tristate "DSCP match support"
-	depends on IP_NF_IPTABLES
-	help
-	  This option adds a `DSCP' match, which allows you to match against
-	  the IPv4 header DSCP field (DSCP codepoint).
-
-	  The DSCP codepoint can have any value between 0x0 and 0x4f.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_MATCH_AH
 	tristate "AH match support"
 	depends on IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3ded4a3af59..b946b0f3ea9 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -59,7 +59,6 @@ obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
 obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
 obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
 obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
-obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
 obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
 obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
deleted file mode 100644
index 47177591aeb..00000000000
--- a/net/ipv4/netfilter/ipt_dscp.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* IP tables module for matching the value of the IPv4 DSCP field
- *
- * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
- *
- * (C) 2002 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-
-#include <linux/netfilter_ipv4/ipt_dscp.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables DSCP matching module");
-MODULE_LICENSE("GPL");
-
-static int match(const struct sk_buff *skb,
-		 const struct net_device *in, const struct net_device *out,
-		 const struct xt_match *match, const void *matchinfo,
-		 int offset, unsigned int protoff, int *hotdrop)
-{
-	const struct ipt_dscp_info *info = matchinfo;
-	const struct iphdr *iph = skb->nh.iph;
-
-	u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
-
-	return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
-}
-
-static struct ipt_match dscp_match = {
-	.name		= "dscp",
-	.match		= match,
-	.matchsize	= sizeof(struct ipt_dscp_info),
-	.me		= THIS_MODULE,
-};
-
-static int __init ipt_dscp_init(void)
-{
-	return ipt_register_match(&dscp_match);
-}
-
-static void __exit ipt_dscp_fini(void)
-{
-	ipt_unregister_match(&dscp_match);
-
-}
-
-module_init(ipt_dscp_init);
-module_exit(ipt_dscp_fini);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index a9894ddfd72..f781405f5d6 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -263,6 +263,17 @@ config NETFILTER_XT_MATCH_DCCP
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_DSCP
+	tristate '"DSCP" match support'
+	depends on NETFILTER_XTABLES
+	help
+	  This option adds a `DSCP' match, which allows you to match against
+	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_MATCH_ESP
 	tristate '"ESP" match support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 6fa4b758045..0b8a70c1df4 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNMARK) += xt_connmark.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
new file mode 100644
index 00000000000..82e250d1f00
--- /dev/null
+++ b/net/netfilter/xt_dscp.c
@@ -0,0 +1,113 @@
+/* IP tables module for matching the value of the IPv4/IPv6 DSCP field
+ *
+ * xt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/netfilter/xt_dscp.h>
+#include <linux/netfilter/x_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("x_tables DSCP matching module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_dscp");
+MODULE_ALIAS("ip6t_dscp");
+
+static int match(const struct sk_buff *skb,
+		 const struct net_device *in,
+		 const struct net_device *out,
+		 const struct xt_match *match,
+		 const void *matchinfo,
+		 int offset,
+		 unsigned int protoff,
+		 int *hotdrop)
+{
+	const struct xt_dscp_info *info = matchinfo;
+	u_int8_t dscp = ipv4_get_dsfield(skb->nh.iph) >> XT_DSCP_SHIFT;
+
+	return (dscp == info->dscp) ^ !!info->invert;
+}
+
+static int match6(const struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  const struct xt_match *match,
+		  const void *matchinfo,
+		  int offset,
+		  unsigned int protoff,
+		  int *hotdrop)
+{
+	const struct xt_dscp_info *info = matchinfo;
+	u_int8_t dscp = ipv6_get_dsfield(skb->nh.ipv6h) >> XT_DSCP_SHIFT;
+
+	return (dscp == info->dscp) ^ !!info->invert;
+}
+
+static int checkentry(const char *tablename,
+		      const void *info,
+		      const struct xt_match *match,
+		      void *matchinfo,
+		      unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp;
+
+	if (dscp > XT_DSCP_MAX) {
+		printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct xt_match dscp_match = {
+	.name		= "dscp",
+	.match		= match,
+	.checkentry	= checkentry,
+	.matchsize	= sizeof(struct xt_dscp_info),
+	.family		= AF_INET,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_match dscp6_match = {
+	.name		= "dscp",
+	.match		= match6,
+	.checkentry	= checkentry,
+	.matchsize	= sizeof(struct xt_dscp_info),
+	.family		= AF_INET6,
+	.me		= THIS_MODULE,
+};
+
+static int __init xt_dscp_match_init(void)
+{
+	int ret;
+	ret = xt_register_match(&dscp_match);
+	if (ret)
+		return ret;
+
+	ret = xt_register_match(&dscp6_match);
+	if (ret)
+		xt_unregister_match(&dscp_match);
+
+	return ret;
+}
+
+static void __exit xt_dscp_match_fini(void)
+{
+	xt_unregister_match(&dscp_match);
+	xt_unregister_match(&dscp6_match);
+}
+
+module_init(xt_dscp_match_init);
+module_exit(xt_dscp_match_fini);
-- 
cgit v1.2.3-70-g09d2


From a468701db58a8b3e08e3f55fa6ac66db42014922 Mon Sep 17 00:00:00 2001
From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Tue, 22 Aug 2006 00:30:26 -0700
Subject: [NETFILTER]: x_tables: replace IPv4 DSCP target by address family
 independent version

This replaces IPv4 DSCP target by address family independent version.
This also
	- utilizes dsfield.h to get/mangle DS field in IPv4/IPv6 header
	- fixes Kconfig help text.

Signed-off-by: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/xt_DSCP.h       |  20 +++++
 include/linux/netfilter_ipv4/ipt_DSCP.h |   6 +-
 net/ipv4/netfilter/Kconfig              |  11 ---
 net/ipv4/netfilter/Makefile             |   1 -
 net/ipv4/netfilter/ipt_DSCP.c           |  96 -----------------------
 net/netfilter/Kconfig                   |  12 +++
 net/netfilter/Makefile                  |   1 +
 net/netfilter/xt_DSCP.c                 | 130 ++++++++++++++++++++++++++++++++
 8 files changed, 165 insertions(+), 112 deletions(-)
 create mode 100644 include/linux/netfilter/xt_DSCP.h
 delete mode 100644 net/ipv4/netfilter/ipt_DSCP.c
 create mode 100644 net/netfilter/xt_DSCP.c

(limited to 'include')

diff --git a/include/linux/netfilter/xt_DSCP.h b/include/linux/netfilter/xt_DSCP.h
new file mode 100644
index 00000000000..3c7c963997b
--- /dev/null
+++ b/include/linux/netfilter/xt_DSCP.h
@@ -0,0 +1,20 @@
+/* x_tables module for setting the IPv4/IPv6 DSCP field
+ *
+ * (C) 2002 Harald Welte <laforge@gnumonks.org>
+ * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
+ * This software is distributed under GNU GPL v2, 1991
+ *
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+ *
+ * xt_DSCP.h,v 1.7 2002/03/14 12:03:13 laforge Exp
+*/
+#ifndef _XT_DSCP_TARGET_H
+#define _XT_DSCP_TARGET_H
+#include <linux/netfilter/xt_dscp.h>
+
+/* target info */
+struct xt_DSCP_info {
+	u_int8_t dscp;
+};
+
+#endif /* _XT_DSCP_TARGET_H */
diff --git a/include/linux/netfilter_ipv4/ipt_DSCP.h b/include/linux/netfilter_ipv4/ipt_DSCP.h
index b30f510b5be..3491e524d5e 100644
--- a/include/linux/netfilter_ipv4/ipt_DSCP.h
+++ b/include/linux/netfilter_ipv4/ipt_DSCP.h
@@ -11,10 +11,8 @@
 #ifndef _IPT_DSCP_TARGET_H
 #define _IPT_DSCP_TARGET_H
 #include <linux/netfilter_ipv4/ipt_dscp.h>
+#include <linux/netfilter/xt_DSCP.h>
 
-/* target info */
-struct ipt_DSCP_info {
-	u_int8_t dscp;
-};
+#define ipt_DSCP_info xt_DSCP_info
 
 #endif /* _IPT_DSCP_TARGET_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index d88d71d1ce0..a55b8ff70de 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -557,17 +557,6 @@ config IP_NF_TARGET_ECN
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config IP_NF_TARGET_DSCP
-	tristate "DSCP target support"
-	depends on IP_NF_MANGLE
-	help
-	  This option adds a `DSCP' match, which allows you to match against
-	  the IPv4 header DSCP field (DSCP codepoint).
-
-	  The DSCP codepoint can have any value between 0x0 and 0x4f.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config IP_NF_TARGET_TTL
 	tristate  'TTL target support'
 	depends on IP_NF_MANGLE
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index b946b0f3ea9..09aaed1a806 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -67,7 +67,6 @@ obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
 obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
 obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
deleted file mode 100644
index c8e971288df..00000000000
--- a/net/ipv4/netfilter/ipt_DSCP.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/* iptables module for setting the IPv4 DSCP field, Version 1.8
- *
- * (C) 2002 by Harald Welte <laforge@netfilter.org>
- * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as 
- * published by the Free Software Foundation.
- * 
- * See RFC2474 for a description of the DSCP field within the IP Header.
- *
- * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
-*/
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <net/checksum.h>
-
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_DSCP.h>
-
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("iptables DSCP modification module");
-MODULE_LICENSE("GPL");
-
-static unsigned int
-target(struct sk_buff **pskb,
-       const struct net_device *in,
-       const struct net_device *out,
-       unsigned int hooknum,
-       const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
-{
-	const struct ipt_DSCP_info *dinfo = targinfo;
-	u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
-
-
-	if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
-		u_int16_t diffs[2];
-
-		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
-			return NF_DROP;
-
-		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
-		(*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK)
-			| sh_dscp;
-		diffs[1] = htons((*pskb)->nh.iph->tos);
-		(*pskb)->nh.iph->check
-			= csum_fold(csum_partial((char *)diffs,
-						 sizeof(diffs),
-						 (*pskb)->nh.iph->check
-						 ^ 0xFFFF));
-	}
-	return IPT_CONTINUE;
-}
-
-static int
-checkentry(const char *tablename,
-	   const void *e_void,
-	   const struct xt_target *target,
-           void *targinfo,
-           unsigned int targinfosize,
-           unsigned int hook_mask)
-{
-	const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp;
-
-	if ((dscp > IPT_DSCP_MAX)) {
-		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
-		return 0;
-	}
-	return 1;
-}
-
-static struct ipt_target ipt_dscp_reg = {
-	.name		= "DSCP",
-	.target		= target,
-	.targetsize	= sizeof(struct ipt_DSCP_info),
-	.table		= "mangle",
-	.checkentry	= checkentry,
-	.me		= THIS_MODULE,
-};
-
-static int __init ipt_dscp_init(void)
-{
-	return ipt_register_target(&ipt_dscp_reg);
-}
-
-static void __exit ipt_dscp_fini(void)
-{
-	ipt_unregister_target(&ipt_dscp_reg);
-}
-
-module_init(ipt_dscp_init);
-module_exit(ipt_dscp_fini);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f781405f5d6..0a28d2c5c44 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -148,6 +148,18 @@ config NETFILTER_XT_TARGET_CONNMARK
 	  <file:Documentation/modules.txt>.  The module will be called
 	  ipt_CONNMARK.o.  If unsure, say `N'.
 
+config NETFILTER_XT_TARGET_DSCP
+	tristate '"DSCP" target support'
+	depends on NETFILTER_XTABLES
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	help
+	  This option adds a `DSCP' target, which allows you to manipulate
+	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NETFILTER_XT_TARGET_MARK
 	tristate '"MARK" target support'
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 0b8a70c1df4..a74be492fd0 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
 # targets
 obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
new file mode 100644
index 00000000000..79df8165cd7
--- /dev/null
+++ b/net/netfilter/xt_DSCP.c
@@ -0,0 +1,130 @@
+/* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+ *
+ * xt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
+*/
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <net/dsfield.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_DSCP.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("x_tables DSCP modification module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_DSCP");
+MODULE_ALIAS("ip6t_DSCP");
+
+static unsigned int target(struct sk_buff **pskb,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   unsigned int hooknum,
+			   const struct xt_target *target,
+			   const void *targinfo,
+			   void *userinfo)
+{
+	const struct xt_DSCP_info *dinfo = targinfo;
+	u_int8_t dscp = ipv4_get_dsfield((*pskb)->nh.iph) >> XT_DSCP_SHIFT;
+
+	if (dscp != dinfo->dscp) {
+		if (!skb_make_writable(pskb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		ipv4_change_dsfield((*pskb)->nh.iph, (__u8)(~XT_DSCP_MASK),
+				    dinfo->dscp << XT_DSCP_SHIFT);
+
+	}
+	return XT_CONTINUE;
+}
+
+static unsigned int target6(struct sk_buff **pskb,
+			    const struct net_device *in,
+			    const struct net_device *out,
+			    unsigned int hooknum,
+			    const struct xt_target *target,
+			    const void *targinfo,
+			    void *userinfo)
+{
+	const struct xt_DSCP_info *dinfo = targinfo;
+	u_int8_t dscp = ipv6_get_dsfield((*pskb)->nh.ipv6h) >> XT_DSCP_SHIFT;
+
+	if (dscp != dinfo->dscp) {
+		if (!skb_make_writable(pskb, sizeof(struct ipv6hdr)))
+			return NF_DROP;
+
+		ipv6_change_dsfield((*pskb)->nh.ipv6h, (__u8)(~XT_DSCP_MASK),
+				    dinfo->dscp << XT_DSCP_SHIFT);
+	}
+	return XT_CONTINUE;
+}
+
+static int checkentry(const char *tablename,
+		      const void *e_void,
+		      const struct xt_target *target,
+		      void *targinfo,
+		      unsigned int targinfosize,
+		      unsigned int hook_mask)
+{
+	const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
+
+	if ((dscp > XT_DSCP_MAX)) {
+		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
+		return 0;
+	}
+	return 1;
+}
+
+static struct xt_target xt_dscp_reg = {
+	.name		= "DSCP",
+	.target		= target,
+	.targetsize	= sizeof(struct xt_DSCP_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.family		= AF_INET,
+	.me		= THIS_MODULE,
+};
+
+static struct xt_target xt_dscp6_reg = {
+	.name		= "DSCP",
+	.target		= target6,
+	.targetsize	= sizeof(struct xt_DSCP_info),
+	.table		= "mangle",
+	.checkentry	= checkentry,
+	.family		= AF_INET6,
+	.me		= THIS_MODULE,
+};
+
+static int __init xt_dscp_target_init(void)
+{
+	int ret;
+	ret = xt_register_target(&xt_dscp_reg);
+	if (ret)
+		return ret;
+
+	ret = xt_register_target(&xt_dscp6_reg);
+	if (ret)
+		xt_unregister_target(&xt_dscp_reg);
+
+	return ret;
+}
+
+static void __exit xt_dscp_target_fini(void)
+{
+	xt_unregister_target(&xt_dscp_reg);
+	xt_unregister_target(&xt_dscp6_reg);
+}
+
+module_init(xt_dscp_target_init);
+module_exit(xt_dscp_target_fini);
-- 
cgit v1.2.3-70-g09d2


From 2521c12cf1a29f6c380b13ca32a38175f6beed08 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 22 Aug 2006 00:31:24 -0700
Subject: [NETFILTER]: conntrack: introduce connection mark event

This patch introduces the mark event. ctnetlink can use this to know if
the mark needs to be dumped.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nf_conntrack_common.h |  4 ++++
 net/netfilter/xt_CONNMARK.c                   | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index d2e4bd7a7a1..9e0dae07861 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -125,6 +125,10 @@ enum ip_conntrack_events
 	/* Counter highest bit has been set */
 	IPCT_COUNTER_FILLING_BIT = 11,
 	IPCT_COUNTER_FILLING = (1 << IPCT_COUNTER_FILLING_BIT),
+
+	/* Mark is set */
+	IPCT_MARK_BIT = 12,
+	IPCT_MARK = (1 << IPCT_MARK_BIT),
 };
 
 enum ip_conntrack_expect_events {
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index 60c375d36f0..784482b74e5 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -52,13 +52,25 @@ target(struct sk_buff **pskb,
 	    switch(markinfo->mode) {
 	    case XT_CONNMARK_SET:
 		newmark = (*ctmark & ~markinfo->mask) | markinfo->mark;
-		if (newmark != *ctmark)
+		if (newmark != *ctmark) {
 		    *ctmark = newmark;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+		    ip_conntrack_event_cache(IPCT_MARK, *pskb);
+#else
+		    nf_conntrack_event_cache(IPCT_MARK, *pskb);
+#endif
+		}
 		break;
 	    case XT_CONNMARK_SAVE:
 		newmark = (*ctmark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
-		if (*ctmark != newmark)
+		if (*ctmark != newmark) {
 		    *ctmark = newmark;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+		    ip_conntrack_event_cache(IPCT_MARK, *pskb);
+#else
+		    nf_conntrack_event_cache(IPCT_MARK, *pskb);
+#endif
+		}
 		break;
 	    case XT_CONNMARK_RESTORE:
 		nfmark = (*pskb)->nfmark;
-- 
cgit v1.2.3-70-g09d2


From 52d9c42ef2563d2c420eb23b96bf5a4cae9e167b Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 22 Aug 2006 00:33:45 -0700
Subject: [NETFILTER]: x_tables: add helpers for mass match/target registration

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h |  5 ++++
 net/netfilter/x_tables.c           | 60 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 48cc32d83f7..9a9912430e3 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -290,8 +290,13 @@ struct xt_table_info
 
 extern int xt_register_target(struct xt_target *target);
 extern void xt_unregister_target(struct xt_target *target);
+extern int xt_register_targets(struct xt_target *target, unsigned int n);
+extern void xt_unregister_targets(struct xt_target *target, unsigned int n);
+
 extern int xt_register_match(struct xt_match *target);
 extern void xt_unregister_match(struct xt_match *target);
+extern int xt_register_matches(struct xt_match *match, unsigned int n);
+extern void xt_unregister_matches(struct xt_match *match, unsigned int n);
 
 extern int xt_check_match(const struct xt_match *match, unsigned short family,
 			  unsigned int size, const char *table, unsigned int hook,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 174e8f97009..8037ba63d58 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -86,6 +86,36 @@ xt_unregister_target(struct xt_target *target)
 }
 EXPORT_SYMBOL(xt_unregister_target);
 
+int
+xt_register_targets(struct xt_target *target, unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = xt_register_target(&target[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		xt_unregister_targets(target, i);
+	return err;
+}
+EXPORT_SYMBOL(xt_register_targets);
+
+void
+xt_unregister_targets(struct xt_target *target, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		xt_unregister_target(&target[i]);
+}
+EXPORT_SYMBOL(xt_unregister_targets);
+
 int
 xt_register_match(struct xt_match *match)
 {
@@ -113,6 +143,36 @@ xt_unregister_match(struct xt_match *match)
 }
 EXPORT_SYMBOL(xt_unregister_match);
 
+int
+xt_register_matches(struct xt_match *match, unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = xt_register_match(&match[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		xt_unregister_matches(match, i);
+	return err;
+}
+EXPORT_SYMBOL(xt_register_matches);
+
+void
+xt_unregister_matches(struct xt_match *match, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; i++)
+		xt_unregister_match(&match[i]);
+}
+EXPORT_SYMBOL(xt_unregister_matches);
+
 
 /*
  * These are weird, but module loading must not be done with mutex
-- 
cgit v1.2.3-70-g09d2


From fe1cb10873b44cf89082465823ee6d4d4ac63ad7 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 22 Aug 2006 00:35:47 -0700
Subject: [NETFILTER]: x_tables: remove unused argument to target functions

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h        | 3 +--
 include/linux/netfilter_arp/arp_tables.h  | 3 +--
 include/linux/netfilter_ipv4/ip_tables.h  | 3 +--
 include/linux/netfilter_ipv6/ip6_tables.h | 3 +--
 net/ipv4/netfilter/arp_tables.c           | 9 +++------
 net/ipv4/netfilter/arpt_mangle.c          | 2 +-
 net/ipv4/netfilter/arptable_filter.c      | 2 +-
 net/ipv4/netfilter/ip_nat_rule.c          | 8 +++-----
 net/ipv4/netfilter/ip_tables.c            | 9 +++------
 net/ipv4/netfilter/ipt_CLUSTERIP.c        | 3 +--
 net/ipv4/netfilter/ipt_ECN.c              | 3 +--
 net/ipv4/netfilter/ipt_LOG.c              | 3 +--
 net/ipv4/netfilter/ipt_MASQUERADE.c       | 3 +--
 net/ipv4/netfilter/ipt_NETMAP.c           | 3 +--
 net/ipv4/netfilter/ipt_REDIRECT.c         | 3 +--
 net/ipv4/netfilter/ipt_REJECT.c           | 3 +--
 net/ipv4/netfilter/ipt_SAME.c             | 3 +--
 net/ipv4/netfilter/ipt_TCPMSS.c           | 3 +--
 net/ipv4/netfilter/ipt_TOS.c              | 3 +--
 net/ipv4/netfilter/ipt_TTL.c              | 2 +-
 net/ipv4/netfilter/ipt_ULOG.c             | 2 +-
 net/ipv4/netfilter/iptable_filter.c       | 4 ++--
 net/ipv4/netfilter/iptable_mangle.c       | 4 ++--
 net/ipv4/netfilter/iptable_raw.c          | 2 +-
 net/ipv6/netfilter/ip6_tables.c           | 9 +++------
 net/ipv6/netfilter/ip6t_HL.c              | 2 +-
 net/ipv6/netfilter/ip6t_LOG.c             | 3 +--
 net/ipv6/netfilter/ip6t_REJECT.c          | 3 +--
 net/ipv6/netfilter/ip6table_filter.c      | 4 ++--
 net/ipv6/netfilter/ip6table_mangle.c      | 4 ++--
 net/ipv6/netfilter/ip6table_raw.c         | 2 +-
 net/netfilter/xt_CLASSIFY.c               | 3 +--
 net/netfilter/xt_CONNMARK.c               | 3 +--
 net/netfilter/xt_CONNSECMARK.c            | 2 +-
 net/netfilter/xt_DSCP.c                   | 6 ++----
 net/netfilter/xt_MARK.c                   | 6 ++----
 net/netfilter/xt_NFQUEUE.c                | 3 +--
 net/netfilter/xt_NOTRACK.c                | 3 +--
 net/netfilter/xt_SECMARK.c                | 2 +-
 net/netfilter/xt_connbytes.c              | 2 +-
 net/sched/act_ipt.c                       | 2 +-
 41 files changed, 55 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 9a9912430e3..9cef0e91542 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -211,8 +211,7 @@ struct xt_target
 			       const struct net_device *out,
 			       unsigned int hooknum,
 			       const struct xt_target *target,
-			       const void *targinfo,
-			       void *userdata);
+			       const void *targinfo);
 
 	/* Called when user tries to insert an entry of this type:
            hook_mask is a bitmask of hooks from which it can be
diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 62cc27daca4..149e87c9ab1 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -248,8 +248,7 @@ extern unsigned int arpt_do_table(struct sk_buff **pskb,
 				  unsigned int hook,
 				  const struct net_device *in,
 				  const struct net_device *out,
-				  struct arpt_table *table,
-				  void *userdata);
+				  struct arpt_table *table);
 
 #define ARPT_ALIGN(s) (((s) + (__alignof__(struct arpt_entry)-1)) & ~(__alignof__(struct arpt_entry)-1))
 #endif /*__KERNEL__*/
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index c0dac16e190..a536bbdef14 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -312,8 +312,7 @@ extern unsigned int ipt_do_table(struct sk_buff **pskb,
 				 unsigned int hook,
 				 const struct net_device *in,
 				 const struct net_device *out,
-				 struct ipt_table *table,
-				 void *userdata);
+				 struct ipt_table *table);
 
 #define IPT_ALIGN(s) XT_ALIGN(s)
 
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index d0d5d1ee4be..d7a8e9c0dad 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -300,8 +300,7 @@ extern unsigned int ip6t_do_table(struct sk_buff **pskb,
 				  unsigned int hook,
 				  const struct net_device *in,
 				  const struct net_device *out,
-				  struct ip6t_table *table,
-				  void *userdata);
+				  struct ip6t_table *table);
 
 /* Check for an extension */
 extern int ip6t_ext_hdr(u8 nexthdr);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 8d1d7a6e72a..c6bd270bf46 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -208,8 +208,7 @@ static unsigned int arpt_error(struct sk_buff **pskb,
 			       const struct net_device *out,
 			       unsigned int hooknum,
 			       const struct xt_target *target,
-			       const void *targinfo,
-			       void *userinfo)
+			       const void *targinfo)
 {
 	if (net_ratelimit())
 		printk("arp_tables: error: '%s'\n", (char *)targinfo);
@@ -226,8 +225,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 			   unsigned int hook,
 			   const struct net_device *in,
 			   const struct net_device *out,
-			   struct arpt_table *table,
-			   void *userdata)
+			   struct arpt_table *table)
 {
 	static const char nulldevname[IFNAMSIZ];
 	unsigned int verdict = NF_DROP;
@@ -302,8 +300,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb,
 								     in, out,
 								     hook,
 								     t->u.kernel.target,
-								     t->data,
-								     userdata);
+								     t->data);
 
 				/* Target might have changed stuff. */
 				arp = (*pskb)->nh.arph;
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index a58325c1ceb..05fb2421bb2 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -11,7 +11,7 @@ static unsigned int
 target(struct sk_buff **pskb,
        const struct net_device *in, const struct net_device *out,
        unsigned int hooknum, const struct xt_target *target,
-       const void *targinfo, void *userinfo)
+       const void *targinfo)
 {
 	const struct arpt_mangle *mangle = targinfo;
 	struct arphdr *arp;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index d7c472faa53..7edea2a1696 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -155,7 +155,7 @@ static unsigned int arpt_hook(unsigned int hook,
 			      const struct net_device *out,
 			      int (*okfn)(struct sk_buff *))
 {
-	return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+	return arpt_do_table(pskb, hook, in, out, &packet_filter);
 }
 
 static struct nf_hook_ops arpt_ops[] = {
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 1aba926c1cb..1aa0e4f462a 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -104,8 +104,7 @@ static unsigned int ipt_snat_target(struct sk_buff **pskb,
 				    const struct net_device *out,
 				    unsigned int hooknum,
 				    const struct ipt_target *target,
-				    const void *targinfo,
-				    void *userinfo)
+				    const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
@@ -147,8 +146,7 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
 				    const struct net_device *out,
 				    unsigned int hooknum,
 				    const struct ipt_target *target,
-				    const void *targinfo,
-				    void *userinfo)
+				    const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
@@ -255,7 +253,7 @@ int ip_nat_rule_find(struct sk_buff **pskb,
 {
 	int ret;
 
-	ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
+	ret = ipt_do_table(pskb, hooknum, in, out, &nat_table);
 
 	if (ret == NF_ACCEPT) {
 		if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 048514f15f2..8ce5b6f7644 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -180,8 +180,7 @@ ipt_error(struct sk_buff **pskb,
 	  const struct net_device *out,
 	  unsigned int hooknum,
 	  const struct xt_target *target,
-	  const void *targinfo,
-	  void *userinfo)
+	  const void *targinfo)
 {
 	if (net_ratelimit())
 		printk("ip_tables: error: `%s'\n", (char *)targinfo);
@@ -217,8 +216,7 @@ ipt_do_table(struct sk_buff **pskb,
 	     unsigned int hook,
 	     const struct net_device *in,
 	     const struct net_device *out,
-	     struct ipt_table *table,
-	     void *userdata)
+	     struct ipt_table *table)
 {
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	u_int16_t offset;
@@ -308,8 +306,7 @@ ipt_do_table(struct sk_buff **pskb,
 								     in, out,
 								     hook,
 								     t->u.kernel.target,
-								     t->data,
-								     userdata);
+								     t->data);
 
 #ifdef CONFIG_NETFILTER_DEBUG
 				if (((struct ipt_entry *)table_base)->comefrom
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index d994c5f5744..a08383cf9e7 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -302,8 +302,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 7e30e6d2b5d..1c3da4a48e5 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -85,8 +85,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct ipt_ECN_info *einfo = targinfo;
 
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index b98f7b08b08..a8d356c6191 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -416,8 +416,7 @@ ipt_log_target(struct sk_buff **pskb,
 	       const struct net_device *out,
 	       unsigned int hooknum,
 	       const struct xt_target *target,
-	       const void *targinfo,
-	       void *userinfo)
+	       const void *targinfo)
 {
 	const struct ipt_log_info *loginfo = targinfo;
 	struct nf_loginfo li;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index ebd94f2abf0..9659793c66c 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -64,8 +64,7 @@ masquerade_target(struct sk_buff **pskb,
 		  const struct net_device *out,
 		  unsigned int hooknum,
 		  const struct xt_target *target,
-		  const void *targinfo,
-		  void *userinfo)
+		  const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 736c4b5a86a..fd5e74a19fb 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -55,8 +55,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index f290463232d..839fe99f71d 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -58,8 +58,7 @@ redirect_target(struct sk_buff **pskb,
 		const struct net_device *out,
 		unsigned int hooknum,
 		const struct xt_target *target,
-		const void *targinfo,
-		void *userinfo)
+		const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 95c6662b663..1dfd8e56be8 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -228,8 +228,7 @@ static unsigned int reject(struct sk_buff **pskb,
 			   const struct net_device *out,
 			   unsigned int hooknum,
 			   const struct xt_target *target,
-			   const void *targinfo,
-			   void *userinfo)
+			   const void *targinfo)
 {
 	const struct ipt_reject_info *reject = targinfo;
 
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index 7169b09b5a6..cf801749490 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -133,8 +133,7 @@ same_target(struct sk_buff **pskb,
 		const struct net_device *out,
 		unsigned int hooknum,
 		const struct xt_target *target,
-		const void *targinfo,
-		void *userinfo)
+		const void *targinfo)
 {
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 0fce85e0550..6d668dcfc22 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -41,8 +41,7 @@ ipt_tcpmss_target(struct sk_buff **pskb,
 		  const struct net_device *out,
 		  unsigned int hooknum,
 		  const struct xt_target *target,
-		  const void *targinfo,
-		  void *userinfo)
+		  const void *targinfo)
 {
 	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
 	struct tcphdr *tcph;
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 52e9d705d48..043df013708 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -26,8 +26,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct ipt_tos_target_info *tosinfo = targinfo;
 	struct iphdr *iph = (*pskb)->nh.iph;
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 2afb2a8aa8c..164007107b5 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -23,7 +23,7 @@ static unsigned int
 ipt_ttl_target(struct sk_buff **pskb,
 	       const struct net_device *in, const struct net_device *out,
 	       unsigned int hooknum, const struct xt_target *target,
-	       const void *targinfo, void *userinfo)
+	       const void *targinfo)
 {
 	struct iphdr *iph;
 	const struct ipt_TTL_info *info = targinfo;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d46fd677fa1..4c5f0a11786 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -308,7 +308,7 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb,
 				    const struct net_device *out,
 				    unsigned int hooknum,
 				    const struct xt_target *target,
-				    const void *targinfo, void *userinfo)
+				    const void *targinfo)
 {
 	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
 
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 7f417484bfb..e2e7dd8d790 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -90,7 +90,7 @@ ipt_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+	return ipt_do_table(pskb, hook, in, out, &packet_filter);
 }
 
 static unsigned int
@@ -108,7 +108,7 @@ ipt_local_out_hook(unsigned int hook,
 		return NF_ACCEPT;
 	}
 
-	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+	return ipt_do_table(pskb, hook, in, out, &packet_filter);
 }
 
 static struct nf_hook_ops ipt_ops[] = {
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 4e7998beda6..79336cb4252 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -119,7 +119,7 @@ ipt_route_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+	return ipt_do_table(pskb, hook, in, out, &packet_mangler);
 }
 
 static unsigned int
@@ -148,7 +148,7 @@ ipt_local_hook(unsigned int hook,
 	daddr = (*pskb)->nh.iph->daddr;
 	tos = (*pskb)->nh.iph->tos;
 
-	ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+	ret = ipt_do_table(pskb, hook, in, out, &packet_mangler);
 	/* Reroute for ANY change. */
 	if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
 	    && ((*pskb)->nh.iph->saddr != saddr
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 7912cce1e1b..bcbeb4aeacd 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -95,7 +95,7 @@ ipt_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL);
+	return ipt_do_table(pskb, hook, in, out, &packet_raw);
 }
 
 /* 'raw' is the very first table. */
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c9d6b23cd3f..38cd7ffda9a 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -220,8 +220,7 @@ ip6t_error(struct sk_buff **pskb,
 	  const struct net_device *out,
 	  unsigned int hooknum,
 	  const struct xt_target *target,
-	  const void *targinfo,
-	  void *userinfo)
+	  const void *targinfo)
 {
 	if (net_ratelimit())
 		printk("ip6_tables: error: `%s'\n", (char *)targinfo);
@@ -258,8 +257,7 @@ ip6t_do_table(struct sk_buff **pskb,
 	      unsigned int hook,
 	      const struct net_device *in,
 	      const struct net_device *out,
-	      struct xt_table *table,
-	      void *userdata)
+	      struct xt_table *table)
 {
 	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
 	int offset = 0;
@@ -349,8 +347,7 @@ ip6t_do_table(struct sk_buff **pskb,
 								     in, out,
 								     hook,
 								     t->u.kernel.target,
-								     t->data,
-								     userdata);
+								     t->data);
 
 #ifdef CONFIG_NETFILTER_DEBUG
 				if (((struct ip6t_entry *)table_base)->comefrom
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index b8eff8ee69b..c85d124f9a3 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -22,7 +22,7 @@ static unsigned int ip6t_hl_target(struct sk_buff **pskb,
 				   const struct net_device *out,
 				   unsigned int hooknum,
 				   const struct xt_target *target,
-				   const void *targinfo, void *userinfo)
+				   const void *targinfo)
 {
 	struct ipv6hdr *ip6h;
 	const struct ip6t_HL_info *info = targinfo;
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 73c6300109d..acb91733e1f 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -427,8 +427,7 @@ ip6t_log_target(struct sk_buff **pskb,
 		const struct net_device *out,
 		unsigned int hooknum,
 		const struct xt_target *target,
-		const void *targinfo,
-		void *userinfo)
+		const void *targinfo)
 {
 	const struct ip6t_log_info *loginfo = targinfo;
 	struct nf_loginfo li;
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 7929ff40216..343acd3cbf5 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -180,8 +180,7 @@ static unsigned int reject6_target(struct sk_buff **pskb,
 			   const struct net_device *out,
 			   unsigned int hooknum,
 			   const struct xt_target *target,
-			   const void *targinfo,
-			   void *userinfo)
+			   const void *targinfo)
 {
 	const struct ip6t_reject_info *reject = targinfo;
 
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 60976c0c58e..2fc07c74dec 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -108,7 +108,7 @@ ip6t_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL);
+	return ip6t_do_table(pskb, hook, in, out, &packet_filter);
 }
 
 static unsigned int
@@ -128,7 +128,7 @@ ip6t_local_out_hook(unsigned int hook,
 	}
 #endif
 
-	return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL);
+	return ip6t_do_table(pskb, hook, in, out, &packet_filter);
 }
 
 static struct nf_hook_ops ip6t_ops[] = {
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 03a13eab1da..32db04fd831 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -138,7 +138,7 @@ ip6t_route_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+	return ip6t_do_table(pskb, hook, in, out, &packet_mangler);
 }
 
 static unsigned int
@@ -174,7 +174,7 @@ ip6t_local_hook(unsigned int hook,
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h);
 
-	ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+	ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler);
 
 	if (ret != NF_DROP && ret != NF_STOLEN 
 		&& (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr))
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 61a7c58e99f..b4154da575c 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -122,7 +122,7 @@ ip6t_hook(unsigned int hook,
 	 const struct net_device *out,
 	 int (*okfn)(struct sk_buff *))
 {
-	return ip6t_do_table(pskb, hook, in, out, &packet_raw, NULL);
+	return ip6t_do_table(pskb, hook, in, out, &packet_raw);
 }
 
 static struct nf_hook_ops ip6t_ops[] = { 
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index 1f92edd0593..50de965bb10 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -29,8 +29,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct xt_classify_target_info *clinfo = targinfo;
 
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index e577356b5c7..c2125f6ee12 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -38,8 +38,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct xt_connmark_target_info *markinfo = targinfo;
 	u_int32_t diff;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 48f7fc3c85c..4b9cc65bb82 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -66,7 +66,7 @@ static void secmark_restore(struct sk_buff *skb)
 static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
 			   const struct net_device *out, unsigned int hooknum,
 			   const struct xt_target *target,
-			   const void *targinfo, void *userinfo)
+			   const void *targinfo)
 {
 	struct sk_buff *skb = *pskb;
 	const struct xt_connsecmark_target_info *info = targinfo;
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index a1cd9723644..9d23c9580d8 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -32,8 +32,7 @@ static unsigned int target(struct sk_buff **pskb,
 			   const struct net_device *out,
 			   unsigned int hooknum,
 			   const struct xt_target *target,
-			   const void *targinfo,
-			   void *userinfo)
+			   const void *targinfo)
 {
 	const struct xt_DSCP_info *dinfo = targinfo;
 	u_int8_t dscp = ipv4_get_dsfield((*pskb)->nh.iph) >> XT_DSCP_SHIFT;
@@ -54,8 +53,7 @@ static unsigned int target6(struct sk_buff **pskb,
 			    const struct net_device *out,
 			    unsigned int hooknum,
 			    const struct xt_target *target,
-			    const void *targinfo,
-			    void *userinfo)
+			    const void *targinfo)
 {
 	const struct xt_DSCP_info *dinfo = targinfo;
 	u_int8_t dscp = ipv6_get_dsfield((*pskb)->nh.ipv6h) >> XT_DSCP_SHIFT;
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 0a612721946..95a171c8799 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -27,8 +27,7 @@ target_v0(struct sk_buff **pskb,
 	  const struct net_device *out,
 	  unsigned int hooknum,
 	  const struct xt_target *target,
-	  const void *targinfo,
-	  void *userinfo)
+	  const void *targinfo)
 {
 	const struct xt_mark_target_info *markinfo = targinfo;
 
@@ -44,8 +43,7 @@ target_v1(struct sk_buff **pskb,
 	  const struct net_device *out,
 	  unsigned int hooknum,
 	  const struct xt_target *target,
-	  const void *targinfo,
-	  void *userinfo)
+	  const void *targinfo)
 {
 	const struct xt_mark_target_info_v1 *markinfo = targinfo;
 	int mark = 0;
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 7b982283abd..db9b896e57c 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -29,8 +29,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	const struct xt_NFQ_info *tinfo = targinfo;
 
diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c
index cab881d4424..6d00dcaed23 100644
--- a/net/netfilter/xt_NOTRACK.c
+++ b/net/netfilter/xt_NOTRACK.c
@@ -16,8 +16,7 @@ target(struct sk_buff **pskb,
        const struct net_device *out,
        unsigned int hooknum,
        const struct xt_target *target,
-       const void *targinfo,
-       void *userinfo)
+       const void *targinfo)
 {
 	/* Previously seen (loopback)? Ignore. */
 	if ((*pskb)->nfct != NULL)
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 4300988786c..8a04dcf2611 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -31,7 +31,7 @@ static u8 mode;
 static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
 			   const struct net_device *out, unsigned int hooknum,
 			   const struct xt_target *target,
-			   const void *targinfo, void *userinfo)
+			   const void *targinfo)
 {
 	u32 secmark = 0;
 	const struct xt_secmark_target_info *info = targinfo;
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 2d49948d3c3..d725e8b8450 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -143,7 +143,7 @@ static int check(const char *tablename,
 	return 1;
 }
 
-static struct xt_match xt_connbytes_match = {
+static struct xt_match xt_connbytes_match[] = {
 	{
 		.name		= "connbytes",
 		.family		= AF_INET,
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 224c078a398..45a3143b862 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -222,7 +222,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 	ret = ipt->tcfi_t->u.kernel.target->target(&skb, skb->dev, NULL,
 						   ipt->tcfi_hook,
 						   ipt->tcfi_t->u.kernel.target,
-						   ipt->tcfi_t->data, NULL);
+						   ipt->tcfi_t->data);
 	switch (ret) {
 	case NF_ACCEPT:
 		result = TC_ACT_OK;
-- 
cgit v1.2.3-70-g09d2


From efa741656e9ebf5fd6e0432b0d1b3c7f156392d3 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 22 Aug 2006 00:36:37 -0700
Subject: [NETFILTER]: x_tables: remove unused size argument to check/destroy
 functions

The size is verified by x_tables and isn't needed by the modules anymore.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h   |  8 ++------
 net/ipv4/netfilter/arp_tables.c      |  5 +----
 net/ipv4/netfilter/arpt_mangle.c     |  2 +-
 net/ipv4/netfilter/ip_nat_rule.c     |  2 --
 net/ipv4/netfilter/ip_tables.c       | 14 +++-----------
 net/ipv4/netfilter/ipt_CLUSTERIP.c   |  4 +---
 net/ipv4/netfilter/ipt_ECN.c         |  1 -
 net/ipv4/netfilter/ipt_LOG.c         |  1 -
 net/ipv4/netfilter/ipt_MASQUERADE.c  |  1 -
 net/ipv4/netfilter/ipt_NETMAP.c      |  1 -
 net/ipv4/netfilter/ipt_REDIRECT.c    |  1 -
 net/ipv4/netfilter/ipt_REJECT.c      |  1 -
 net/ipv4/netfilter/ipt_SAME.c        |  4 +---
 net/ipv4/netfilter/ipt_TCPMSS.c      |  1 -
 net/ipv4/netfilter/ipt_TOS.c         |  1 -
 net/ipv4/netfilter/ipt_TTL.c         |  1 -
 net/ipv4/netfilter/ipt_ULOG.c        |  1 -
 net/ipv4/netfilter/ipt_ah.c          |  1 -
 net/ipv4/netfilter/ipt_ecn.c         |  3 +--
 net/ipv4/netfilter/ipt_hashlimit.c   |  4 +---
 net/ipv4/netfilter/ipt_owner.c       |  1 -
 net/ipv4/netfilter/ipt_recent.c      |  5 ++---
 net/ipv6/netfilter/ip6_tables.c      | 10 ++--------
 net/ipv6/netfilter/ip6t_HL.c         |  1 -
 net/ipv6/netfilter/ip6t_LOG.c        |  1 -
 net/ipv6/netfilter/ip6t_REJECT.c     |  1 -
 net/ipv6/netfilter/ip6t_ah.c         |  1 -
 net/ipv6/netfilter/ip6t_dst.c        |  1 -
 net/ipv6/netfilter/ip6t_frag.c       |  1 -
 net/ipv6/netfilter/ip6t_hbh.c        |  1 -
 net/ipv6/netfilter/ip6t_ipv6header.c |  1 -
 net/ipv6/netfilter/ip6t_owner.c      |  1 -
 net/ipv6/netfilter/ip6t_rt.c         |  1 -
 net/netfilter/xt_CONNMARK.c          |  1 -
 net/netfilter/xt_CONNSECMARK.c       |  2 +-
 net/netfilter/xt_DSCP.c              |  1 -
 net/netfilter/xt_MARK.c              |  2 --
 net/netfilter/xt_SECMARK.c           |  2 +-
 net/netfilter/xt_connbytes.c         |  1 -
 net/netfilter/xt_connmark.c          |  3 +--
 net/netfilter/xt_conntrack.c         |  3 +--
 net/netfilter/xt_dccp.c              |  1 -
 net/netfilter/xt_dscp.c              |  1 -
 net/netfilter/xt_esp.c               |  1 -
 net/netfilter/xt_helper.c            |  3 +--
 net/netfilter/xt_limit.c             |  1 -
 net/netfilter/xt_mark.c              |  1 -
 net/netfilter/xt_multiport.c         |  4 ----
 net/netfilter/xt_physdev.c           |  1 -
 net/netfilter/xt_policy.c            |  3 +--
 net/netfilter/xt_quota.c             |  2 +-
 net/netfilter/xt_sctp.c              |  1 -
 net/netfilter/xt_state.c             |  3 +--
 net/netfilter/xt_statistic.c         |  2 +-
 net/netfilter/xt_string.c            |  4 +---
 net/netfilter/xt_tcpudp.c            |  2 --
 net/sched/act_ipt.c                  |  4 +---
 57 files changed, 26 insertions(+), 106 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 9cef0e91542..9d97102a934 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -174,12 +174,10 @@ struct xt_match
 			  const void *ip,
 			  const struct xt_match *match,
 			  void *matchinfo,
-			  unsigned int matchinfosize,
 			  unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
-	void (*destroy)(const struct xt_match *match, void *matchinfo,
-			unsigned int matchinfosize);
+	void (*destroy)(const struct xt_match *match, void *matchinfo);
 
 	/* Called when userspace align differs from kernel space one */
 	int (*compat)(void *match, void **dstptr, int *size, int convert);
@@ -221,12 +219,10 @@ struct xt_target
 			  const void *entry,
 			  const struct xt_target *target,
 			  void *targinfo,
-			  unsigned int targinfosize,
 			  unsigned int hook_mask);
 
 	/* Called when entry of this type deleted. */
-	void (*destroy)(const struct xt_target *target, void *targinfo,
-			unsigned int targinfosize);
+	void (*destroy)(const struct xt_target *target, void *targinfo);
 
 	/* Called when userspace align differs from kernel space one */
 	int (*compat)(void *target, void **dstptr, int *size, int convert);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c6bd270bf46..4f10b06413a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -491,8 +491,6 @@ static inline int check_entry(struct arpt_entry *e, const char *name, unsigned i
 		}
 	} else if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
-						      t->u.target_size
-						      - sizeof(*t),
 						      e->comefrom)) {
 		duprintf("arp_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
@@ -559,8 +557,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
 
 	t = arpt_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
-					    t->u.target_size - sizeof(*t));
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
 	module_put(t->u.kernel.target->me);
 	return 0;
 }
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index 05fb2421bb2..d12b1df252a 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -67,7 +67,7 @@ target(struct sk_buff **pskb,
 
 static int
 checkentry(const char *tablename, const void *e, const struct xt_target *target,
-           void *targinfo, unsigned int targinfosize, unsigned int hook_mask)
+           void *targinfo, unsigned int hook_mask)
 {
 	const struct arpt_mangle *mangle = targinfo;
 
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 1aa0e4f462a..e59f5a8ecb6 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -172,7 +172,6 @@ static int ipt_snat_checkentry(const char *tablename,
 			       const void *entry,
 			       const struct ipt_target *target,
 			       void *targinfo,
-			       unsigned int targinfosize,
 			       unsigned int hook_mask)
 {
 	struct ip_nat_multi_range_compat *mr = targinfo;
@@ -189,7 +188,6 @@ static int ipt_dnat_checkentry(const char *tablename,
 			       const void *entry,
 			       const struct ipt_target *target,
 			       void *targinfo,
-			       unsigned int targinfosize,
 			       unsigned int hook_mask)
 {
 	struct ip_nat_multi_range_compat *mr = targinfo;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8ce5b6f7644..a0f36806998 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -464,8 +464,7 @@ cleanup_match(struct ipt_entry_match *m, unsigned int *i)
 		return 1;
 
 	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->u.kernel.match, m->data,
-					   m->u.match_size - sizeof(*m));
+		m->u.kernel.match->destroy(m->u.kernel.match, m->data);
 	module_put(m->u.kernel.match->me);
 	return 0;
 }
@@ -518,7 +517,6 @@ check_match(struct ipt_entry_match *m,
 
 	if (m->u.kernel.match->checkentry
 	    && !m->u.kernel.match->checkentry(name, ip, match, m->data,
-					      m->u.match_size - sizeof(*m),
 					      hookmask)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 m->u.kernel.match->name);
@@ -579,8 +577,6 @@ check_entry(struct ipt_entry *e, const char *name, unsigned int size,
 		}
 	} else if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
-						      t->u.target_size
-						      - sizeof(*t),
 						      e->comefrom)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
@@ -652,8 +648,7 @@ cleanup_entry(struct ipt_entry *e, unsigned int *i)
 	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ipt_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
-					    t->u.target_size - sizeof(*t));
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
 	module_put(t->u.kernel.target->me);
 	return 0;
 }
@@ -1599,7 +1594,6 @@ static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
 
 	if (m->u.kernel.match->checkentry
 	    && !m->u.kernel.match->checkentry(name, ip, match, dm->data,
-					      dm->u.match_size - sizeof(*dm),
 					      hookmask)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 m->u.kernel.match->name);
@@ -1658,8 +1652,7 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
 			goto out;
 	} else if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, de, target,
-				t->data, t->u.target_size - sizeof(*t),
-				de->comefrom)) {
+						      t->data, de->comefrom)) {
 		duprintf("ip_tables: compat: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
 		goto out;
@@ -2182,7 +2175,6 @@ icmp_checkentry(const char *tablename,
 	   const void *info,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct ipt_icmp *icmpinfo = matchinfo;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index a08383cf9e7..41589665fc5 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -372,7 +372,6 @@ checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
            void *targinfo,
-           unsigned int targinfosize,
            unsigned int hook_mask)
 {
 	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
@@ -449,8 +448,7 @@ checkentry(const char *tablename,
 }
 
 /* drop reference count of cluster config when rule is deleted */
-static void destroy(const struct xt_target *target, void *targinfo,
-		    unsigned int targinfosize)
+static void destroy(const struct xt_target *target, void *targinfo)
 {
 	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
 
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index 1c3da4a48e5..23f9c7ebe7e 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -106,7 +106,6 @@ checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
            void *targinfo,
-           unsigned int targinfosize,
            unsigned int hook_mask)
 {
 	const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index a8d356c6191..7dc820df8bc 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -439,7 +439,6 @@ static int ipt_log_checkentry(const char *tablename,
 			      const void *e,
 			      const struct xt_target *target,
 			      void *targinfo,
-			      unsigned int targinfosize,
 			      unsigned int hook_mask)
 {
 	const struct ipt_log_info *loginfo = targinfo;
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 9659793c66c..bc65168a343 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -42,7 +42,6 @@ masquerade_check(const char *tablename,
 		 const void *e,
 		 const struct xt_target *target,
 		 void *targinfo,
-		 unsigned int targinfosize,
 		 unsigned int hook_mask)
 {
 	const struct ip_nat_multi_range_compat *mr = targinfo;
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index fd5e74a19fb..beb2914225f 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -33,7 +33,6 @@ check(const char *tablename,
       const void *e,
       const struct xt_target *target,
       void *targinfo,
-      unsigned int targinfosize,
       unsigned int hook_mask)
 {
 	const struct ip_nat_multi_range_compat *mr = targinfo;
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 839fe99f71d..f03d43671c6 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -36,7 +36,6 @@ redirect_check(const char *tablename,
 	       const void *e,
 	       const struct xt_target *target,
 	       void *targinfo,
-	       unsigned int targinfosize,
 	       unsigned int hook_mask)
 {
 	const struct ip_nat_multi_range_compat *mr = targinfo;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1dfd8e56be8..b81821edd89 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -276,7 +276,6 @@ static int check(const char *tablename,
 		 const void *e_void,
 		 const struct xt_target *target,
 		 void *targinfo,
-		 unsigned int targinfosize,
 		 unsigned int hook_mask)
 {
  	const struct ipt_reject_info *rejinfo = targinfo;
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
index cf801749490..efbcb119883 100644
--- a/net/ipv4/netfilter/ipt_SAME.c
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -52,7 +52,6 @@ same_check(const char *tablename,
 	      const void *e,
 	      const struct xt_target *target,
 	      void *targinfo,
-	      unsigned int targinfosize,
 	      unsigned int hook_mask)
 {
 	unsigned int count, countess, rangeip, index = 0;
@@ -116,8 +115,7 @@ same_check(const char *tablename,
 }
 
 static void 
-same_destroy(const struct xt_target *target, void *targinfo,
-		unsigned int targinfosize)
+same_destroy(const struct xt_target *target, void *targinfo)
 {
 	struct ipt_same_info *mr = targinfo;
 
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
index 6d668dcfc22..ac8a35eeea3 100644
--- a/net/ipv4/netfilter/ipt_TCPMSS.c
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -207,7 +207,6 @@ ipt_tcpmss_checkentry(const char *tablename,
 		      const void *e_void,
 		      const struct xt_target *target,
 		      void *targinfo,
-		      unsigned int targinfosize,
 		      unsigned int hook_mask)
 {
 	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
index 043df013708..471a4c438b0 100644
--- a/net/ipv4/netfilter/ipt_TOS.c
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -49,7 +49,6 @@ checkentry(const char *tablename,
 	   const void *e_void,
 	   const struct xt_target *target,
            void *targinfo,
-           unsigned int targinfosize,
            unsigned int hook_mask)
 {
 	const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c
index 164007107b5..214d9d9c428 100644
--- a/net/ipv4/netfilter/ipt_TTL.c
+++ b/net/ipv4/netfilter/ipt_TTL.c
@@ -67,7 +67,6 @@ static int ipt_ttl_checkentry(const char *tablename,
 		const void *e,
 		const struct xt_target *target,
 		void *targinfo,
-		unsigned int targinfosize,
 		unsigned int hook_mask)
 {
 	struct ipt_TTL_info *info = targinfo;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 4c5f0a11786..2b104ea54f4 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -346,7 +346,6 @@ static int ipt_ulog_checkentry(const char *tablename,
 			       const void *e,
 			       const struct xt_target *target,
 			       void *targinfo,
-			       unsigned int targinfosize,
 			       unsigned int hookmask)
 {
 	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 2927135873d..1798f86bc53 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -74,7 +74,6 @@ checkentry(const char *tablename,
 	   const void *ip_void,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ipt_ah *ahinfo = matchinfo;
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index b2825041493..dafbdec0efc 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -88,8 +88,7 @@ static int match(const struct sk_buff *skb,
 
 static int checkentry(const char *tablename, const void *ip_void,
 		      const struct xt_match *match,
-		      void *matchinfo, unsigned int matchsize,
-		      unsigned int hook_mask)
+		      void *matchinfo, unsigned int hook_mask)
 {
 	const struct ipt_ecn_info *info = matchinfo;
 	const struct ipt_ip *ip = ip_void;
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index 3bd2368e1fc..b5b74b07370 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -478,7 +478,6 @@ hashlimit_checkentry(const char *tablename,
 		     const void *inf,
 		     const struct xt_match *match,
 		     void *matchinfo,
-		     unsigned int matchsize,
 		     unsigned int hook_mask)
 {
 	struct ipt_hashlimit_info *r = matchinfo;
@@ -529,8 +528,7 @@ hashlimit_checkentry(const char *tablename,
 }
 
 static void
-hashlimit_destroy(const struct xt_match *match, void *matchinfo,
-		  unsigned int matchsize)
+hashlimit_destroy(const struct xt_match *match, void *matchinfo)
 {
 	struct ipt_hashlimit_info *r = matchinfo;
 
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index 5ac6ac023b5..78c336f12a9 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -56,7 +56,6 @@ checkentry(const char *tablename,
            const void *ip,
 	   const struct xt_match *match,
            void *matchinfo,
-           unsigned int matchsize,
            unsigned int hook_mask)
 {
 	const struct ipt_owner_info *info = matchinfo;
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 682c0946201..32ae8d7ac50 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -238,7 +238,7 @@ out:
 static int
 ipt_recent_checkentry(const char *tablename, const void *ip,
 		      const struct xt_match *match, void *matchinfo,
-		      unsigned int matchsize, unsigned int hook_mask)
+		      unsigned int hook_mask)
 {
 	const struct ipt_recent_info *info = matchinfo;
 	struct recent_table *t;
@@ -294,8 +294,7 @@ out:
 }
 
 static void
-ipt_recent_destroy(const struct xt_match *match, void *matchinfo,
-		   unsigned int matchsize)
+ipt_recent_destroy(const struct xt_match *match, void *matchinfo)
 {
 	const struct ipt_recent_info *info = matchinfo;
 	struct recent_table *t;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 38cd7ffda9a..d1c315364ee 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -504,8 +504,7 @@ cleanup_match(struct ip6t_entry_match *m, unsigned int *i)
 		return 1;
 
 	if (m->u.kernel.match->destroy)
-		m->u.kernel.match->destroy(m->u.kernel.match, m->data,
-					   m->u.match_size - sizeof(*m));
+		m->u.kernel.match->destroy(m->u.kernel.match, m->data);
 	module_put(m->u.kernel.match->me);
 	return 0;
 }
@@ -558,7 +557,6 @@ check_match(struct ip6t_entry_match *m,
 
 	if (m->u.kernel.match->checkentry
 	    && !m->u.kernel.match->checkentry(name, ipv6, match,  m->data,
-					      m->u.match_size - sizeof(*m),
 					      hookmask)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 m->u.kernel.match->name);
@@ -619,8 +617,6 @@ check_entry(struct ip6t_entry *e, const char *name, unsigned int size,
 		}
 	} else if (t->u.kernel.target->checkentry
 		   && !t->u.kernel.target->checkentry(name, e, target, t->data,
-						      t->u.target_size
-						      - sizeof(*t),
 						      e->comefrom)) {
 		duprintf("ip_tables: check failed for `%s'.\n",
 			 t->u.kernel.target->name);
@@ -692,8 +688,7 @@ cleanup_entry(struct ip6t_entry *e, unsigned int *i)
 	IP6T_MATCH_ITERATE(e, cleanup_match, NULL);
 	t = ip6t_get_target(e);
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
-					    t->u.target_size - sizeof(*t));
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
 	module_put(t->u.kernel.target->me);
 	return 0;
 }
@@ -1349,7 +1344,6 @@ icmp6_checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_icmp *icmpinfo = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_HL.c b/net/ipv6/netfilter/ip6t_HL.c
index c85d124f9a3..e54ea92d107 100644
--- a/net/ipv6/netfilter/ip6t_HL.c
+++ b/net/ipv6/netfilter/ip6t_HL.c
@@ -66,7 +66,6 @@ static int ip6t_hl_checkentry(const char *tablename,
 		const void *entry,
 		const struct xt_target *target,
 		void *targinfo,
-		unsigned int targinfosize,
 		unsigned int hook_mask)
 {
 	struct ip6t_HL_info *info = targinfo;
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index acb91733e1f..0cf537d3018 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -451,7 +451,6 @@ static int ip6t_log_checkentry(const char *tablename,
 			       const void *entry,
 			       const struct xt_target *target,
 			       void *targinfo,
-			       unsigned int targinfosize,
 			       unsigned int hook_mask)
 {
 	const struct ip6t_log_info *loginfo = targinfo;
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 343acd3cbf5..311eae82feb 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -223,7 +223,6 @@ static int check(const char *tablename,
 		 const void *entry,
 		 const struct xt_target *target,
 		 void *targinfo,
-		 unsigned int targinfosize,
 		 unsigned int hook_mask)
 {
  	const struct ip6t_reject_info *rejinfo = targinfo;
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 2f7bb20c758..ec1b1608156 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -102,7 +102,6 @@ checkentry(const char *tablename,
           const void *entry,
 	  const struct xt_match *match,
           void *matchinfo,
-          unsigned int matchinfosize,
           unsigned int hook_mask)
 {
 	const struct ip6t_ah *ahinfo = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c
index 9422413d057..223c335467c 100644
--- a/net/ipv6/netfilter/ip6t_dst.c
+++ b/net/ipv6/netfilter/ip6t_dst.c
@@ -182,7 +182,6 @@ checkentry(const char *tablename,
 	   const void *info,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_opts *optsinfo = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 06768c84bd3..78d9c8b9e28 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -119,7 +119,6 @@ checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_frag *fraginfo = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 374f1be85c0..72defc81656 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -182,7 +182,6 @@ checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_opts *optsinfo = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 9375eeb1369..3093c398002 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -128,7 +128,6 @@ ipv6header_checkentry(const char *tablename,
 		      const void *ip,
 		      const struct xt_match *match,
 		      void *matchinfo,
-		      unsigned int matchsize,
 		      unsigned int hook_mask)
 {
 	const struct ip6t_ipv6header_info *info = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 5d047990cd4..4eb9bbc4ebc 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -57,7 +57,6 @@ checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_owner_info *info = matchinfo;
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index fbb0184a41d..bcb2e168a5b 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -197,7 +197,6 @@ checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct ip6t_rt *rtinfo = matchinfo;
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index c2125f6ee12..0e4249ddc17 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -89,7 +89,6 @@ checkentry(const char *tablename,
 	   const void *entry,
 	   const struct xt_target *target,
 	   void *targinfo,
-	   unsigned int targinfosize,
 	   unsigned int hook_mask)
 {
 	struct xt_connmark_target_info *matchinfo = targinfo;
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 4b9cc65bb82..4b0e14bb172 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -89,7 +89,7 @@ static unsigned int target(struct sk_buff **pskb, const struct net_device *in,
 
 static int checkentry(const char *tablename, const void *entry,
 		      const struct xt_target *target, void *targinfo,
-		      unsigned int targinfosize, unsigned int hook_mask)
+		      unsigned int hook_mask)
 {
 	struct xt_connsecmark_target_info *info = targinfo;
 
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 9d23c9580d8..a7cc75aeb38 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -72,7 +72,6 @@ static int checkentry(const char *tablename,
 		      const void *e_void,
 		      const struct xt_target *target,
 		      void *targinfo,
-		      unsigned int targinfosize,
 		      unsigned int hook_mask)
 {
 	const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp;
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 95a171c8799..782f8d8c3ed 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -74,7 +74,6 @@ checkentry_v0(const char *tablename,
 	      const void *entry,
 	      const struct xt_target *target,
 	      void *targinfo,
-	      unsigned int targinfosize,
 	      unsigned int hook_mask)
 {
 	struct xt_mark_target_info *markinfo = targinfo;
@@ -91,7 +90,6 @@ checkentry_v1(const char *tablename,
 	      const void *entry,
 	      const struct xt_target *target,
 	      void *targinfo,
-	      unsigned int targinfosize,
 	      unsigned int hook_mask)
 {
 	struct xt_mark_target_info_v1 *markinfo = targinfo;
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 8a04dcf2611..451b67c4bb5 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -85,7 +85,7 @@ static int checkentry_selinux(struct xt_secmark_target_info *info)
 
 static int checkentry(const char *tablename, const void *entry,
 		      const struct xt_target *target, void *targinfo,
-		      unsigned int targinfosize, unsigned int hook_mask)
+		      unsigned int hook_mask)
 {
 	struct xt_secmark_target_info *info = targinfo;
 
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index d725e8b8450..dcc497ea818 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -125,7 +125,6 @@ static int check(const char *tablename,
 		 const void *ip,
 		 const struct xt_match *match,
 		 void *matchinfo,
-		 unsigned int matchsize,
 		 unsigned int hook_mask)
 {
 	const struct xt_connbytes_info *sinfo = matchinfo;
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index a97b2d455b7..c9104d05a19 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -55,7 +55,6 @@ checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	struct xt_connmark_info *cm = matchinfo;
@@ -75,7 +74,7 @@ checkentry(const char *tablename,
 }
 
 static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
+destroy(const struct xt_match *match, void *matchinfo)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_ct_l3proto_module_put(match->family);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 1540885174e..39c57e9f756 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -208,7 +208,6 @@ checkentry(const char *tablename,
 	   const void *ip,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -222,7 +221,7 @@ checkentry(const char *tablename,
 }
 
 static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
+destroy(const struct xt_match *match, void *matchinfo)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_ct_l3proto_module_put(match->family);
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c
index 5ca6f5288f4..3e6cf430e51 100644
--- a/net/netfilter/xt_dccp.c
+++ b/net/netfilter/xt_dccp.c
@@ -131,7 +131,6 @@ checkentry(const char *tablename,
 	   const void *inf,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct xt_dccp_info *info = matchinfo;
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index d84075c3015..26c7f4ad102 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -58,7 +58,6 @@ static int checkentry(const char *tablename,
 		      const void *info,
 		      const struct xt_match *match,
 		      void *matchinfo,
-		      unsigned int matchsize,
 		      unsigned int hook_mask)
 {
 	const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp;
diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c
index 7b19bc9ea20..7c95f149d94 100644
--- a/net/netfilter/xt_esp.c
+++ b/net/netfilter/xt_esp.c
@@ -79,7 +79,6 @@ checkentry(const char *tablename,
 	   const void *ip_void,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchinfosize,
 	   unsigned int hook_mask)
 {
 	const struct xt_esp *espinfo = matchinfo;
diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c
index db453a7a154..5d7818b73e3 100644
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -139,7 +139,6 @@ static int check(const char *tablename,
 		 const void *inf,
 		 const struct xt_match *match,
 		 void *matchinfo,
-		 unsigned int matchsize,
 		 unsigned int hook_mask)
 {
 	struct xt_helper_info *info = matchinfo;
@@ -156,7 +155,7 @@ static int check(const char *tablename,
 }
 
 static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
+destroy(const struct xt_match *match, void *matchinfo)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_ct_l3proto_module_put(match->family);
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index e8d5e7ac695..b9c9ff3a06e 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -110,7 +110,6 @@ ipt_limit_checkentry(const char *tablename,
 		     const void *inf,
 		     const struct xt_match *match,
 		     void *matchinfo,
-		     unsigned int matchsize,
 		     unsigned int hook_mask)
 {
 	struct xt_rateinfo *r = matchinfo;
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 39f9b079f5d..e8059cd1727 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -39,7 +39,6 @@ checkentry(const char *tablename,
            const void *entry,
 	   const struct xt_match *match,
            void *matchinfo,
-           unsigned int matchsize,
            unsigned int hook_mask)
 {
 	const struct xt_mark_info *minfo = matchinfo;
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index e74f9bb98b3..d3aefd38093 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -176,7 +176,6 @@ checkentry(const char *tablename,
 	   const void *info,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct ipt_ip *ip = info;
@@ -191,7 +190,6 @@ checkentry_v1(const char *tablename,
 	      const void *info,
 	      const struct xt_match *match,
 	      void *matchinfo,
-	      unsigned int matchsize,
 	      unsigned int hook_mask)
 {
 	const struct ipt_ip *ip = info;
@@ -206,7 +204,6 @@ checkentry6(const char *tablename,
 	    const void *info,
 	    const struct xt_match *match,
 	    void *matchinfo,
-	    unsigned int matchsize,
 	    unsigned int hook_mask)
 {
 	const struct ip6t_ip6 *ip = info;
@@ -221,7 +218,6 @@ checkentry6_v1(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
 	       void *matchinfo,
-	       unsigned int matchsize,
 	       unsigned int hook_mask)
 {
 	const struct ip6t_ip6 *ip = info;
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index af3d70f96ec..fd8f954cded 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -106,7 +106,6 @@ checkentry(const char *tablename,
 		       const void *ip,
 		       const struct xt_match *match,
 		       void *matchinfo,
-		       unsigned int matchsize,
 		       unsigned int hook_mask)
 {
 	const struct xt_physdev_info *info = matchinfo;
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f5639c45111..e9d81378d65 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -135,8 +135,7 @@ static int match(const struct sk_buff *skb,
 
 static int checkentry(const char *tablename, const void *ip_void,
                       const struct xt_match *match,
-                      void *matchinfo, unsigned int matchsize,
-                      unsigned int hook_mask)
+                      void *matchinfo, unsigned int hook_mask)
 {
 	struct xt_policy_info *info = matchinfo;
 
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c
index cc44f87cb8e..b75fa2c70e6 100644
--- a/net/netfilter/xt_quota.c
+++ b/net/netfilter/xt_quota.c
@@ -41,7 +41,7 @@ match(const struct sk_buff *skb,
 static int
 checkentry(const char *tablename, const void *entry,
 	   const struct xt_match *match, void *matchinfo,
-	   unsigned int matchsize, unsigned int hook_mask)
+	   unsigned int hook_mask)
 {
 	struct xt_quota_info *q = (struct xt_quota_info *)matchinfo;
 
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c
index 5628621170e..7956acaaa24 100644
--- a/net/netfilter/xt_sctp.c
+++ b/net/netfilter/xt_sctp.c
@@ -163,7 +163,6 @@ checkentry(const char *tablename,
 	   const void *inf,
 	   const struct xt_match *match,
 	   void *matchinfo,
-	   unsigned int matchsize,
 	   unsigned int hook_mask)
 {
 	const struct xt_sctp_info *info = matchinfo;
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 5f9492e3b2b..d9010b16a1f 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -48,7 +48,6 @@ static int check(const char *tablename,
 		 const void *inf,
 		 const struct xt_match *match,
 		 void *matchinfo,
-		 unsigned int matchsize,
 		 unsigned int hook_mask)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@ -62,7 +61,7 @@ static int check(const char *tablename,
 }
 
 static void
-destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize)
+destroy(const struct xt_match *match, void *matchinfo)
 {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	nf_ct_l3proto_module_put(match->family);
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 5181630a87f..091a9f89f5d 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -55,7 +55,7 @@ match(const struct sk_buff *skb,
 static int
 checkentry(const char *tablename, const void *entry,
 	   const struct xt_match *match, void *matchinfo,
-	   unsigned int matchsize, unsigned int hook_mask)
+	   unsigned int hook_mask)
 {
 	struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo;
 
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 1a1c1d17d85..4453252400a 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -46,7 +46,6 @@ static int checkentry(const char *tablename,
 		      const void *ip,
 		      const struct xt_match *match,
 		      void *matchinfo,
-		      unsigned int matchsize,
 		      unsigned int hook_mask)
 {
 	struct xt_string_info *conf = matchinfo;
@@ -69,8 +68,7 @@ static int checkentry(const char *tablename,
 	return 1;
 }
 
-static void destroy(const struct xt_match *match, void *matchinfo,
-		    unsigned int matchsize)
+static void destroy(const struct xt_match *match, void *matchinfo)
 {
 	textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config);
 }
diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c
index 54aab051af8..e76a68e0bc6 100644
--- a/net/netfilter/xt_tcpudp.c
+++ b/net/netfilter/xt_tcpudp.c
@@ -141,7 +141,6 @@ tcp_checkentry(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
 	       void *matchinfo,
-	       unsigned int matchsize,
 	       unsigned int hook_mask)
 {
 	const struct xt_tcp *tcpinfo = matchinfo;
@@ -190,7 +189,6 @@ udp_checkentry(const char *tablename,
 	       const void *info,
 	       const struct xt_match *match,
 	       void *matchinfo,
-	       unsigned int matchsize,
 	       unsigned int hook_mask)
 {
 	const struct xt_tcp *udpinfo = matchinfo;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 45a3143b862..d8c9310da6e 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -69,7 +69,6 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 	if (t->u.kernel.target->checkentry
 	    && !t->u.kernel.target->checkentry(table, NULL,
 		    			       t->u.kernel.target, t->data,
-					       t->u.target_size - sizeof(*t),
 					       hook)) {
 		module_put(t->u.kernel.target->me);
 		ret = -EINVAL;
@@ -81,8 +80,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 static void ipt_destroy_target(struct ipt_entry_target *t)
 {
 	if (t->u.kernel.target->destroy)
-		t->u.kernel.target->destroy(t->u.kernel.target, t->data,
-		                            t->u.target_size - sizeof(*t));
+		t->u.kernel.target->destroy(t->u.kernel.target, t->data);
         module_put(t->u.kernel.target->me);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 53e26658282373b84ba85a0c9807cb762f7738a6 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 22 Aug 2006 00:43:20 -0700
Subject: [NETFILTER]: nfnetlink: remove unnecessary packed attributes

Remove unnecessary packed attributes in nfnetlink structures. Unfortunately
in a few cases they have to stay to avoid changing structure sizes.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/nfnetlink.h       | 4 ++--
 include/linux/netfilter/nfnetlink_log.h   | 6 +++---
 include/linux/netfilter/nfnetlink_queue.h | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 9f5b12cf489..6d8e3e5a80e 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -43,7 +43,7 @@ struct nfattr
 	u_int16_t nfa_len;
 	u_int16_t nfa_type;	/* we use 15 bits for the type, and the highest
 				 * bit to indicate whether the payload is nested */
-} __attribute__ ((packed));
+};
 
 /* FIXME: Apart from NFNL_NFA_NESTED shamelessly copy and pasted from
  * rtnetlink.h, it's time to put this in a generic file */
@@ -79,7 +79,7 @@ struct nfgenmsg {
 	u_int8_t  nfgen_family;		/* AF_xxx */
 	u_int8_t  version;		/* nfnetlink version */
 	u_int16_t res_id;		/* resource id */
-} __attribute__ ((packed));
+};
 
 #define NFNETLINK_V0	0
 
diff --git a/include/linux/netfilter/nfnetlink_log.h b/include/linux/netfilter/nfnetlink_log.h
index a7497c7436d..87b92f8b988 100644
--- a/include/linux/netfilter/nfnetlink_log.h
+++ b/include/linux/netfilter/nfnetlink_log.h
@@ -19,18 +19,18 @@ struct nfulnl_msg_packet_hdr {
 	u_int16_t	hw_protocol;	/* hw protocol (network order) */
 	u_int8_t	hook;		/* netfilter hook */
 	u_int8_t	_pad;
-} __attribute__ ((packed));
+};
 
 struct nfulnl_msg_packet_hw {
 	u_int16_t	hw_addrlen;
 	u_int16_t	_pad;
 	u_int8_t	hw_addr[8];
-} __attribute__ ((packed));
+};
 
 struct nfulnl_msg_packet_timestamp {
 	aligned_u64	sec;
 	aligned_u64	usec;
-} __attribute__ ((packed));
+};
 
 #define NFULNL_PREFIXLEN	30	/* just like old log target */
 
diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
index 9e774373244..36af0360b56 100644
--- a/include/linux/netfilter/nfnetlink_queue.h
+++ b/include/linux/netfilter/nfnetlink_queue.h
@@ -22,12 +22,12 @@ struct nfqnl_msg_packet_hw {
 	u_int16_t	hw_addrlen;
 	u_int16_t	_pad;
 	u_int8_t	hw_addr[8];
-} __attribute__ ((packed));
+};
 
 struct nfqnl_msg_packet_timestamp {
 	aligned_u64	sec;
 	aligned_u64	usec;
-} __attribute__ ((packed));
+};
 
 enum nfqnl_attr_type {
 	NFQA_UNSPEC,
@@ -49,7 +49,7 @@ enum nfqnl_attr_type {
 struct nfqnl_msg_verdict_hdr {
 	u_int32_t verdict;
 	u_int32_t id;
-} __attribute__ ((packed));
+};
 
 
 enum nfqnl_msg_config_cmds {
@@ -64,7 +64,7 @@ struct nfqnl_msg_config_cmd {
 	u_int8_t	command;	/* nfqnl_msg_config_cmds */
 	u_int8_t	_pad;
 	u_int16_t	pf;		/* AF_xxx for PF_[UN]BIND */
-} __attribute__ ((packed));
+};
 
 enum nfqnl_config_mode {
 	NFQNL_COPY_NONE,
-- 
cgit v1.2.3-70-g09d2


From 91270cf81765152f6e77953440beb4d3b34a71b5 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 22 Aug 2006 00:43:38 -0700
Subject: [NETFILTER]: x_tables: add data member to struct xt_match

Shared match functions can use this to make runtime decisions basen on the
used match.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 9d97102a934..03d1027fb0e 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -185,6 +185,9 @@ struct xt_match
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
 
+	/* Free to use by each match */
+	unsigned long data;
+
 	char *table;
 	unsigned int matchsize;
 	unsigned int hooks;
-- 
cgit v1.2.3-70-g09d2


From 3fd091e73b81f131e1567c4d4a1ec042940bf2f7 Mon Sep 17 00:00:00 2001
From: Vladislav Yasevich <vladislav.yasevich@hp.com>
Date: Tue, 22 Aug 2006 13:29:17 -0700
Subject: [SCTP]: Remove multiple levels of msecs to jiffies conversions.

The SCTP sysctl entries are displayed in milliseconds, but stored
internally in jiffies. This results in multiple levels of msecs to
jiffies conversion and as a result produces a truncation error. This
patch makes things consistent in that we store and display defaults
in milliseconds and only convert once for use by association.
This patch also adds some sane min/max values so that we don't go off
the deep end.

Signed-off-by: Vladislav Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h |  13 ++--
 include/net/sctp/structs.h   |  12 ++--
 net/sctp/protocol.c          |   2 +-
 net/sctp/socket.c            |  15 +++--
 net/sctp/sysctl.c            | 140 +++++++++++++++++++------------------------
 net/sctp/transport.c         |   2 +-
 6 files changed, 84 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 57166bfdf8e..6c632e26f72 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -264,10 +264,10 @@ enum { SCTP_MAX_DUP_TSNS = 16 };
 enum { SCTP_MAX_GABS = 16 };
 
 /* Heartbeat interval - 30 secs */
-#define SCTP_DEFAULT_TIMEOUT_HEARTBEAT	(30 * HZ)
+#define SCTP_DEFAULT_TIMEOUT_HEARTBEAT	(30*1000)
 
 /* Delayed sack timer - 200ms */
-#define SCTP_DEFAULT_TIMEOUT_SACK	((200 * HZ) / 1000)
+#define SCTP_DEFAULT_TIMEOUT_SACK	(200)
 
 /* RTO.Initial              - 3  seconds
  * RTO.Min                  - 1  second
@@ -275,9 +275,9 @@ enum { SCTP_MAX_GABS = 16 };
  * RTO.Alpha                - 1/8
  * RTO.Beta                 - 1/4
  */
-#define SCTP_RTO_INITIAL	(3 * HZ)
-#define SCTP_RTO_MIN		(1 * HZ)
-#define SCTP_RTO_MAX		(60 * HZ)
+#define SCTP_RTO_INITIAL	(3 * 1000)
+#define SCTP_RTO_MIN		(1 * 1000)
+#define SCTP_RTO_MAX		(60 * 1000)
 
 #define SCTP_RTO_ALPHA          3   /* 1/8 when converted to right shifts. */
 #define SCTP_RTO_BETA           2   /* 1/4 when converted to right shifts. */
@@ -290,8 +290,7 @@ enum { SCTP_MAX_GABS = 16 };
 #define SCTP_DEF_MAX_INIT 6
 #define SCTP_DEF_MAX_SEND 10
 
-#define SCTP_DEFAULT_COOKIE_LIFE_SEC	60 /* seconds */
-#define SCTP_DEFAULT_COOKIE_LIFE_USEC	0  /* microseconds */
+#define SCTP_DEFAULT_COOKIE_LIFE	(60 * 1000) /* 60 seconds */
 
 #define SCTP_DEFAULT_MINWINDOW	1500	/* default minimum rwnd size */
 #define SCTP_DEFAULT_MAXWINDOW	65535	/* default rwnd size */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 0412e730c76..c6d93bb0dcd 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -128,9 +128,9 @@ extern struct sctp_globals {
 	 * RTO.Alpha		    - 1/8  (3 when converted to right shifts.)
 	 * RTO.Beta		    - 1/4  (2 when converted to right shifts.)
 	 */
-	unsigned long rto_initial;
-	unsigned long rto_min;
-	unsigned long rto_max;
+	unsigned int rto_initial;
+	unsigned int rto_min;
+	unsigned int rto_max;
 
 	/* Note: rto_alpha and rto_beta are really defined as inverse
 	 * powers of two to facilitate integer operations.
@@ -145,13 +145,13 @@ extern struct sctp_globals {
 	int cookie_preserve_enable;
 
 	/* Valid.Cookie.Life	    - 60  seconds  */
-	unsigned long valid_cookie_life;
+	unsigned int valid_cookie_life;
 
 	/* Delayed SACK timeout  200ms default*/
-	unsigned long sack_timeout;
+	unsigned int sack_timeout;
 
 	/* HB.interval		    - 30 seconds  */
-	unsigned long hb_interval;
+	unsigned int hb_interval;
 
 	/* Association.Max.Retrans  - 10 attempts
 	 * Path.Max.Retrans	    - 5	 attempts (per destination address)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 1ab03a27a76..5692ef5485d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1049,7 +1049,7 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_rto_beta			= SCTP_RTO_BETA;
 
 	/* Valid.Cookie.Life        - 60  seconds */
-	sctp_valid_cookie_life		= 60 * HZ;
+	sctp_valid_cookie_life		= SCTP_DEFAULT_COOKIE_LIFE;
 
 	/* Whether Cookie Preservative is enabled(1) or not(0) */
 	sctp_cookie_preserve_enable 	= 1;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3b6e82cb372..7c1dbb1d10d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3045,14 +3045,14 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	sp->initmsg.sinit_num_ostreams   = sctp_max_outstreams;
 	sp->initmsg.sinit_max_instreams  = sctp_max_instreams;
 	sp->initmsg.sinit_max_attempts   = sctp_max_retrans_init;
-	sp->initmsg.sinit_max_init_timeo = jiffies_to_msecs(sctp_rto_max);
+	sp->initmsg.sinit_max_init_timeo = sctp_rto_max;
 
 	/* Initialize default RTO related parameters.  These parameters can
 	 * be modified for with the SCTP_RTOINFO socket option.
 	 */
-	sp->rtoinfo.srto_initial = jiffies_to_msecs(sctp_rto_initial);
-	sp->rtoinfo.srto_max     = jiffies_to_msecs(sctp_rto_max);
-	sp->rtoinfo.srto_min     = jiffies_to_msecs(sctp_rto_min);
+	sp->rtoinfo.srto_initial = sctp_rto_initial;
+	sp->rtoinfo.srto_max     = sctp_rto_max;
+	sp->rtoinfo.srto_min     = sctp_rto_min;
 
 	/* Initialize default association related parameters. These parameters
 	 * can be modified with the SCTP_ASSOCINFO socket option.
@@ -3061,8 +3061,7 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	sp->assocparams.sasoc_number_peer_destinations = 0;
 	sp->assocparams.sasoc_peer_rwnd = 0;
 	sp->assocparams.sasoc_local_rwnd = 0;
-	sp->assocparams.sasoc_cookie_life = 
-		jiffies_to_msecs(sctp_valid_cookie_life);
+	sp->assocparams.sasoc_cookie_life = sctp_valid_cookie_life;
 
 	/* Initialize default event subscriptions. By default, all the
 	 * options are off. 
@@ -3072,10 +3071,10 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
 	/* Default Peer Address Parameters.  These defaults can
 	 * be modified via SCTP_PEER_ADDR_PARAMS
 	 */
-	sp->hbinterval  = jiffies_to_msecs(sctp_hb_interval);
+	sp->hbinterval  = sctp_hb_interval;
 	sp->pathmaxrxt  = sctp_max_retrans_path;
 	sp->pathmtu     = 0; // allow default discovery
-	sp->sackdelay   = jiffies_to_msecs(sctp_sack_timeout);
+	sp->sackdelay   = sctp_sack_timeout;
 	sp->param_flags = SPP_HB_ENABLE |
 	                  SPP_PMTUD_ENABLE |
 	                  SPP_SACKDELAY_ENABLE;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index dc6f3ff3235..633cd178654 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -45,9 +45,10 @@
 #include <net/sctp/sctp.h>
 #include <linux/sysctl.h>
 
-static ctl_handler sctp_sysctl_jiffies_ms;
-static long rto_timer_min = 1;
-static long rto_timer_max = 86400000; /* One day */
+static int zero = 0;
+static int one = 1;
+static int timer_max = 86400000; /* ms in one day */
+static int int_max = INT_MAX;
 static long sack_timer_min = 1;
 static long sack_timer_max = 500;
 
@@ -56,45 +57,45 @@ static ctl_table sctp_table[] = {
 		.ctl_name	= NET_SCTP_RTO_INITIAL,
 		.procname	= "rto_initial",
 		.data		= &sctp_rto_initial,
-		.maxlen		= sizeof(long),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
-		.extra1         = &rto_timer_min,
-		.extra2         = &rto_timer_max
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &one,
+		.extra2         = &timer_max
 	},
 	{
 		.ctl_name	= NET_SCTP_RTO_MIN,
 		.procname	= "rto_min",
 		.data		= &sctp_rto_min,
-		.maxlen		= sizeof(long),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
-		.extra1         = &rto_timer_min,
-		.extra2         = &rto_timer_max
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &one,
+		.extra2         = &timer_max
 	},
 	{
 		.ctl_name	= NET_SCTP_RTO_MAX,
 		.procname	= "rto_max",
 		.data		= &sctp_rto_max,
-		.maxlen		= sizeof(long),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
-		.extra1         = &rto_timer_min,
-		.extra2         = &rto_timer_max
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &one,
+		.extra2         = &timer_max
 	},
 	{
 		.ctl_name	= NET_SCTP_VALID_COOKIE_LIFE,
 		.procname	= "valid_cookie_life",
 		.data		= &sctp_valid_cookie_life,
-		.maxlen		= sizeof(long),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
-		.extra1         = &rto_timer_min,
-		.extra2         = &rto_timer_max
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &one,
+		.extra2         = &timer_max
 	},
 	{
 		.ctl_name	= NET_SCTP_MAX_BURST,
@@ -102,7 +103,10 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_max_burst,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &int_max
 	},
 	{
 		.ctl_name	= NET_SCTP_ASSOCIATION_MAX_RETRANS,
@@ -110,7 +114,10 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_max_retrans_association,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &int_max
 	},
 	{
 		.ctl_name	= NET_SCTP_SNDBUF_POLICY,
@@ -118,7 +125,8 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_sndbuf_policy,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_RCVBUF_POLICY,
@@ -126,7 +134,8 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_rcvbuf_policy,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_PATH_MAX_RETRANS,
@@ -134,7 +143,10 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_max_retrans_path,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &int_max
 	},
 	{
 		.ctl_name	= NET_SCTP_MAX_INIT_RETRANSMITS,
@@ -142,18 +154,21 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_max_retrans_init,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &int_max
 	},
 	{
 		.ctl_name	= NET_SCTP_HB_INTERVAL,
 		.procname	= "hb_interval",
 		.data		= &sctp_hb_interval,
-		.maxlen		= sizeof(long),
+		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
-		.extra1         = &rto_timer_min,
-		.extra2         = &rto_timer_max
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &one,
+		.extra2         = &timer_max
 	},
 	{
 		.ctl_name	= NET_SCTP_PRESERVE_ENABLE,
@@ -161,23 +176,26 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_cookie_preserve_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_RTO_ALPHA,
 		.procname	= "rto_alpha_exp_divisor",
 		.data		= &sctp_rto_alpha,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_RTO_BETA,
 		.procname	= "rto_beta_exp_divisor",
 		.data		= &sctp_rto_beta,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_ADDIP_ENABLE,
@@ -185,7 +203,8 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_addip_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_PRSCTP_ENABLE,
@@ -193,7 +212,8 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_prsctp_enable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec
 	},
 	{
 		.ctl_name	= NET_SCTP_SACK_TIMEOUT,
@@ -201,8 +221,8 @@ static ctl_table sctp_table[] = {
 		.data		= &sctp_sack_timeout,
 		.maxlen		= sizeof(long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
-		.strategy	= &sctp_sysctl_jiffies_ms,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
 		.extra1         = &sack_timer_min,
 		.extra2         = &sack_timer_max,
 	},
@@ -242,37 +262,3 @@ void sctp_sysctl_unregister(void)
 {
 	unregister_sysctl_table(sctp_sysctl_header);
 }
-
-/* Strategy function to convert jiffies to milliseconds.  */
-static int sctp_sysctl_jiffies_ms(ctl_table *table, int __user *name, int nlen,
-		void __user *oldval, size_t __user *oldlenp,
-		void __user *newval, size_t newlen, void **context) {
-
-	if (oldval) {
-		size_t olen;
-
-		if (oldlenp) {
-			if (get_user(olen, oldlenp))
-				return -EFAULT;
-
-			if (olen != sizeof (int))
-				return -EINVAL;
-		}
-		if (put_user((*(int *)(table->data) * 1000) / HZ,
-			(int __user *)oldval) ||
-		    (oldlenp && put_user(sizeof (int), oldlenp)))
-			return -EFAULT;
-	}
-	if (newval && newlen) {
-		int new;
-
-		if (newlen != sizeof (int))
-			return -EINVAL;
-
-		if (get_user(new, (int __user *)newval))
-			return -EFAULT;
-
-		*(int *)(table->data) = (new * HZ) / 1000;
-	}
-	return 1;
-}
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 2763aa93de1..3e5936a5f67 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -75,7 +75,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 	 * parameter 'RTO.Initial'.
 	 */
 	peer->rtt = 0;
-	peer->rto = sctp_rto_initial;
+	peer->rto = msecs_to_jiffies(sctp_rto_initial);
 	peer->rttvar = 0;
 	peer->srtt = 0;
 	peer->rto_pending = 0;
-- 
cgit v1.2.3-70-g09d2


From 5e032e32ecc2e6cb0385dc115ca9bfe5e19a9539 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:12:24 -0700
Subject: [IPV6] NDISC: Take source address into account for redirects.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Ville Nuorvala <vnuorval@tcs.hut.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 1 +
 net/ipv6/ndisc.c        | 3 ++-
 net/ipv6/route.c        | 5 +++--
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 3f170f667c7..249ce4545ef 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -110,6 +110,7 @@ extern int			rt6_route_rcv(struct net_device *dev,
 					      struct in6_addr *gwaddr);
 
 extern void			rt6_redirect(struct in6_addr *dest,
+					     struct in6_addr *src,
 					     struct in6_addr *saddr,
 					     struct neighbour *neigh,
 					     u8 *lladdr,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 419d6516381..32f28dec399 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1346,7 +1346,8 @@ static void ndisc_redirect_rcv(struct sk_buff *skb)
 
 	neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
 	if (neigh) {
-		rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, lladdr, 
+		rt6_redirect(dest, &skb->nh.ipv6h->daddr,
+			     &skb->nh.ipv6h->saddr, neigh, lladdr,
 			     on_link);
 		neigh_release(neigh);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5d6e9083ca2..a9b08a2422e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1279,7 +1279,8 @@ static int ip6_route_del(struct fib6_config *cfg)
 /*
  *	Handle redirects
  */
-void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
+void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
+		  struct in6_addr *saddr,
 		  struct neighbour *neigh, u8 *lladdr, int on_link)
 {
 	struct rt6_info *rt, *nrt = NULL;
@@ -1304,7 +1305,7 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
 	 */
 
 	read_lock_bh(&table->tb6_lock);
-	fn = fib6_lookup(&table->tb6_root, dest, NULL);
+	fn = fib6_lookup(&table->tb6_root, dest, src);
 restart:
 	for (rt = fn->leaf; rt; rt = rt->u.next) {
 		/*
-- 
cgit v1.2.3-70-g09d2


From 8e1ef0a95b87e8b4292b2ba733e8cb854ea2d2fe Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 29 Aug 2006 17:15:09 -0700
Subject: [IPV6]: Cache source address as well in ipv6_pinfo{}.

Based on MIPL2 kernel patch.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Ville Nuorvala <vnuorval@tcs.hut.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             | 3 +++
 include/net/ip6_route.h          | 9 ++++++---
 net/dccp/ipv6.c                  | 4 ++--
 net/ipv6/af_inet6.c              | 2 +-
 net/ipv6/datagram.c              | 7 ++++++-
 net/ipv6/inet6_connection_sock.c | 2 +-
 net/ipv6/ip6_output.c            | 3 +++
 net/ipv6/tcp_ipv6.c              | 4 ++--
 net/ipv6/udp.c                   | 7 ++++++-
 9 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 297853c841b..02d14a3ff2a 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -242,6 +242,9 @@ struct ipv6_pinfo {
 	struct in6_addr 	rcv_saddr;
 	struct in6_addr		daddr;
 	struct in6_addr		*daddr_cache;
+#ifdef CONFIG_IPV6_SUBTREES
+	struct in6_addr		*saddr_cache;
+#endif
 
 	__u32			flow_label;
 	__u32			frag_size;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 249ce4545ef..0d40f84df21 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -144,21 +144,24 @@ extern rwlock_t rt6_lock;
  *	Store a destination cache entry in a socket
  */
 static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				   struct in6_addr *daddr)
+				   struct in6_addr *daddr, struct in6_addr *saddr)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct rt6_info *rt = (struct rt6_info *) dst;
 
 	sk_setup_caps(sk, dst);
 	np->daddr_cache = daddr;
+#ifdef CONFIG_IPV6_SUBTREES
+	np->saddr_cache = saddr;
+#endif
 	np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
 }
 
 static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-				 struct in6_addr *daddr)
+				 struct in6_addr *daddr, struct in6_addr *saddr)
 {
 	write_lock(&sk->sk_dst_lock);
-	__ip6_dst_store(sk, dst, daddr);
+	__ip6_dst_store(sk, dst, daddr, saddr);
 	write_unlock(&sk->sk_dst_lock);
 }
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 231bc7c7e74..f9c5e12d703 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -231,7 +231,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	ipv6_addr_copy(&np->saddr, saddr);
 	inet->rcv_saddr = LOOPBACK4_IPV6;
 
-	__ip6_dst_store(sk, dst, NULL);
+	__ip6_dst_store(sk, dst, NULL, NULL);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt != NULL)
@@ -872,7 +872,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	 * comment in that function for the gory details. -acme
 	 */
 
-	__ip6_dst_store(newsk, dst, NULL);
+	__ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
 	newdp6 = (struct dccp6_sock *)newsk;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2ff600cfe3a..57ee5ddea96 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -659,7 +659,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 			return err;
 		}
 
-		__ip6_dst_store(sk, dst, NULL);
+		__ip6_dst_store(sk, dst, NULL, NULL);
 	}
 
 	return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index c73508e090a..8561b9da6db 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -193,7 +193,12 @@ ipv4_connected:
 
 	ip6_dst_store(sk, dst,
 		      ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ?
-		      &np->daddr : NULL);
+		      &np->daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+		      ipv6_addr_equal(&fl.fl6_src, &np->saddr) ?
+		      &np->saddr :
+#endif
+		      NULL);
 
 	sk->sk_state = TCP_ESTABLISHED;
 out:
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 7a51a258615..827f41d1478 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -186,7 +186,7 @@ int inet6_csk_xmit(struct sk_buff *skb, int ipfragok)
 			return err;
 		}
 
-		__ip6_dst_store(sk, dst, NULL);
+		__ip6_dst_store(sk, dst, NULL, NULL);
 	}
 
 	skb->dst = dst_clone(dst);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0a18cb6b1cb..2a376b7d91b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -762,6 +762,9 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 	 * 2. oif also should be the same.
 	 */
 	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
+#ifdef CONFIG_IPV6_SUBTREES
+	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
+#endif
 	    (fl->oif && fl->oif != dst->dev->ifindex)) {
 		dst_release(dst);
 		dst = NULL;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7f1b660493b..2b18918f301 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -272,7 +272,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	inet->rcv_saddr = LOOPBACK4_IPV6;
 
 	sk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(sk, dst, NULL);
+	__ip6_dst_store(sk, dst, NULL, NULL);
 
 	icsk->icsk_ext_hdr_len = 0;
 	if (np->opt)
@@ -954,7 +954,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	 */
 
 	newsk->sk_gso_type = SKB_GSO_TCPV6;
-	__ip6_dst_store(newsk, dst, NULL);
+	__ip6_dst_store(newsk, dst, NULL, NULL);
 
 	newtcp6sk = (struct tcp6_sock *)newsk;
 	inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eb9e1b39c8f..b9cc55ccb00 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -847,7 +847,12 @@ do_append_data:
 		if (connected) {
 			ip6_dst_store(sk, dst,
 				      ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ?
-				      &np->daddr : NULL);
+				      &np->daddr : NULL,
+#ifdef CONFIG_IPV6_SUBTREES
+				      ipv6_addr_equal(&fl->fl6_src, &np->saddr) ?
+				      &np->saddr :
+#endif
+				      NULL);
 		} else {
 			dst_release(dst);
 		}
-- 
cgit v1.2.3-70-g09d2


From 7fc33165a74301b2c5c90b2f2a1f6907cbd5c6f1 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:22:24 -0700
Subject: [IPV6] ROUTE: Put SUBTREE() as FIB6_SUBTREE() into ip6_fib.h for
 future use.

Based on MIPL2 kernel patch.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: Ville Nuorvala <vnuorval@tcs.hut.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h |  5 +++++
 net/ipv6/ip6_fib.c    | 20 +++++++++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 9610b887ffb..6a3f26a0450 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -60,6 +60,11 @@ struct fib6_node
 	__u32			fn_sernum;
 };
 
+#ifndef CONFIG_IPV6_SUBTREES
+#define FIB6_SUBTREE(fn)	NULL
+#else
+#define FIB6_SUBTREE(fn)	((fn)->subtree)
+#endif
 
 /*
  *	routing information
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b706424e70b..6536e33d835 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -73,10 +73,8 @@ static DEFINE_RWLOCK(fib6_walker_lock);
 
 #ifdef CONFIG_IPV6_SUBTREES
 #define FWS_INIT FWS_S
-#define SUBTREE(fn) ((fn)->subtree)
 #else
 #define FWS_INIT FWS_L
-#define SUBTREE(fn) NULL
 #endif
 
 static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
@@ -854,7 +852,7 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root,
 	}
 
 	while(fn) {
-		if (SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+		if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
 			struct rt6key *key;
 
 			key = (struct rt6key *) ((u8 *) fn->leaf +
@@ -985,7 +983,7 @@ static struct rt6_info * fib6_find_prefix(struct fib6_node *fn)
 		if(fn->right)
 			return fn->right->leaf;
 
-		fn = SUBTREE(fn);
+		fn = FIB6_SUBTREE(fn);
 	}
 	return NULL;
 }
@@ -1016,7 +1014,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 		if (fn->right) child = fn->right, children |= 1;
 		if (fn->left) child = fn->left, children |= 2;
 
-		if (children == 3 || SUBTREE(fn) 
+		if (children == 3 || FIB6_SUBTREE(fn)
 #ifdef CONFIG_IPV6_SUBTREES
 		    /* Subtree root (i.e. fn) may have one child */
 		    || (children && fn->fn_flags&RTN_ROOT)
@@ -1035,9 +1033,9 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 
 		pn = fn->parent;
 #ifdef CONFIG_IPV6_SUBTREES
-		if (SUBTREE(pn) == fn) {
+		if (FIB6_SUBTREE(pn) == fn) {
 			BUG_TRAP(fn->fn_flags&RTN_ROOT);
-			SUBTREE(pn) = NULL;
+			FIB6_SUBTREE(pn) = NULL;
 			nstate = FWS_L;
 		} else {
 			BUG_TRAP(!(fn->fn_flags&RTN_ROOT));
@@ -1085,7 +1083,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 		read_unlock(&fib6_walker_lock);
 
 		node_free(fn);
-		if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn))
+		if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn))
 			return pn;
 
 		rt6_release(pn->leaf);
@@ -1228,8 +1226,8 @@ static int fib6_walk_continue(struct fib6_walker_t *w)
 		switch (w->state) {
 #ifdef CONFIG_IPV6_SUBTREES
 		case FWS_S:
-			if (SUBTREE(fn)) {
-				w->node = SUBTREE(fn);
+			if (FIB6_SUBTREE(fn)) {
+				w->node = FIB6_SUBTREE(fn);
 				continue;
 			}
 			w->state = FWS_L;
@@ -1263,7 +1261,7 @@ static int fib6_walk_continue(struct fib6_walker_t *w)
 			pn = fn->parent;
 			w->node = pn;
 #ifdef CONFIG_IPV6_SUBTREES
-			if (SUBTREE(pn) == fn) {
+			if (FIB6_SUBTREE(pn) == fn) {
 				BUG_TRAP(fn->fn_flags&RTN_ROOT);
 				w->state = FWS_L;
 				continue;
-- 
cgit v1.2.3-70-g09d2


From 77d16f450ae0452d7d4b009f78debb1294fb435c Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:25:05 -0700
Subject: [IPV6] ROUTE: Unify RT6_F_xxx and RT6_SELECT_F_xxx flags

Unify RT6_F_xxx and RT6_SELECT_F_xxx flags into
RT6_LOOKUP_F_xxx flags, and put them into ip6_route.h

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Acked-by: Ville Nuorvala <vnuorval@tcs.hut.fi
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   |  3 ---
 include/net/ip6_route.h |  4 ++++
 net/ipv6/fib6_rules.c   |  2 +-
 net/ipv6/route.c        | 32 ++++++++++++--------------------
 4 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6a3f26a0450..e4438de3bd6 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -173,9 +173,6 @@ struct fib6_table {
 #define RT6_TABLE_LOCAL		RT6_TABLE_MAIN
 #endif
 
-#define RT6_F_STRICT		1
-#define RT6_F_HAS_SADDR		2
-
 typedef struct rt6_info *(*pol_lookup_t)(struct fib6_table *,
 					 struct flowi *, int);
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 0d40f84df21..29790957004 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -32,6 +32,10 @@ struct route_info {
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 
+#define RT6_LOOKUP_F_IFACE	0x1
+#define RT6_LOOKUP_F_REACHABLE	0x2
+#define RT6_LOOKUP_F_HAS_SADDR	0x4
+
 struct pol_chain {
 	int			type;
 	int			priority;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2c4fbc855e6..7b4908cc52b 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -117,7 +117,7 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 	if (!ipv6_prefix_equal(&fl->fl6_dst, &r->dst.addr, r->dst.plen))
 		return 0;
 
-	if ((flags & RT6_F_HAS_SADDR) &&
+	if ((flags & RT6_LOOKUP_F_HAS_SADDR) &&
 	    !ipv6_prefix_equal(&fl->fl6_src, &r->src.addr, r->src.plen))
 		return 0;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fd6f2ec4fa0..20691285aee 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -76,9 +76,6 @@
 
 #define CLONE_OFFLINK_ROUTE 0
 
-#define RT6_SELECT_F_IFACE	0x1
-#define RT6_SELECT_F_REACHABLE	0x2
-
 static int ip6_rt_max_size = 4096;
 static int ip6_rt_gc_min_interval = HZ / 2;
 static int ip6_rt_gc_timeout = 60*HZ;
@@ -340,7 +337,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 	int m, n;
 		
 	m = rt6_check_dev(rt, oif);
-	if (!m && (strict & RT6_SELECT_F_IFACE))
+	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 		return -1;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
@@ -348,7 +345,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 	n = rt6_check_neigh(rt);
 	if (n > 1)
 		m |= 16;
-	else if (!n && strict & RT6_SELECT_F_REACHABLE)
+	else if (!n && strict & RT6_LOOKUP_F_REACHABLE)
 		return -1;
 	return m;
 }
@@ -388,7 +385,7 @@ static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
 	}
 
 	if (!match &&
-	    (strict & RT6_SELECT_F_REACHABLE) &&
+	    (strict & RT6_LOOKUP_F_REACHABLE) &&
 	    last && last != rt0) {
 		/* no entries matched; do round-robin */
 		static DEFINE_SPINLOCK(lock);
@@ -511,7 +508,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 restart:
 	rt = fn->leaf;
-	rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
+	rt = rt6_device_match(rt, fl->oif, flags);
 	BACKTRACK(&fl->fl6_src);
 	dst_hold(&rt->u.dst);
 out:
@@ -537,7 +534,7 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
 		},
 	};
 	struct dst_entry *dst;
-	int flags = strict ? RT6_F_STRICT : 0;
+	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 
 	dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
 	if (dst->error == 0)
@@ -633,10 +630,9 @@ static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = RT6_SELECT_F_REACHABLE;
+	int reachable = RT6_LOOKUP_F_REACHABLE;
 
-	if (flags & RT6_F_STRICT)
-		strict = RT6_SELECT_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IFACE;
 
 relookup:
 	read_lock_bh(&table->tb6_lock);
@@ -712,10 +708,7 @@ void ip6_route_input(struct sk_buff *skb)
 		},
 		.proto = iph->nexthdr,
 	};
-	int flags = 0;
-
-	if (rt6_need_strict(&iph->daddr))
-		flags |= RT6_F_STRICT;
+	int flags = rt6_need_strict(&iph->daddr) ? RT6_LOOKUP_F_IFACE : 0;
 
 	skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
 }
@@ -728,10 +721,9 @@ static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = RT6_SELECT_F_REACHABLE;
+	int reachable = RT6_LOOKUP_F_REACHABLE;
 
-	if (flags & RT6_F_STRICT)
-		strict = RT6_SELECT_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IFACE;
 
 relookup:
 	read_lock_bh(&table->tb6_lock);
@@ -797,7 +789,7 @@ struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
 	int flags = 0;
 
 	if (rt6_need_strict(&fl->fl6_dst))
-		flags |= RT6_F_STRICT;
+		flags |= RT6_LOOKUP_F_IFACE;
 
 	return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
 }
@@ -1362,7 +1354,7 @@ static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
 		},
 		.gateway = *gateway,
 	};
-	int flags = rt6_need_strict(dest) ? RT6_F_STRICT : 0;
+	int flags = rt6_need_strict(dest) ? RT6_LOOKUP_F_IFACE : 0;
 
 	return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
 }
-- 
cgit v1.2.3-70-g09d2


From 7e49e6de30efa716614e280d97963c570f3acf29 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Fri, 22 Sep 2006 15:05:15 -0700
Subject: [XFRM]: Add XFRM_MODE_xxx for future use.

Transformation mode is used as either IPsec transport or tunnel.
It is required to add two more items, route optimization and inbound trigger
for Mobile IPv6.
Based on MIPL2 kernel patch.

This patch was also written by: Ville Nuorvala <vnuorval@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h    |  6 ++++--
 include/net/xfrm.h      |  2 +-
 net/ipv4/ah4.c          |  2 +-
 net/ipv4/esp4.c         |  6 +++---
 net/ipv4/ipcomp.c       |  8 ++++----
 net/ipv4/xfrm4_input.c  |  2 +-
 net/ipv4/xfrm4_output.c |  4 ++--
 net/ipv4/xfrm4_policy.c |  2 +-
 net/ipv4/xfrm4_state.c  |  2 +-
 net/ipv4/xfrm4_tunnel.c |  2 +-
 net/ipv6/ah6.c          |  2 +-
 net/ipv6/esp6.c         |  4 ++--
 net/ipv6/ipcomp6.c      |  6 +++---
 net/ipv6/xfrm6_input.c  |  2 +-
 net/ipv6/xfrm6_output.c |  4 ++--
 net/ipv6/xfrm6_policy.c |  2 +-
 net/ipv6/xfrm6_state.c  |  2 +-
 net/ipv6/xfrm6_tunnel.c |  2 +-
 net/key/af_key.c        |  6 +++---
 net/xfrm/xfrm_policy.c  | 11 ++++++-----
 net/xfrm/xfrm_user.c    |  4 ++--
 21 files changed, 42 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 46a15c7a1a1..5154064b6d9 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -120,7 +120,9 @@ enum
 
 #define XFRM_MODE_TRANSPORT 0
 #define XFRM_MODE_TUNNEL 1
-#define XFRM_MODE_MAX 2
+#define XFRM_MODE_ROUTEOPTIMIZATION 2
+#define XFRM_MODE_IN_TRIGGER 3
+#define XFRM_MODE_MAX 4
 
 /* Netlink configuration messages.  */
 enum {
@@ -247,7 +249,7 @@ struct xfrm_usersa_info {
 	__u32				seq;
 	__u32				reqid;
 	__u16				family;
-	__u8				mode; /* 0=transport,1=tunnel */
+	__u8				mode;		/* XFRM_MODE_xxx */
 	__u8				replay_window;
 	__u8				flags;
 #define XFRM_STATE_NOECN	1
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 00bf86e6e82..762795624b1 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -298,7 +298,7 @@ struct xfrm_tmpl
 
 	__u32			reqid;
 
-/* Mode: transport/tunnel */
+/* Mode: transport, tunnel etc. */
 	__u8			mode;
 
 /* Sharing mode: unique, this session only, this user only etc. */
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 008e69d2e42..99542977e47 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -265,7 +265,7 @@ static int ah_init_state(struct xfrm_state *x)
 		goto error;
 	
 	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	x->data = ahp;
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b428489f6cc..e87377e1d6b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -248,7 +248,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		 *    as per draft-ietf-ipsec-udp-encaps-06,
 		 *    section 3.1.2
 		 */
-		if (!x->props.mode)
+		if (x->props.mode == XFRM_MODE_TRANSPORT)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
@@ -267,7 +267,7 @@ static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		mtu = ALIGN(mtu + 2, blksize);
 	} else {
 		/* The worst case. */
@@ -383,7 +383,7 @@ static int esp_init_state(struct xfrm_state *x)
 	if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
 		goto error;
 	x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 	if (x->encap) {
 		struct xfrm_encap_tmpl *encap = x->encap;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 5bb9c9f03fb..17342430a84 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -176,7 +176,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 
 out_ok:
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		ip_send_check(iph);
 	return 0;
 }
@@ -216,7 +216,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 	t->id.daddr.a4 = x->id.daddr.a4;
 	memcpy(&t->sel, &x->sel, sizeof(t->sel));
 	t->props.family = AF_INET;
-	t->props.mode = 1;
+	t->props.mode = XFRM_MODE_TUNNEL;
 	t->props.saddr.a4 = x->props.saddr.a4;
 	t->props.flags = x->props.flags;
 
@@ -416,7 +416,7 @@ static int ipcomp_init_state(struct xfrm_state *x)
 		goto out;
 
 	x->props.header_len = 0;
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct iphdr);
 
 	mutex_lock(&ipcomp_resource_mutex);
@@ -428,7 +428,7 @@ static int ipcomp_init_state(struct xfrm_state *x)
 		goto error;
 	mutex_unlock(&ipcomp_resource_mutex);
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = ipcomp_tunnel_attach(x);
 		if (err)
 			goto error_tunnel;
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 817ed84511a..040e8475f29 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -106,7 +106,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 		if (x->mode->input(x, skb))
 			goto drop;
 
-		if (x->props.mode) {
+		if (x->props.mode == XFRM_MODE_TUNNEL) {
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 4a96a9e3ef3..5fd115f0c54 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -54,7 +54,7 @@ static int xfrm4_output_one(struct sk_buff *skb)
 			goto error_nolock;
 	}
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = xfrm4_tunnel_check_size(skb);
 		if (err)
 			goto error_nolock;
@@ -85,7 +85,7 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		}
 		dst = skb->dst;
 		x = dst->xfrm;
-	} while (x && !x->props.mode);
+	} while (x && (x->props.mode != XFRM_MODE_TUNNEL));
 
 	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
 	err = 0;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 8f50eae47d0..a5bed741de2 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -96,7 +96,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 		dst1->next = dst_prev;
 		dst_prev = dst1;
-		if (xfrm[i]->props.mode) {
+		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
 			remote = xfrm[i]->id.daddr.a4;
 			local  = xfrm[i]->props.saddr.a4;
 			tunnel = 1;
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 81e1751c966..97b0c758971 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -42,7 +42,7 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	x->props.saddr = tmpl->saddr;
 	if (x->props.saddr.a4 == 0)
 		x->props.saddr.a4 = saddr->a4;
-	if (tmpl->mode && x->props.saddr.a4 == 0) {
+	if (tmpl->mode == XFRM_MODE_TUNNEL && x->props.saddr.a4 == 0) {
 		struct rtable *rt;
 	        struct flowi fl_tunnel = {
         	        .nl_u = {
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index f8ceaa127c8..f110af5b131 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -28,7 +28,7 @@ static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
 
 static int ipip_init_state(struct xfrm_state *x)
 {
-	if (!x->props.mode)
+	if (x->props.mode != XFRM_MODE_TUNNEL)
 		return -EINVAL;
 
 	if (x->encap)
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 00ffa7bc6c9..60954fc7eb3 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -398,7 +398,7 @@ static int ah6_init_state(struct xfrm_state *x)
 		goto error;
 	
 	x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len);
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct ipv6hdr);
 	x->data = ahp;
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 2ebfd281e72..2b8e52e1d0a 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -237,7 +237,7 @@ static u32 esp6_get_max_size(struct xfrm_state *x, int mtu)
 	struct esp_data *esp = x->data;
 	u32 blksize = ALIGN(crypto_blkcipher_blocksize(esp->conf.tfm), 4);
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		mtu = ALIGN(mtu + 2, blksize);
 	} else {
 		/* The worst case. */
@@ -358,7 +358,7 @@ static int esp6_init_state(struct xfrm_state *x)
 	if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
 		goto error;
 	x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen;
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct ipv6hdr);
 	x->data = esp;
 	return 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index a81e9e9d93b..19eba8d9f85 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -212,7 +212,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
 	memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr));
 	memcpy(&t->sel, &x->sel, sizeof(t->sel));
 	t->props.family = AF_INET6;
-	t->props.mode = 1;
+	t->props.mode = XFRM_MODE_TUNNEL;
 	memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
 
 	if (xfrm_init_state(t))
@@ -417,7 +417,7 @@ static int ipcomp6_init_state(struct xfrm_state *x)
 		goto out;
 
 	x->props.header_len = 0;
-	if (x->props.mode)
+	if (x->props.mode == XFRM_MODE_TUNNEL)
 		x->props.header_len += sizeof(struct ipv6hdr);
 	
 	mutex_lock(&ipcomp6_resource_mutex);
@@ -429,7 +429,7 @@ static int ipcomp6_init_state(struct xfrm_state *x)
 		goto error;
 	mutex_unlock(&ipcomp6_resource_mutex);
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = ipcomp6_tunnel_attach(x);
 		if (err)
 			goto error_tunnel;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 0405d74ff91..ee2f6b3908b 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -72,7 +72,7 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi)
 		if (x->mode->input(x, skb))
 			goto drop;
 
-		if (x->props.mode) { /* XXX */
+		if (x->props.mode == XFRM_MODE_TUNNEL) { /* XXX */
 			decaps = 1;
 			break;
 		}
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 6d111743e50..26f18869f77 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -47,7 +47,7 @@ static int xfrm6_output_one(struct sk_buff *skb)
 			goto error_nolock;
 	}
 
-	if (x->props.mode) {
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = xfrm6_tunnel_check_size(skb);
 		if (err)
 			goto error_nolock;
@@ -80,7 +80,7 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		}
 		dst = skb->dst;
 		x = dst->xfrm;
-	} while (x && !x->props.mode);
+	} while (x && (x->props.mode != XFRM_MODE_TUNNEL));
 
 	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
 	err = 0;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 73cd250aecb..81355bb5032 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -114,7 +114,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 		dst1->next = dst_prev;
 		dst_prev = dst1;
-		if (xfrm[i]->props.mode) {
+		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
 			remote = (struct in6_addr*)&xfrm[i]->id.daddr;
 			local  = (struct in6_addr*)&xfrm[i]->props.saddr;
 			tunnel = 1;
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index b33296b3f6d..a1a1f547644 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -42,7 +42,7 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
 	if (ipv6_addr_any((struct in6_addr*)&x->props.saddr))
 		memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
-	if (tmpl->mode && ipv6_addr_any((struct in6_addr*)&x->props.saddr)) {
+	if (tmpl->mode == XFRM_MODE_TUNNEL && ipv6_addr_any((struct in6_addr*)&x->props.saddr)) {
 		struct rt6_info *rt;
 		struct flowi fl_tunnel = {
 			.nl_u = {
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index c8f9369c2a8..59685ee8f70 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -307,7 +307,7 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 static int xfrm6_tunnel_init_state(struct xfrm_state *x)
 {
-	if (!x->props.mode)
+	if (x->props.mode != XFRM_MODE_TUNNEL)
 		return -EINVAL;
 
 	if (x->encap)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 797c744a843..19e047b0e67 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1765,7 +1765,7 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
 	}
 
 	/* addresses present only in tunnel mode */
-	if (t->mode) {
+	if (t->mode == XFRM_MODE_TUNNEL) {
 		switch (xp->family) {
 		case AF_INET:
 			sin = (void*)(rq+1);
@@ -1997,7 +1997,7 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 		int req_size;
 
 		req_size = sizeof(struct sadb_x_ipsecrequest);
-		if (t->mode)
+		if (t->mode == XFRM_MODE_TUNNEL)
 			req_size += 2*socklen;
 		else
 			size -= 2*socklen;
@@ -2013,7 +2013,7 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 		if (t->optional)
 			rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE;
 		rq->sadb_x_ipsecrequest_reqid = t->reqid;
-		if (t->mode) {
+		if (t->mode == XFRM_MODE_TUNNEL) {
 			switch (xp->family) {
 			case AF_INET:
 				sin = (void*)(rq+1);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 32c963c9057..a0d58971391 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -779,7 +779,7 @@ xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
 		xfrm_address_t *local  = saddr;
 		struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
 
-		if (tmpl->mode) {
+		if (tmpl->mode == XFRM_MODE_TUNNEL) {
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
 		}
@@ -1005,7 +1005,8 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
 		(x->props.reqid == tmpl->reqid || !tmpl->reqid) &&
 		x->props.mode == tmpl->mode &&
 		(tmpl->aalgos & (1<<x->props.aalgo)) &&
-		!(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family));
+		!(x->props.mode != XFRM_MODE_TRANSPORT &&
+		  xfrm_state_addr_cmp(tmpl, x, family));
 }
 
 static inline int
@@ -1015,14 +1016,14 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
 	int idx = start;
 
 	if (tmpl->optional) {
-		if (!tmpl->mode)
+		if (tmpl->mode == XFRM_MODE_TRANSPORT)
 			return start;
 	} else
 		start = -1;
 	for (; idx < sp->len; idx++) {
 		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
 			return ++idx;
-		if (sp->xvec[idx]->props.mode)
+		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT)
 			break;
 	}
 	return start;
@@ -1047,7 +1048,7 @@ EXPORT_SYMBOL(xfrm_decode_session);
 static inline int secpath_has_tunnel(struct sec_path *sp, int k)
 {
 	for (; k < sp->len; k++) {
-		if (sp->xvec[k]->props.mode)
+		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT)
 			return 1;
 	}
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index f70e158874d..0d580ac1977 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -174,8 +174,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 
 	err = -EINVAL;
 	switch (p->mode) {
-	case 0:
-	case 1:
+	case XFRM_MODE_TRANSPORT:
+	case XFRM_MODE_TUNNEL:
 		break;
 
 	default:
-- 
cgit v1.2.3-70-g09d2


From 5794708f11551b6d19b10673abf4b0202f66b44d Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Fri, 22 Sep 2006 15:06:24 -0700
Subject: [XFRM]: Introduce a helper to compare id protocol.

Put the helper to header for future use.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h    | 6 ++++++
 net/xfrm/xfrm_state.c | 6 +++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 762795624b1..5b364b0a6a2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -9,6 +9,7 @@
 #include <linux/skbuff.h>
 #include <linux/socket.h>
 #include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
 #include <linux/in6.h>
 #include <linux/mutex.h>
 
@@ -835,6 +836,11 @@ static inline int xfrm_state_kern(struct xfrm_state *x)
 	return atomic_read(&x->tunnel_users);
 }
 
+static inline int xfrm_id_proto_match(u8 proto, u8 userproto)
+{
+	return (userproto == IPSEC_PROTO_ANY || proto == userproto);
+}
+
 /*
  * xfrm algorithm information
  */
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1c796087ee7..34c038cbdf4 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -294,7 +294,7 @@ void xfrm_state_flush(u8 proto)
 restart:
 		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
 			if (!xfrm_state_kern(x) &&
-			    (proto == IPSEC_PROTO_ANY || x->id.proto == proto)) {
+			    xfrm_id_proto_match(x->id.proto, proto)) {
 				xfrm_state_hold(x);
 				spin_unlock_bh(&xfrm_state_lock);
 
@@ -772,7 +772,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 	spin_lock_bh(&xfrm_state_lock);
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
 		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
-			if (proto == IPSEC_PROTO_ANY || x->id.proto == proto)
+			if (xfrm_id_proto_match(x->id.proto, proto))
 				count++;
 		}
 	}
@@ -783,7 +783,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
 		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
-			if (proto != IPSEC_PROTO_ANY && x->id.proto != proto)
+			if (!xfrm_id_proto_match(x->id.proto, proto))
 				continue;
 			err = func(x, --count, data);
 			if (err)
-- 
cgit v1.2.3-70-g09d2


From dc00a525603650a1471c823a1e48c6505c2f9765 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:49:52 -0700
Subject: [XFRM] STATE: Allow non IPsec protocol.

It will be added two more transformation protocols (routing header
and destination options header) for Mobile IPv6.
xfrm_id_proto_match() can be handle zero as all, IPSEC_PROTO_ANY as
all IPsec and otherwise as exact one.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h   | 5 ++++-
 net/xfrm/xfrm_user.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 5b364b0a6a2..2a7d2132a1a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -838,7 +838,10 @@ static inline int xfrm_state_kern(struct xfrm_state *x)
 
 static inline int xfrm_id_proto_match(u8 proto, u8 userproto)
 {
-	return (userproto == IPSEC_PROTO_ANY || proto == userproto);
+	return (!userproto || proto == userproto ||
+		(userproto == IPSEC_PROTO_ANY && (proto == IPPROTO_AH ||
+						  proto == IPPROTO_ESP ||
+						  proto == IPPROTO_COMP)));
 }
 
 /*
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0d580ac1977..41f3d51ffc3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -542,7 +542,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb)
 	info.nlmsg_flags = NLM_F_MULTI;
 	info.this_idx = 0;
 	info.start_idx = cb->args[0];
-	(void) xfrm_state_walk(IPSEC_PROTO_ANY, dump_one_state, &info);
+	(void) xfrm_state_walk(0, dump_one_state, &info);
 	cb->args[0] = info.this_idx;
 
 	return skb->len;
-- 
cgit v1.2.3-70-g09d2


From 622dc8281a80374873686514e46f852093d91106 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:52:01 -0700
Subject: [XFRM]: Expand XFRM_MAX_DEPTH for route optimization.

XFRM_MAX_DEPTH is a limit of transformation states to be applied to the same
flow. Two more extension headers are used by Mobile IPv6 transformation.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 2a7d2132a1a..aa3be68041b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -314,7 +314,7 @@ struct xfrm_tmpl
 	__u32			calgos;
 };
 
-#define XFRM_MAX_DEPTH		4
+#define XFRM_MAX_DEPTH		6
 
 struct xfrm_policy
 {
-- 
cgit v1.2.3-70-g09d2


From 6c44e6b7ab500d7e3e3f406c83325671be51a752 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:53:57 -0700
Subject: [XFRM] STATE: Add source address list.

Support source address based searching.
Mobile IPv6 will use it.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     | 26 ++++++++++++++++++++++++++
 net/ipv4/xfrm4_state.c |  3 +++
 net/ipv6/xfrm6_state.c |  3 +++
 net/xfrm/xfrm_state.c  | 21 +++++++++++++++++++--
 4 files changed, 51 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index aa3be68041b..88145e3348d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -95,6 +95,7 @@ struct xfrm_state
 {
 	/* Note: bydst is re-used during gc */
 	struct list_head	bydst;
+	struct list_head	bysrc;
 	struct list_head	byspi;
 
 	atomic_t		refcnt;
@@ -236,6 +237,7 @@ extern int __xfrm_state_delete(struct xfrm_state *x);
 struct xfrm_state_afinfo {
 	unsigned short		family;
 	struct list_head	*state_bydst;
+	struct list_head	*state_bysrc;
 	struct list_head	*state_byspi;
 	int			(*init_flags)(struct xfrm_state *x);
 	void			(*init_tempsel)(struct xfrm_state *x, struct flowi *fl,
@@ -420,6 +422,30 @@ unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
 	return 0;
 }
 
+static __inline__
+unsigned __xfrm4_src_hash(xfrm_address_t *addr)
+{
+	return __xfrm4_dst_hash(addr);
+}
+
+static __inline__
+unsigned __xfrm6_src_hash(xfrm_address_t *addr)
+{
+	return __xfrm6_dst_hash(addr);
+}
+
+static __inline__
+unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return __xfrm4_src_hash(addr);
+	case AF_INET6:
+		return __xfrm6_src_hash(addr);
+	}
+	return 0;
+}
+
 static __inline__
 unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
 {
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 97b0c758971..c56b258fad7 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -122,6 +122,9 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 		add_timer(&x0->timer);
 		xfrm_state_hold(x0);
 		list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
+		h = __xfrm4_src_hash(saddr);
+		xfrm_state_hold(x0);
+		list_add_tail(&x0->bysrc, xfrm4_state_afinfo.state_bysrc+h);
 		wake_up(&km_waitq);
 	}
 	if (x0)
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index a1a1f547644..2fb07850449 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -126,6 +126,9 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto,
 		add_timer(&x0->timer);
 		xfrm_state_hold(x0);
 		list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h);
+		h = __xfrm6_src_hash(saddr);
+		xfrm_state_hold(x0);
+		list_add_tail(&x0->bysrc, xfrm6_state_afinfo.state_bysrc+h);
 		wake_up(&km_waitq);
 	}
 	if (x0)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 34c038cbdf4..2a9992894e6 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -45,6 +45,7 @@ static DEFINE_SPINLOCK(xfrm_state_lock);
  * Also, it can be used by ah/esp icmp error handler to find offending SA.
  */
 static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE];
+static struct list_head xfrm_state_bysrc[XFRM_DST_HSIZE];
 static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE];
 
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
@@ -200,6 +201,7 @@ struct xfrm_state *xfrm_state_alloc(void)
 		atomic_set(&x->refcnt, 1);
 		atomic_set(&x->tunnel_users, 0);
 		INIT_LIST_HEAD(&x->bydst);
+		INIT_LIST_HEAD(&x->bysrc);
 		INIT_LIST_HEAD(&x->byspi);
 		init_timer(&x->timer);
 		x->timer.function = xfrm_timer_handler;
@@ -240,6 +242,8 @@ int __xfrm_state_delete(struct xfrm_state *x)
 		spin_lock(&xfrm_state_lock);
 		list_del(&x->bydst);
 		__xfrm_state_put(x);
+		list_del(&x->bysrc);
+		__xfrm_state_put(x);
 		if (x->id.spi) {
 			list_del(&x->byspi);
 			__xfrm_state_put(x);
@@ -415,6 +419,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			x->km.state = XFRM_STATE_ACQ;
 			list_add_tail(&x->bydst, xfrm_state_bydst+h);
 			xfrm_state_hold(x);
+			list_add_tail(&x->bysrc, xfrm_state_bysrc+h);
+			xfrm_state_hold(x);
 			if (x->id.spi) {
 				h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family);
 				list_add(&x->byspi, xfrm_state_byspi+h);
@@ -448,11 +454,19 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 	list_add(&x->bydst, xfrm_state_bydst+h);
 	xfrm_state_hold(x);
 
-	h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family);
+	h = xfrm_src_hash(&x->props.saddr, x->props.family);
 
-	list_add(&x->byspi, xfrm_state_byspi+h);
+	list_add(&x->bysrc, xfrm_state_bysrc+h);
 	xfrm_state_hold(x);
 
+	if (xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY)) {
+		h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
+				  x->props.family);
+
+		list_add(&x->byspi, xfrm_state_byspi+h);
+		xfrm_state_hold(x);
+	}
+
 	if (!mod_timer(&x->timer, jiffies + HZ))
 		xfrm_state_hold(x);
 
@@ -1075,6 +1089,7 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
 		err = -ENOBUFS;
 	else {
 		afinfo->state_bydst = xfrm_state_bydst;
+		afinfo->state_bysrc = xfrm_state_bysrc;
 		afinfo->state_byspi = xfrm_state_byspi;
 		xfrm_state_afinfo[afinfo->family] = afinfo;
 	}
@@ -1097,6 +1112,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 		else {
 			xfrm_state_afinfo[afinfo->family] = NULL;
 			afinfo->state_byspi = NULL;
+			afinfo->state_bysrc = NULL;
 			afinfo->state_bydst = NULL;
 		}
 	}
@@ -1218,6 +1234,7 @@ void __init xfrm_state_init(void)
 
 	for (i=0; i<XFRM_DST_HSIZE; i++) {
 		INIT_LIST_HEAD(&xfrm_state_bydst[i]);
+		INIT_LIST_HEAD(&xfrm_state_bysrc[i]);
 		INIT_LIST_HEAD(&xfrm_state_byspi[i]);
 	}
 	INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
-- 
cgit v1.2.3-70-g09d2


From eb2971b68a7d17a7d0fa2c7fc6fbc4bfe41cd694 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:56:04 -0700
Subject: [XFRM] STATE: Search by address using source address list.

This is a support to search transformation states by its addresses
by using source address list for Mobile IPv6 usage.
To use it from user-space, it is also added a message type for
source address as a xfrm state option.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h   |  1 +
 include/net/xfrm.h     |  2 ++
 net/ipv4/xfrm4_state.c |  9 ++++++++
 net/ipv6/xfrm6_state.c | 21 ++++++++++++++++++
 net/xfrm/xfrm_state.c  | 37 +++++++++++++++++++++++++++----
 net/xfrm/xfrm_user.c   | 59 +++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 119 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 5154064b6d9..66343d3d4b9 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -234,6 +234,7 @@ enum xfrm_attr_type_t {
 	XFRMA_REPLAY_VAL,
 	XFRMA_REPLAY_THRESH,
 	XFRMA_ETIMER_THRESH,
+	XFRMA_SRCADDR,		/* xfrm_address_t */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 88145e3348d..d9c40e71318 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -244,6 +244,7 @@ struct xfrm_state_afinfo {
 						struct xfrm_tmpl *tmpl,
 						xfrm_address_t *daddr, xfrm_address_t *saddr);
 	struct xfrm_state	*(*state_lookup)(xfrm_address_t *daddr, u32 spi, u8 proto);
+	struct xfrm_state	*(*state_lookup_byaddr)(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto);
 	struct xfrm_state	*(*find_acq)(u8 mode, u32 reqid, u8 proto, 
 					     xfrm_address_t *daddr, xfrm_address_t *saddr, 
 					     int create);
@@ -937,6 +938,7 @@ extern void xfrm_state_insert(struct xfrm_state *x);
 extern int xfrm_state_add(struct xfrm_state *x);
 extern int xfrm_state_update(struct xfrm_state *x);
 extern struct xfrm_state *xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family);
+extern struct xfrm_state *xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family);
 extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
 extern int xfrm_state_delete(struct xfrm_state *x);
 extern void xfrm_state_flush(u8 proto);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index c56b258fad7..616be131b4e 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -80,6 +80,14 @@ __xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
 	return NULL;
 }
 
+/* placeholder until ipv4's code is written */
+static struct xfrm_state *
+__xfrm4_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
+			    u8 proto)
+{
+	return NULL;
+}
+
 static struct xfrm_state *
 __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, 
 		 xfrm_address_t *daddr, xfrm_address_t *saddr, 
@@ -137,6 +145,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
+	.state_lookup_byaddr	= __xfrm4_state_lookup_byaddr,
 	.find_acq		= __xfrm4_find_acq,
 };
 
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 2fb07850449..9c95b9d3e11 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -63,6 +63,26 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	x->props.family = AF_INET6;
 }
 
+static struct xfrm_state *
+__xfrm6_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
+			    u8 proto)
+{
+	struct xfrm_state *x = NULL;
+	unsigned h;
+
+	h = __xfrm6_src_hash(saddr);
+	list_for_each_entry(x, xfrm6_state_afinfo.state_bysrc+h, bysrc) {
+		if (x->props.family == AF_INET6 &&
+		    ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
+		    ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) &&
+		    proto == x->id.proto) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+	return NULL;
+}
+
 static struct xfrm_state *
 __xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
 {
@@ -140,6 +160,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
 	.init_tempsel		= __xfrm6_init_tempsel,
 	.state_lookup		= __xfrm6_state_lookup,
+	.state_lookup_byaddr	= __xfrm6_state_lookup_byaddr,
 	.find_acq		= __xfrm6_find_acq,
 };
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2a9992894e6..11f480b1295 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -487,6 +487,16 @@ void xfrm_state_insert(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_insert);
 
+static inline struct xfrm_state *
+__xfrm_state_locate(struct xfrm_state_afinfo *afinfo, struct xfrm_state *x,
+		    int use_spi)
+{
+	if (use_spi)
+		return afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
+	else
+		return afinfo->state_lookup_byaddr(&x->id.daddr, &x->props.saddr, x->id.proto);
+}
+
 static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq);
 
 int xfrm_state_add(struct xfrm_state *x)
@@ -495,6 +505,7 @@ int xfrm_state_add(struct xfrm_state *x)
 	struct xfrm_state *x1;
 	int family;
 	int err;
+	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
 
 	family = x->props.family;
 	afinfo = xfrm_state_get_afinfo(family);
@@ -503,7 +514,7 @@ int xfrm_state_add(struct xfrm_state *x)
 
 	spin_lock_bh(&xfrm_state_lock);
 
-	x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
+	x1 = __xfrm_state_locate(afinfo, x, use_spi);
 	if (x1) {
 		xfrm_state_put(x1);
 		x1 = NULL;
@@ -511,7 +522,7 @@ int xfrm_state_add(struct xfrm_state *x)
 		goto out;
 	}
 
-	if (x->km.seq) {
+	if (use_spi && x->km.seq) {
 		x1 = __xfrm_find_acq_byseq(x->km.seq);
 		if (x1 && xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family)) {
 			xfrm_state_put(x1);
@@ -519,7 +530,7 @@ int xfrm_state_add(struct xfrm_state *x)
 		}
 	}
 
-	if (!x1)
+	if (use_spi && !x1)
 		x1 = afinfo->find_acq(
 			x->props.mode, x->props.reqid, x->id.proto,
 			&x->id.daddr, &x->props.saddr, 0);
@@ -548,13 +559,14 @@ int xfrm_state_update(struct xfrm_state *x)
 	struct xfrm_state_afinfo *afinfo;
 	struct xfrm_state *x1;
 	int err;
+	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
 
 	afinfo = xfrm_state_get_afinfo(x->props.family);
 	if (unlikely(afinfo == NULL))
 		return -EAFNOSUPPORT;
 
 	spin_lock_bh(&xfrm_state_lock);
-	x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
+	x1 = __xfrm_state_locate(afinfo, x, use_spi);
 
 	err = -ESRCH;
 	if (!x1)
@@ -674,6 +686,23 @@ xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto,
 }
 EXPORT_SYMBOL(xfrm_state_lookup);
 
+struct xfrm_state *
+xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
+			 u8 proto, unsigned short family)
+{
+	struct xfrm_state *x;
+	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		return NULL;
+
+	spin_lock_bh(&xfrm_state_lock);
+	x = afinfo->state_lookup_byaddr(daddr, saddr, proto);
+	spin_unlock_bh(&xfrm_state_lock);
+	xfrm_state_put_afinfo(afinfo);
+	return x;
+}
+EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
+
 struct xfrm_state *
 xfrm_find_acq(u8 mode, u32 reqid, u8 proto, 
 	      xfrm_address_t *daddr, xfrm_address_t *saddr, 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 41f3d51ffc3..b5f8ab71aa5 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -87,6 +87,22 @@ static int verify_encap_tmpl(struct rtattr **xfrma)
 	return 0;
 }
 
+static int verify_one_addr(struct rtattr **xfrma, enum xfrm_attr_type_t type,
+			   xfrm_address_t **addrp)
+{
+	struct rtattr *rt = xfrma[type - 1];
+
+	if (!rt)
+		return 0;
+
+	if ((rt->rta_len - sizeof(*rt)) < sizeof(**addrp))
+		return -EINVAL;
+
+	if (addrp)
+		*addrp = RTA_DATA(rt);
+
+	return 0;
+}
 
 static inline int verify_sec_ctx_len(struct rtattr **xfrma)
 {
@@ -418,16 +434,48 @@ out:
 	return err;
 }
 
+static struct xfrm_state *xfrm_user_state_lookup(struct xfrm_usersa_id *p,
+						 struct rtattr **xfrma,
+						 int *errp)
+{
+	struct xfrm_state *x = NULL;
+	int err;
+
+	if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
+		err = -ESRCH;
+		x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
+	} else {
+		xfrm_address_t *saddr = NULL;
+
+		err = verify_one_addr(xfrma, XFRMA_SRCADDR, &saddr);
+		if (err)
+			goto out;
+
+		if (!saddr) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		x = xfrm_state_lookup_byaddr(&p->daddr, saddr, p->proto,
+					     p->family);
+	}
+
+ out:
+	if (!x && errp)
+		*errp = err;
+	return x;
+}
+
 static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
 	struct xfrm_state *x;
-	int err;
+	int err = -ESRCH;
 	struct km_event c;
 	struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
 
-	x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
+	x = xfrm_user_state_lookup(p, (struct rtattr **)xfrma, &err);
 	if (x == NULL)
-		return -ESRCH;
+		return err;
 
 	if ((err = security_xfrm_state_delete(x)) != 0)
 		goto out;
@@ -578,10 +626,9 @@ static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
 	struct xfrm_state *x;
 	struct sk_buff *resp_skb;
-	int err;
+	int err = -ESRCH;
 
-	x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
-	err = -ESRCH;
+	x = xfrm_user_state_lookup(p, (struct rtattr **)xfrma, &err);
 	if (x == NULL)
 		goto out_noput;
 
-- 
cgit v1.2.3-70-g09d2


From aee5adb4307c4c63a4dc5f3b49984d76f8a71b5b Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 17:57:28 -0700
Subject: [XFRM] STATE: Add a hook to find offset to be inserted header in
 outbound.

On current kernel, ip6_find_1stfragopt() is used by IPv6 IPsec to find
offset to be inserted header in outbound for transport mode. (BTW, no
usage may be needed for IPv4 case.)  Mobile IPv6 requires another
logic for routing header and destination options header
respectively. This patch is common platform for the offset and adopts
it to IPsec.

Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h              | 3 +++
 net/ipv6/ah6.c                  | 3 ++-
 net/ipv6/esp6.c                 | 3 ++-
 net/ipv6/ipcomp6.c              | 1 +
 net/ipv6/ipv6_syms.c            | 1 +
 net/ipv6/xfrm6_mode_transport.c | 2 +-
 net/ipv6/xfrm6_output.c         | 6 ++++++
 7 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d9c40e71318..eed48f832ce 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -265,6 +265,7 @@ struct xfrm_type
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
+	int			(*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **);
 	/* Estimate maximal size of result of transformation of a dgram */
 	u32			(*get_max_size)(struct xfrm_state *, int size);
 };
@@ -960,6 +961,8 @@ extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
 extern void xfrm6_tunnel_free_spi(xfrm_address_t *saddr);
 extern u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr);
 extern int xfrm6_output(struct sk_buff *skb);
+extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
+				 u8 **prevhdr);
 
 #ifdef CONFIG_XFRM
 extern int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type);
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 60954fc7eb3..6c0aa51319a 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -435,7 +435,8 @@ static struct xfrm_type ah6_type =
 	.init_state	= ah6_init_state,
 	.destructor	= ah6_destroy,
 	.input		= ah6_input,
-	.output		= ah6_output
+	.output		= ah6_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
 };
 
 static struct inet6_protocol ah6_protocol = {
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 2b8e52e1d0a..ae50b951115 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -379,7 +379,8 @@ static struct xfrm_type esp6_type =
 	.destructor	= esp6_destroy,
 	.get_max_size	= esp6_get_max_size,
 	.input		= esp6_input,
-	.output		= esp6_output
+	.output		= esp6_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
 };
 
 static struct inet6_protocol esp6_protocol = {
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 19eba8d9f85..ad9c6e824e6 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -461,6 +461,7 @@ static struct xfrm_type ipcomp6_type =
 	.destructor	= ipcomp6_destroy,
 	.input		= ipcomp6_input,
 	.output		= ipcomp6_output,
+	.hdr_offset	= xfrm6_find_1stfragopt,
 };
 
 static struct inet6_protocol ipcomp6_protocol = 
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index dd4d1ce7776..e1a74161288 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL(ipv6_chk_addr);
 EXPORT_SYMBOL(in6_dev_finish_destroy);
 #ifdef CONFIG_XFRM
 EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_find_1stfragopt);
 #endif
 EXPORT_SYMBOL(rt6_lookup);
 EXPORT_SYMBOL(ipv6_push_nfrag_opts);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 711d713e36d..a5dce216024 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -35,7 +35,7 @@ static int xfrm6_transport_output(struct sk_buff *skb)
 	skb_push(skb, x->props.header_len);
 	iph = skb->nh.ipv6h;
 
-	hdr_len = ip6_find_1stfragopt(skb, &prevhdr);
+	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
 	skb->nh.raw = prevhdr - x->props.header_len;
 	skb->h.raw = skb->data + hdr_len;
 	memmove(skb->data, iph, hdr_len);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 26f18869f77..b4628fbf8ff 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -17,6 +17,12 @@
 #include <net/ipv6.h>
 #include <net/xfrm.h>
 
+int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
+			  u8 **prevhdr)
+{
+	return ip6_find_1stfragopt(skb, prevhdr);
+}
+
 static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
-- 
cgit v1.2.3-70-g09d2


From fbd9a5b47ee9c319ff0cae584391241ce78ffd6b Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 18:08:21 -0700
Subject: [XFRM] STATE: Common receive function for route optimization
 extension headers.

XFRM_STATE_WILDRECV flag is introduced; the last resort state is set
it and receives packet which is not route optimized but uses such
extension headers i.e. Mobile IPv6 signaling (binding update and
acknowledgement).  A node enabled Mobile IPv6 adds the state.

Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h   |   1 +
 include/net/xfrm.h     |   2 +
 net/ipv6/ipv6_syms.c   |   1 +
 net/ipv6/xfrm6_input.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_state.c  |   1 +
 5 files changed, 113 insertions(+)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 66343d3d4b9..a7c9e4cfb15 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -256,6 +256,7 @@ struct xfrm_usersa_info {
 #define XFRM_STATE_NOECN	1
 #define XFRM_STATE_DECAP_DSCP	2
 #define XFRM_STATE_NOPMTUDISC	4
+#define XFRM_STATE_WILDRECV	8
 };
 
 struct xfrm_usersa_id {
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index eed48f832ce..0d735a5aba6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -955,6 +955,8 @@ extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
 extern int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi);
 extern int xfrm6_rcv(struct sk_buff **pskb);
+extern int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
+			    xfrm_address_t *saddr, u8 proto);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
 extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index e1a74161288..7b7b90d9c3d 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL(ipv6_chk_addr);
 EXPORT_SYMBOL(in6_dev_finish_destroy);
 #ifdef CONFIG_XFRM
 EXPORT_SYMBOL(xfrm6_rcv);
+EXPORT_SYMBOL(xfrm6_input_addr);
 EXPORT_SYMBOL(xfrm6_find_1stfragopt);
 #endif
 EXPORT_SYMBOL(rt6_lookup);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index ee2f6b3908b..a40a0578901 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -138,3 +138,111 @@ int xfrm6_rcv(struct sk_buff **pskb)
 {
 	return xfrm6_rcv_spi(*pskb, 0);
 }
+
+int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
+		     xfrm_address_t *saddr, u8 proto)
+{
+ 	struct xfrm_state *x = NULL;
+ 	int wildcard = 0;
+	struct in6_addr any;
+	xfrm_address_t *xany;
+	struct xfrm_state *xfrm_vec_one = NULL;
+ 	int nh = 0;
+	int i = 0;
+
+	ipv6_addr_set(&any, 0, 0, 0, 0);
+	xany = (xfrm_address_t *)&any;
+
+	for (i = 0; i < 3; i++) {
+		xfrm_address_t *dst, *src;
+		switch (i) {
+		case 0:
+			dst = daddr;
+			src = saddr;
+			break;
+		case 1:
+			/* lookup state with wild-card source address */
+			wildcard = 1;
+			dst = daddr;
+			src = xany;
+			break;
+		case 2:
+		default:
+ 			/* lookup state with wild-card addresses */
+			wildcard = 1; /* XXX */
+			dst = xany;
+			src = xany;
+			break;
+ 		}
+
+		x = xfrm_state_lookup_byaddr(dst, src, proto, AF_INET6);
+		if (!x)
+			continue;
+
+		spin_lock(&x->lock);
+
+		if (wildcard) {
+			if ((x->props.flags & XFRM_STATE_WILDRECV) == 0) {
+				spin_unlock(&x->lock);
+				xfrm_state_put(x);
+				x = NULL;
+				continue;
+			}
+		}
+
+		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
+			spin_unlock(&x->lock);
+			xfrm_state_put(x);
+ 			x = NULL;
+ 			continue;
+		}
+		if (xfrm_state_check_expire(x)) {
+			spin_unlock(&x->lock);
+			xfrm_state_put(x);
+			x = NULL;
+			continue;
+		}
+
+		nh = x->type->input(x, skb);
+		if (nh <= 0) {
+			spin_unlock(&x->lock);
+			xfrm_state_put(x);
+			x = NULL;
+			continue;
+		}
+
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
+
+		spin_unlock(&x->lock);
+
+		xfrm_vec_one = x;
+		break;
+	}
+
+	if (!xfrm_vec_one)
+		goto drop;
+
+	/* Allocate new secpath or COW existing one. */
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		struct sec_path *sp;
+		sp = secpath_dup(skb->sp);
+		if (!sp)
+			goto drop;
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+
+	if (1 + skb->sp->len > XFRM_MAX_DEPTH)
+		goto drop;
+
+	skb->sp->xvec[skb->sp->len] = xfrm_vec_one;
+	skb->sp->len ++;
+
+	return 1;
+drop:
+	if (xfrm_vec_one)
+		xfrm_state_put(xfrm_vec_one);
+	return -1;
+}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 11f480b1295..f05371556cc 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -352,6 +352,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
 		if (x->props.family == family &&
 		    x->props.reqid == tmpl->reqid &&
+		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
 		    xfrm_state_addr_check(x, daddr, saddr, family) &&
 		    tmpl->mode == x->props.mode &&
 		    tmpl->id.proto == x->id.proto &&
-- 
cgit v1.2.3-70-g09d2


From 99505a843673faeae962a8cde128c7c034ba6b5e Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 18:10:33 -0700
Subject: [XFRM] STATE: Add a hook to obtain local/remote outbound address.

Outbound transformation replaces both source and destination address with
state's end-point addresses at the same time when IPsec tunnel mode.
It is also required to change them for Mobile IPv6 route optimization, but we
should care about the following differences:
 - changing result is not end-point but care-of address
 - either source or destination is replaced for each state
This hook is a common platform to change outbound address.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      |  2 ++
 net/ipv6/xfrm6_policy.c | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0d735a5aba6..aa3ac994477 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -266,6 +266,8 @@ struct xfrm_type
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
 	int			(*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **);
+	xfrm_address_t		*(*local_addr)(struct xfrm_state *, xfrm_address_t *);
+	xfrm_address_t		*(*remote_addr)(struct xfrm_state *, xfrm_address_t *);
 	/* Estimate maximal size of result of transformation of a dgram */
 	u32			(*get_max_size)(struct xfrm_state *, int size);
 };
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 81355bb5032..9328fc88708 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -59,6 +59,22 @@ __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 	return dst;
 }
 
+static inline struct in6_addr*
+__xfrm6_bundle_addr_remote(struct xfrm_state *x, struct in6_addr *addr)
+{
+	return (x->type->remote_addr) ?
+		(struct in6_addr*)x->type->remote_addr(x, (xfrm_address_t *)addr) :
+		(struct in6_addr*)&x->id.daddr;
+}
+
+static inline struct in6_addr*
+__xfrm6_bundle_addr_local(struct xfrm_state *x, struct in6_addr *addr)
+{
+	return (x->type->local_addr) ?
+		(struct in6_addr*)x->type->local_addr(x, (xfrm_address_t *)addr) :
+		(struct in6_addr*)&x->props.saddr;
+}
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -115,8 +131,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 		dst1->next = dst_prev;
 		dst_prev = dst1;
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
-			remote = (struct in6_addr*)&xfrm[i]->id.daddr;
-			local  = (struct in6_addr*)&xfrm[i]->props.saddr;
+			remote = __xfrm6_bundle_addr_remote(xfrm[i], remote);
+			local  = __xfrm6_bundle_addr_local(xfrm[i], local);
 			tunnel = 1;
 		}
 		header_len += xfrm[i]->props.header_len;
-- 
cgit v1.2.3-70-g09d2


From 1b5c229987dc4d0c92a38fac0cde2aeec08cd775 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 18:11:50 -0700
Subject: [XFRM] STATE: Support non-fragment outbound transformation headers.

For originated outbound IPv6 packets which will fragment, ip6_append_data()
should know length of extension headers before sending them and
the length is carried by dst_entry.
IPv6 IPsec headers fragment then transformation was
designed to place all headers after fragment header.
OTOH Mobile IPv6 extension headers do not fragment then
it is a good idea to make dst_entry have non-fragment length to tell it
to ip6_append_data().

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h       |  1 +
 include/net/xfrm.h      |  2 ++
 net/ipv4/xfrm4_policy.c |  1 +
 net/ipv6/ip6_output.c   |  2 +-
 net/ipv6/xfrm6_policy.c | 24 ++++++++++++++++++++++--
 5 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 36d54fc248b..a8d825f9030 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -54,6 +54,7 @@ struct dst_entry
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
+	unsigned short		nfheader_len;	/* more non-fragment space at head required */
 	unsigned short		trailer_len;	/* space to reserve at tail */
 
 	u32			metrics[RTAX_MAX];
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index aa3ac994477..aa93cc1f629 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -260,6 +260,8 @@ struct xfrm_type
 	char			*description;
 	struct module		*owner;
 	__u8			proto;
+	__u8			flags;
+#define XFRM_TYPE_NON_FRAGMENT	1
 
 	int			(*init_state)(struct xfrm_state *x);
 	void			(*destructor)(struct xfrm_state *);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a5bed741de2..e517981cead 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -135,6 +135,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 		dst_prev->flags	       |= DST_HOST;
 		dst_prev->lastuse	= jiffies;
 		dst_prev->header_len	= header_len;
+		dst_prev->nfheader_len	= 0;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2a376b7d91b..258e3e45f5e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -971,7 +971,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 
 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 
-	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
+	fragheaderlen = sizeof(struct ipv6hdr) + rt->u.dst.nfheader_len + (opt ? opt->opt_nflen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 
 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 9328fc88708..a3f68c8b737 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -75,6 +75,24 @@ __xfrm6_bundle_addr_local(struct xfrm_state *x, struct in6_addr *addr)
 		(struct in6_addr*)&x->props.saddr;
 }
 
+static inline void
+__xfrm6_bundle_len_inc(int *len, int *nflen, struct xfrm_state *x)
+{
+	if (x->type->flags & XFRM_TYPE_NON_FRAGMENT)
+		*nflen += x->props.header_len;
+	else
+		*len += x->props.header_len;
+}
+
+static inline void
+__xfrm6_bundle_len_dec(int *len, int *nflen, struct xfrm_state *x)
+{
+	if (x->type->flags & XFRM_TYPE_NON_FRAGMENT)
+		*nflen -= x->props.header_len;
+	else
+		*len -= x->props.header_len;
+}
+
 /* Allocate chain of dst_entry's, attach known xfrm's, calculate
  * all the metrics... Shortly, bundle a bundle.
  */
@@ -99,6 +117,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 	int i;
 	int err = 0;
 	int header_len = 0;
+	int nfheader_len = 0;
 	int trailer_len = 0;
 
 	dst = dst_prev = NULL;
@@ -135,7 +154,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 			local  = __xfrm6_bundle_addr_local(xfrm[i], local);
 			tunnel = 1;
 		}
-		header_len += xfrm[i]->props.header_len;
+		__xfrm6_bundle_len_inc(&header_len, &nfheader_len, xfrm[i]);
 		trailer_len += xfrm[i]->props.trailer_len;
 
 		if (tunnel) {
@@ -170,6 +189,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 		dst_prev->flags	       |= DST_HOST;
 		dst_prev->lastuse	= jiffies;
 		dst_prev->header_len	= header_len;
+		dst_prev->nfheader_len	= nfheader_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
 
@@ -188,7 +208,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 		x->u.rt6.rt6i_src      = rt0->rt6i_src;	
 		x->u.rt6.rt6i_idev     = rt0->rt6i_idev;
 		in6_dev_hold(rt0->rt6i_idev);
-		header_len -= x->u.dst.xfrm->props.header_len;
+		__xfrm6_bundle_len_dec(&header_len, &nfheader_len, x->u.dst.xfrm);
 		trailer_len -= x->u.dst.xfrm->props.trailer_len;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 060f02a3bdd4d9ba8aa3c48e9b470672b1f3a585 Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Wed, 23 Aug 2006 18:18:55 -0700
Subject: [XFRM] STATE: Introduce care-of address.

Care-of address is carried by state as a transformation option like
IPsec encryption/authentication algorithm.

Based on MIPL2 kernel patch.

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/linux/xfrm.h  |  1 +
 include/net/xfrm.h    |  3 +++
 net/xfrm/xfrm_state.c |  6 ++++++
 net/xfrm/xfrm_user.c  | 28 +++++++++++++++++++++++++++-
 4 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index a7c9e4cfb15..b53f799189a 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -235,6 +235,7 @@ enum xfrm_attr_type_t {
 	XFRMA_REPLAY_THRESH,
 	XFRMA_ETIMER_THRESH,
 	XFRMA_SRCADDR,		/* xfrm_address_t */
+	XFRMA_COADDR,		/* xfrm_address_t */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index aa93cc1f629..872a2a4022b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -134,6 +134,9 @@ struct xfrm_state
 	/* Data for encapsulator */
 	struct xfrm_encap_tmpl	*encap;
 
+	/* Data for care-of address */
+	xfrm_address_t	*coaddr;
+
 	/* IPComp needs an IPIP tunnel for handling uncompressed packets */
 	struct xfrm_state	*tunnel;
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f05371556cc..3da89c01ea7 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -78,6 +78,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 	kfree(x->ealg);
 	kfree(x->calg);
 	kfree(x->encap);
+	kfree(x->coaddr);
 	if (x->mode)
 		xfrm_put_mode(x->mode);
 	if (x->type) {
@@ -603,6 +604,11 @@ out:
 	if (likely(x1->km.state == XFRM_STATE_VALID)) {
 		if (x->encap && x1->encap)
 			memcpy(x1->encap, x->encap, sizeof(*x1->encap));
+		if (x->coaddr && x1->coaddr) {
+			memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr));
+		}
+		if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel)))
+			memcpy(&x1->sel, &x->sel, sizeof(x1->sel));
 		memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
 		x1->km.dying = 0;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b5f8ab71aa5..939808de9e2 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -187,11 +187,14 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_sec_ctx_len(xfrma)))
 		goto out;
+	if ((err = verify_one_addr(xfrma, XFRMA_COADDR, NULL)))
+		goto out;
 
 	err = -EINVAL;
 	switch (p->mode) {
 	case XFRM_MODE_TRANSPORT:
 	case XFRM_MODE_TUNNEL:
+	case XFRM_MODE_ROUTEOPTIMIZATION:
 		break;
 
 	default:
@@ -276,6 +279,24 @@ static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg)
 	return security_xfrm_state_alloc(x, uctx);
 }
 
+static int attach_one_addr(xfrm_address_t **addrpp, struct rtattr *u_arg)
+{
+	struct rtattr *rta = u_arg;
+	xfrm_address_t *p, *uaddrp;
+
+	if (!rta)
+		return 0;
+
+	uaddrp = RTA_DATA(rta);
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	memcpy(p, uaddrp, sizeof(*p));
+	*addrpp = p;
+	return 0;
+}
+
 static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
 {
 	memcpy(&x->id, &p->id, sizeof(x->id));
@@ -365,7 +386,8 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
 		goto error;
 	if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
 		goto error;
-
+	if ((err = attach_one_addr(&x->coaddr, xfrma[XFRMA_COADDR-1])))
+		goto error;
 	err = xfrm_init_state(x);
 	if (err)
 		goto error;
@@ -569,6 +591,10 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
 		uctx->ctx_len = x->security->ctx_len;
 		memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len);
 	}
+
+	if (x->coaddr)
+		RTA_PUT(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
+
 	nlh->nlmsg_len = skb->tail - b;
 out:
 	sp->this_idx++;
-- 
cgit v1.2.3-70-g09d2


From 9afaca057980c02771f4657c455cc7592fcd7373 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 18:20:16 -0700
Subject: [XFRM] IPV6: Update outbound state timestamp for each sending.

With this patch transformation state is updated last used time
for each sending. Xtime is used for it like other state lifetime
expiration.
Mobile IPv6 enabled nodes will want to know traffic status of each
binding (e.g. judgement to request binding refresh by correspondent node,
or to keep home/care-of nonce alive by mobile node).
The last used timestamp is an important hint about it.
Based on MIPL2 kernel patch.

This patch was also written by: Henrik Petander <petander@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h    | 1 +
 include/net/xfrm.h      | 3 +++
 net/ipv6/xfrm6_output.c | 2 ++
 net/xfrm/xfrm_user.c    | 3 +++
 4 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index b53f799189a..1d8c1f22c12 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -236,6 +236,7 @@ enum xfrm_attr_type_t {
 	XFRMA_ETIMER_THRESH,
 	XFRMA_SRCADDR,		/* xfrm_address_t */
 	XFRMA_COADDR,		/* xfrm_address_t */
+	XFRMA_LASTUSED,
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 872a2a4022b..248874ecf8d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -167,6 +167,9 @@ struct xfrm_state
 	struct xfrm_lifetime_cur curlft;
 	struct timer_list	timer;
 
+	/* Last used time */
+	u64			lastused;
+
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	struct xfrm_type	*type;
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index b4628fbf8ff..db58104e710 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -75,6 +75,8 @@ static int xfrm6_output_one(struct sk_buff *skb)
 
 		x->curlft.bytes += skb->len;
 		x->curlft.packets++;
+		if (x->props.mode == XFRM_MODE_ROUTEOPTIMIZATION)
+			x->lastused = (u64)xtime.tv_sec;
 
 		spin_unlock_bh(&x->lock);
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 939808de9e2..f643063a1cb 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -595,6 +595,9 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr)
 	if (x->coaddr)
 		RTA_PUT(skb, XFRMA_COADDR, sizeof(*x->coaddr), x->coaddr);
 
+	if (x->lastused)
+		RTA_PUT(skb, XFRMA_LASTUSED, sizeof(x->lastused), &x->lastused);
+
 	nlh->nlmsg_len = skb->tail - b;
 out:
 	sp->this_idx++;
-- 
cgit v1.2.3-70-g09d2


From e53820de0f81da1429048634cadc6ef5f50c2f8b Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 19:12:01 -0700
Subject: [XFRM] IPV6: Restrict bundle reusing

For outbound transformation, bundle is checked whether it is
suitable for current flow to be reused or not. In such IPv6 case
as below, transformation may apply incorrect bundle for the flow instead
of creating another bundle:

- The policy selector has destination prefix length < 128
  (Two or more addresses can be matched it)
- Its bundle holds dst entry of default route whose prefix length < 128
  (Previous traffic was used such route as next hop)
- The policy and the bundle were used a transport mode state and
  this time flow address is not matched the bundled state.

This issue is found by Mobile IPv6 usage to protect mobility signaling
by IPsec, but it is not a Mobile IPv6 specific.
This patch adds strict check to xfrm_bundle_ok() for each
state mode and address when prefix length is less than 128.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      | 19 ++++++++++++++++++-
 net/ipv4/xfrm4_policy.c |  2 +-
 net/ipv6/xfrm6_policy.c |  4 +++-
 net/xfrm/xfrm_policy.c  |  8 ++++++--
 4 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 248874ecf8d..7f1630630dc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -869,6 +869,23 @@ xfrm_state_addr_check(struct xfrm_state *x,
 	return 0;
 }
 
+static __inline__ int
+xfrm_state_addr_flow_check(struct xfrm_state *x, struct flowi *fl,
+			   unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return __xfrm4_state_addr_check(x,
+						(xfrm_address_t *)&fl->fl4_dst,
+						(xfrm_address_t *)&fl->fl4_src);
+	case AF_INET6:
+		return __xfrm6_state_addr_check(x,
+						(xfrm_address_t *)&fl->fl6_dst,
+						(xfrm_address_t *)&fl->fl6_src);
+	}
+	return 0;
+}
+
 static inline int xfrm_state_kern(struct xfrm_state *x)
 {
 	return atomic_read(&x->tunnel_users);
@@ -1014,7 +1031,7 @@ extern void xfrm_policy_flush(void);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_flush_bundles(void);
 extern void xfrm_flush_all_bundles(void);
-extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family);
+extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family, int strict);
 extern void xfrm_init_pmtu(struct dst_entry *dst);
 
 extern wait_queue_head_t km_waitq;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e517981cead..42d8ded0f96 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -33,7 +33,7 @@ __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
 	    	    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
 	    	    xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
-		    xfrm_bundle_ok(xdst, fl, AF_INET)) {
+		    xfrm_bundle_ok(xdst, fl, AF_INET, 0)) {
 			dst_clone(dst);
 			break;
 		}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index a3f68c8b737..729b4748d6d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -50,7 +50,9 @@ __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 				 xdst->u.rt6.rt6i_src.plen);
 		if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
 		    ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
-		    xfrm_bundle_ok(xdst, fl, AF_INET6)) {
+		    xfrm_bundle_ok(xdst, fl, AF_INET6,
+				   (xdst->u.rt6.rt6i_dst.plen != 128 ||
+				    xdst->u.rt6.rt6i_src.plen != 128))) {
 			dst_clone(dst);
 			break;
 		}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 56abb5c057d..ad2a5cba1f5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1167,7 +1167,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 
 static int stale_bundle(struct dst_entry *dst)
 {
-	return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC);
+	return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC, 0);
 }
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
@@ -1282,7 +1282,7 @@ EXPORT_SYMBOL(xfrm_init_pmtu);
  * still valid.
  */
 
-int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
+int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family, int strict)
 {
 	struct dst_entry *dst = &first->u.dst;
 	struct xfrm_dst *last;
@@ -1304,6 +1304,10 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
 
+		if (strict && fl && dst->xfrm->props.mode != XFRM_MODE_TUNNEL &&
+		    !xfrm_state_addr_flow_check(dst->xfrm, fl, family))
+			return 0;
+
 		mtu = dst_mtu(dst->child);
 		if (xdst->child_mtu_cached != mtu) {
 			last = xdst;
-- 
cgit v1.2.3-70-g09d2


From 642ec62eee5bdc158e01029220c8a23c685778fb Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Wed, 23 Aug 2006 19:15:07 -0700
Subject: [IPV6] MIP6: Add routing header type 2 definition.

Add routing header type 2 definition for Mobile IPv6.
Based on MIPL2 kernel patch.

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/linux/ipv6.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 02d14a3ff2a..d995662e94c 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -29,6 +29,7 @@ struct in6_ifreq {
 
 #define IPV6_SRCRT_STRICT	0x01	/* this hop must be a neighbor	*/
 #define IPV6_SRCRT_TYPE_0	0	/* IPv6 type 0 Routing Header	*/
+#define IPV6_SRCRT_TYPE_2	2	/* IPv6 type 2 Routing Header	*/
 
 /*
  *	routing header
@@ -73,6 +74,18 @@ struct rt0_hdr {
 #define rt0_type		rt_hdr.type
 };
 
+/*
+ *	routing header type 2
+ */
+
+struct rt2_hdr {
+	struct ipv6_rt_hdr	rt_hdr;
+	__u32			reserved;
+	struct in6_addr		addr;
+
+#define rt2_type		rt_hdr.type
+};
+
 struct ipv6_auth_hdr {
 	__u8  nexthdr;
 	__u8  hdrlen;           /* This one is measured in 32 bit units! */
-- 
cgit v1.2.3-70-g09d2


From 65d4ed92219b28875efb52de5700da8c3dfa83e1 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 19:16:22 -0700
Subject: [IPV6] MIP6: Add inbound interface of routing header type 2.

Add inbound interface of routing header type 2 for Mobile IPv6.
Based on MIPL2 kernel patch.

This patch was also written by: Ville Nuorvala <vnuorval@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h |  7 +++++
 net/ipv6/exthdrs.c     | 69 ++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 68 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 3d71251b3ec..5fc8627435e 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -61,6 +61,13 @@ extern int			addrconf_set_dstaddr(void __user *arg);
 extern int			ipv6_chk_addr(struct in6_addr *addr,
 					      struct net_device *dev,
 					      int strict);
+/* XXX: this is a placeholder till addrconf supports */
+#ifdef CONFIG_IPV6_MIP6
+static inline int ipv6_chk_home_addr(struct in6_addr *addr)
+{
+	return 0;
+}
+#endif
 extern struct inet6_ifaddr *	ipv6_get_ifaddr(struct in6_addr *addr,
 						struct net_device *dev,
 						int strict);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 05afa6b1912..8d3a0e17314 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -43,6 +43,9 @@
 #include <net/ndisc.h>
 #include <net/ip6_route.h>
 #include <net/addrconf.h>
+#ifdef CONFIG_IPV6_MIP6
+#include <net/xfrm.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -219,7 +222,7 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
-	struct in6_addr *addr;
+	struct in6_addr *addr = NULL;
 	struct in6_addr daddr;
 	int n, i;
 
@@ -244,6 +247,23 @@ static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 
 looped_back:
 	if (hdr->segments_left == 0) {
+		switch (hdr->type) {
+#ifdef CONFIG_IPV6_MIP6
+		case IPV6_SRCRT_TYPE_2:
+			/* Silently discard type 2 header unless it was
+			 * processed by own
+			 */
+			if (!addr) {
+				IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
+				kfree_skb(skb);
+				return -1;
+			}
+			break;
+#endif
+		default:
+			break;
+		}
+
 		opt->lastopt = skb->h.raw - skb->nh.raw;
 		opt->srcrt = skb->h.raw - skb->nh.raw;
 		skb->h.raw += (hdr->hdrlen + 1) << 3;
@@ -253,17 +273,29 @@ looped_back:
 		return 1;
 	}
 
-	if (hdr->type != IPV6_SRCRT_TYPE_0) {
+	switch (hdr->type) {
+	case IPV6_SRCRT_TYPE_0:
+		if (hdr->hdrlen & 0x01) {
+			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+			icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw);
+			return -1;
+		}
+		break;
+#ifdef CONFIG_IPV6_MIP6
+	case IPV6_SRCRT_TYPE_2:
+		/* Silently discard invalid RTH type 2 */
+		if (hdr->hdrlen != 2 || hdr->segments_left != 1) {
+			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		break;
+#endif
+	default:
 		IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb->nh.raw);
 		return -1;
 	}
-	
-	if (hdr->hdrlen & 0x01) {
-		IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
-		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw);
-		return -1;
-	}
 
 	/*
 	 *	This is the routing header forwarding algorithm from
@@ -303,6 +335,27 @@ looped_back:
 	addr = rthdr->addr;
 	addr += i - 1;
 
+	switch (hdr->type) {
+#ifdef CONFIG_IPV6_MIP6
+	case IPV6_SRCRT_TYPE_2:
+		if (xfrm6_input_addr(skb, (xfrm_address_t *)addr,
+				     (xfrm_address_t *)&skb->nh.ipv6h->saddr,
+				     IPPROTO_ROUTING) < 0) {
+			IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		if (!ipv6_chk_home_addr(addr)) {
+			IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		break;
+#endif
+	default:
+		break;
+	}
+
 	if (ipv6_addr_is_multicast(addr)) {
 		IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
 		kfree_skb(skb);
-- 
cgit v1.2.3-70-g09d2


From c61a404325093250b676f40ad8f4dd00f3bcab5f Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 19:18:35 -0700
Subject: [IPV6]: Find option offset by type.

This is a helper to search option offset from extension header which
can carry TLV option like destination options header.

Mobile IPv6 home address option will use it.

Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h |  2 ++
 net/ipv6/exthdrs.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index ece7e8a84ff..c4ea1271057 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -506,6 +506,8 @@ extern int			ipv6_skip_exthdr(const struct sk_buff *, int start,
 
 extern int 			ipv6_ext_hdr(u8 nexthdr);
 
+extern int ipv6_find_tlv(struct sk_buff *skb, int offset, int type);
+
 extern struct ipv6_txoptions *	ipv6_invert_rthdr(struct sock *sk,
 						  struct ipv6_rt_hdr *hdr);
 
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 8d3a0e17314..50ff49e518b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -49,6 +49,49 @@
 
 #include <asm/uaccess.h>
 
+int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+{
+	int packet_len = skb->tail - skb->nh.raw;
+	struct ipv6_opt_hdr *hdr;
+	int len;
+
+	if (offset + 2 > packet_len)
+		goto bad;
+	hdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
+	len = ((hdr->hdrlen + 1) << 3);
+
+	if (offset + len > packet_len)
+		goto bad;
+
+	offset += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int opttype = skb->nh.raw[offset];
+		int optlen;
+
+		if (opttype == type)
+			return offset;
+
+		switch (opttype) {
+		case IPV6_TLV_PAD0:
+			optlen = 1;
+			break;
+		default:
+			optlen = skb->nh.raw[offset + 1] + 2;
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		offset += optlen;
+		len -= optlen;
+	}
+	/* not_found */
+	return -1;
+ bad:
+	return -1;
+}
+
 /*
  *	Parsing tlv encoded headers.
  *
-- 
cgit v1.2.3-70-g09d2


From a80ff03e05e4343d647780c116b02ec86078fd24 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 19:19:50 -0700
Subject: [IPV6]: Allow to replace skbuff by TLV parser.

In receiving Mobile IPv6 home address option which is a TLV carried by
destination options header, kernel will try to mangle source adderss
of packet. Think of cloned skbuff it is required to replace it by the
parser just like routing header case.

This is a framework to achieve that to allow TLV parser to replace
inbound skbuff pointer.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h   |  2 +-
 net/ipv6/exthdrs.c   | 29 +++++++++++++++++++----------
 net/ipv6/ip6_input.c |  2 +-
 3 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index c4ea1271057..8e6ec6063f8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -229,7 +229,7 @@ extern int 			ip6_ra_control(struct sock *sk, int sel,
 					       void (*destructor)(struct sock *));
 
 
-extern int			ipv6_parse_hopopts(struct sk_buff *skb);
+extern int			ipv6_parse_hopopts(struct sk_buff **skbp);
 
 extern struct ipv6_txoptions *  ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt);
 extern struct ipv6_txoptions *	ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 50ff49e518b..1cdd0f0b5d3 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -102,7 +102,7 @@ int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
 
 struct tlvtype_proc {
 	int	type;
-	int	(*func)(struct sk_buff *skb, int offset);
+	int	(*func)(struct sk_buff **skbp, int offset);
 };
 
 /*********************
@@ -111,8 +111,10 @@ struct tlvtype_proc {
 
 /* An unknown option is detected, decide what to do */
 
-static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
+static int ip6_tlvopt_unknown(struct sk_buff **skbp, int optoff)
 {
+	struct sk_buff *skb = *skbp;
+
 	switch ((skb->nh.raw[optoff] & 0xC0) >> 6) {
 	case 0: /* ignore */
 		return 1;
@@ -137,8 +139,9 @@ static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff)
 
 /* Parse tlv encoded option header (hop-by-hop or destination) */
 
-static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb)
+static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff **skbp)
 {
+	struct sk_buff *skb = *skbp;
 	struct tlvtype_proc *curr;
 	int off = skb->h.raw - skb->nh.raw;
 	int len = ((skb->h.raw[1]+1)<<3);
@@ -168,13 +171,13 @@ static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb)
 					/* type specific length/alignment 
 					   checks will be performed in the 
 					   func(). */
-					if (curr->func(skb, off) == 0)
+					if (curr->func(skbp, off) == 0)
 						return 0;
 					break;
 				}
 			}
 			if (curr->type < 0) {
-				if (ip6_tlvopt_unknown(skb, off) == 0)
+				if (ip6_tlvopt_unknown(skbp, off) == 0)
 					return 0;
 			}
 			break;
@@ -213,7 +216,8 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 	opt->lastopt = skb->h.raw - skb->nh.raw;
 	opt->dst1 = skb->h.raw - skb->nh.raw;
 
-	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
+	if (ip6_parse_tlv(tlvprocdestopt_lst, skbp)) {
+		skb = *skbp;
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
 		opt->nhoff = opt->dst1;
 		return 1;
@@ -517,8 +521,10 @@ EXPORT_SYMBOL_GPL(ipv6_invert_rthdr);
 
 /* Router Alert as of RFC 2711 */
 
-static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
+static int ipv6_hop_ra(struct sk_buff **skbp, int optoff)
 {
+	struct sk_buff *skb = *skbp;
+
 	if (skb->nh.raw[optoff+1] == 2) {
 		IP6CB(skb)->ra = optoff;
 		return 1;
@@ -531,8 +537,9 @@ static int ipv6_hop_ra(struct sk_buff *skb, int optoff)
 
 /* Jumbo payload */
 
-static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
+static int ipv6_hop_jumbo(struct sk_buff **skbp, int optoff)
 {
+	struct sk_buff *skb = *skbp;
 	u32 pkt_len;
 
 	if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) {
@@ -581,8 +588,9 @@ static struct tlvtype_proc tlvprochopopt_lst[] = {
 	{ -1, }
 };
 
-int ipv6_parse_hopopts(struct sk_buff *skb)
+int ipv6_parse_hopopts(struct sk_buff **skbp)
 {
+	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
 
 	/*
@@ -598,7 +606,8 @@ int ipv6_parse_hopopts(struct sk_buff *skb)
 	}
 
 	opt->hop = sizeof(struct ipv6hdr);
-	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+	if (ip6_parse_tlv(tlvprochopopt_lst, skbp)) {
+		skb = *skbp;
 		skb->h.raw += (skb->h.raw[1]+1)<<3;
 		opt->nhoff = sizeof(struct ipv6hdr);
 		return 1;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 25c2a9e0389..6b8e6d76a58 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -111,7 +111,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	}
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
-		if (ipv6_parse_hopopts(skb) < 0) {
+		if (ipv6_parse_hopopts(&skb) < 0) {
 			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 			return 0;
 		}
-- 
cgit v1.2.3-70-g09d2


From 842426e719f86cd5709617208efae93ff1a1e2d8 Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Wed, 23 Aug 2006 19:21:34 -0700
Subject: [IPV6] MIP6: Add home address option definition.

Add home address option definition for Mobile IPv6.
Based on MIPL2 kernel patch.

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in6.h  |  1 +
 include/linux/ipv6.h | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/in6.h b/include/linux/in6.h
index 304aaedea30..086ec2ac8c5 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -142,6 +142,7 @@ struct in6_flowlabel_req
 #define IPV6_TLV_PADN		1
 #define IPV6_TLV_ROUTERALERT	5
 #define IPV6_TLV_JUMBO		194
+#define IPV6_TLV_HAO		201	/* home address option */
 
 /*
  *	IPV6 socket options
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index d995662e94c..5bf4406e26d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -86,6 +86,16 @@ struct rt2_hdr {
 #define rt2_type		rt_hdr.type
 };
 
+/*
+ *	home address option in destination options header
+ */
+
+struct ipv6_destopt_hao {
+	__u8			type;
+	__u8			length;
+	struct in6_addr		addr;
+} __attribute__ ((__packed__));
+
 struct ipv6_auth_hdr {
 	__u8  nexthdr;
 	__u8  hdrlen;           /* This one is measured in 32 bit units! */
-- 
cgit v1.2.3-70-g09d2


From a831f5bbc89a9978795504be9e1ff412043f8f77 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 19:24:48 -0700
Subject: [IPV6] MIP6: Add inbound interface of home address option.

Add inbound function of home address option by registering it to TLV
table for destination options header.

Based on MIPL2 kernel patch.

This patch was also written by: Ville Nuorvala <vnuorval@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h |  3 ++
 net/ipv6/exthdrs.c   | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 86 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5bf4406e26d..db3b2ba0f4f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -226,6 +226,9 @@ struct inet6_skb_parm {
 	__u16			dst0;
 	__u16			srcrt;
 	__u16			dst1;
+#ifdef CONFIG_IPV6_MIP6
+	__u16			dsthao;
+#endif
 	__u16			lastopt;
 	__u32			nhoff;
 	__u16			flags;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 1cdd0f0b5d3..6a6466bb5f2 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -196,8 +196,80 @@ bad:
   Destination options header.
  *****************************/
 
+#ifdef CONFIG_IPV6_MIP6
+static int ipv6_dest_hao(struct sk_buff **skbp, int optoff)
+{
+	struct sk_buff *skb = *skbp;
+	struct ipv6_destopt_hao *hao;
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->nh.raw;
+	struct in6_addr tmp_addr;
+	int ret;
+
+	if (opt->dsthao) {
+		LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n");
+		goto discard;
+	}
+	opt->dsthao = opt->dst1;
+	opt->dst1 = 0;
+
+	hao = (struct ipv6_destopt_hao *)(skb->nh.raw + optoff);
+
+	if (hao->length != 16) {
+		LIMIT_NETDEBUG(
+			KERN_DEBUG "hao invalid option length = %d\n", hao->length);
+		goto discard;
+	}
+
+	if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
+		LIMIT_NETDEBUG(
+			KERN_DEBUG "hao is not an unicast addr: " NIP6_FMT "\n", NIP6(hao->addr));
+		goto discard;
+	}
+
+	ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
+			       (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
+	if (unlikely(ret < 0))
+		goto discard;
+
+	if (skb_cloned(skb)) {
+		struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
+		if (skb2 == NULL)
+			goto discard;
+
+		kfree_skb(skb);
+
+		/* update all variable using below by copied skbuff */
+		*skbp = skb = skb2;
+		hao = (struct ipv6_destopt_hao *)(skb2->nh.raw + optoff);
+		ipv6h = (struct ipv6hdr *)skb2->nh.raw;
+	}
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		skb->ip_summed = CHECKSUM_NONE;
+
+	ipv6_addr_copy(&tmp_addr, &ipv6h->saddr);
+	ipv6_addr_copy(&ipv6h->saddr, &hao->addr);
+	ipv6_addr_copy(&hao->addr, &tmp_addr);
+
+	if (skb->tstamp.off_sec == 0)
+		__net_timestamp(skb);
+
+	return 1;
+
+ discard:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
 static struct tlvtype_proc tlvprocdestopt_lst[] = {
-	/* No destination options are defined now */
+#ifdef CONFIG_IPV6_MIP6
+	{
+		.type	= IPV6_TLV_HAO,
+		.func	= ipv6_dest_hao,
+	},
+#endif
 	{-1,			NULL}
 };
 
@@ -205,6 +277,9 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
+#ifdef CONFIG_IPV6_MIP6
+	__u16 dstbuf;
+#endif
 
 	if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) ||
 	    !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) {
@@ -215,11 +290,18 @@ static int ipv6_destopt_rcv(struct sk_buff **skbp)
 
 	opt->lastopt = skb->h.raw - skb->nh.raw;
 	opt->dst1 = skb->h.raw - skb->nh.raw;
+#ifdef CONFIG_IPV6_MIP6
+	dstbuf = opt->dst1;
+#endif
 
 	if (ip6_parse_tlv(tlvprocdestopt_lst, skbp)) {
 		skb = *skbp;
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
+#ifdef CONFIG_IPV6_MIP6
+		opt->nhoff = dstbuf;
+#else
 		opt->nhoff = opt->dst1;
+#endif
 		return 1;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 8dd7368dd97def967bbb3aec67b882e8dfd1a528 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 23 Aug 2006 19:25:55 -0700
Subject: [IPV6]: Put dsthao after flags in order to pack inet6_skb_parm
 better.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index db3b2ba0f4f..1d6d3ccc941 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -226,12 +226,12 @@ struct inet6_skb_parm {
 	__u16			dst0;
 	__u16			srcrt;
 	__u16			dst1;
-#ifdef CONFIG_IPV6_MIP6
-	__u16			dsthao;
-#endif
 	__u16			lastopt;
 	__u32			nhoff;
 	__u16			flags;
+#ifdef CONFIG_IPV6_MIP6
+	__u16			dsthao;
+#endif
 
 #define IP6SKB_XFRM_TRANSFORMED	1
 };
-- 
cgit v1.2.3-70-g09d2


From 2c8d7ca0f76103855ad1f2a930e05683b64a00eb Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Wed, 23 Aug 2006 20:31:11 -0700
Subject: [IPV6] MIP6: Add routing header type 2 transformation.

Add routing header type 2 transformation for Mobile IPv6.
Based on MIPL2 kernel patch.

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mip6.h  |  31 +++++++++
 net/ipv6/Makefile   |   2 +
 net/ipv6/af_inet6.c |   9 +++
 net/ipv6/mip6.c     | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+)
 create mode 100644 include/net/mip6.h
 create mode 100644 net/ipv6/mip6.c

(limited to 'include')

diff --git a/include/net/mip6.h b/include/net/mip6.h
new file mode 100644
index 00000000000..644b8b67304
--- /dev/null
+++ b/include/net/mip6.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * Authors:
+ *	Noriaki TAKAMIYA @USAGI
+ *	Masahide NAKAMURA @USAGI
+ *	YOSHIFUJI Hideaki @USAGI
+ */
+#ifndef _NET_MIP6_H
+#define _NET_MIP6_H
+
+extern int mip6_init(void);
+extern void mip6_fini(void);
+
+#endif
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 87e912e3192..0213c6612b5 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -14,6 +14,8 @@ ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
 	xfrm6_output.o
 ipv6-$(CONFIG_NETFILTER) += netfilter.o
 ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
+ipv6-$(CONFIG_IPV6_MIP6) += mip6.o
+
 ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 57ee5ddea96..fc9c8a99bea 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -59,6 +59,9 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -857,6 +860,9 @@ static int __init inet6_init(void)
 	ipv6_frag_init();
 	ipv6_nodata_init();
 	ipv6_destopt_init();
+#ifdef CONFIG_IPV6_MIP6
+	mip6_init();
+#endif
 
 	/* Init v6 transport protocols. */
 	udpv6_init();
@@ -919,6 +925,9 @@ static void __exit inet6_exit(void)
  	udp6_proc_exit();
  	tcp6_proc_exit();
  	raw6_proc_exit();
+#endif
+#ifdef CONFIG_IPV6_MIP6
+	mip6_fini();
 #endif
 	/* Cleanup code parts. */
 	sit_cleanup();
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
new file mode 100644
index 00000000000..63e548b6f81
--- /dev/null
+++ b/net/ipv6/mip6.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+/*
+ * Authors:
+ *	Noriaki TAKAMIYA @USAGI
+ *	Masahide NAKAMURA @USAGI
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/xfrm.h>
+#include <net/mip6.h>
+
+static xfrm_address_t *mip6_xfrm_addr(struct xfrm_state *x, xfrm_address_t *addr)
+{
+	return x->coaddr;
+}
+
+static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
+
+	if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		return -ENOENT;
+
+	return rt2->rt_hdr.nexthdr;
+}
+
+/* Routing Header type 2 is inserted.
+ * IP Header's dst address is replaced with Routing Header's Home Address.
+ */
+static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct rt2_hdr *rt2;
+	u8 nexthdr;
+
+	iph = (struct ipv6hdr *)skb->data;
+	iph->payload_len = htons(skb->len - sizeof(*iph));
+
+	nexthdr = *skb->nh.raw;
+	*skb->nh.raw = IPPROTO_ROUTING;
+
+	rt2 = (struct rt2_hdr *)skb->h.raw;
+	rt2->rt_hdr.nexthdr = nexthdr;
+	rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1;
+	rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2;
+	rt2->rt_hdr.segments_left = 1;
+	memset(&rt2->reserved, 0, sizeof(rt2->reserved));
+
+	BUG_TRAP(rt2->rt_hdr.hdrlen == 2);
+
+	memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr));
+	memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+
+	return 0;
+}
+
+static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
+			     u8 **nexthdr)
+{
+	u16 offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
+	unsigned int packet_len = skb->tail - skb->nh.raw;
+	int found_rhdr = 0;
+
+	*nexthdr = &skb->nh.ipv6h->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			if (offset + 3 <= packet_len) {
+				struct ipv6_rt_hdr *rt;
+				rt = (struct ipv6_rt_hdr *)(skb->nh.raw + offset);
+				if (rt->type != 0)
+					return offset;
+			}
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+				return offset;
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+		exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
+	}
+
+	return offset;
+}
+
+static int mip6_rthdr_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	x->props.header_len = sizeof(struct rt2_hdr);
+
+	return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for routing
+ * header type 2 unlike IPsec protocols.
+ */
+static void mip6_rthdr_destroy(struct xfrm_state *x)
+{
+}
+
+static struct xfrm_type mip6_rthdr_type =
+{
+	.description	= "MIP6RT",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ROUTING,
+	.flags		= XFRM_TYPE_NON_FRAGMENT,
+	.init_state	= mip6_rthdr_init_state,
+	.destructor	= mip6_rthdr_destroy,
+	.input		= mip6_rthdr_input,
+	.output		= mip6_rthdr_output,
+	.hdr_offset	= mip6_rthdr_offset,
+	.remote_addr	= mip6_xfrm_addr,
+};
+
+int __init mip6_init(void)
+{
+	printk(KERN_INFO "Mobile IPv6\n");
+
+	if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __FUNCTION__);
+		goto mip6_rthdr_xfrm_fail;
+	}
+	return 0;
+
+ mip6_rthdr_xfrm_fail:
+	return -EAGAIN;
+}
+
+void __exit mip6_fini(void)
+{
+	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __FUNCTION__);
+}
-- 
cgit v1.2.3-70-g09d2


From 3d126890dd67beffec27c1b6f51c040fc8d0b526 Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Wed, 23 Aug 2006 20:32:34 -0700
Subject: [IPV6] MIP6: Add destination options header transformation.

Add destination options header transformation for Mobile IPv6.
Based on MIPL2 kernel patch.

This patch was also written by: Ville Nuorvala <vnuorval@tcs.hut.fi>

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mip6.h |   3 +
 net/ipv6/mip6.c    | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+)

(limited to 'include')

diff --git a/include/net/mip6.h b/include/net/mip6.h
index 644b8b67304..42b65bace12 100644
--- a/include/net/mip6.h
+++ b/include/net/mip6.h
@@ -25,6 +25,9 @@
 #ifndef _NET_MIP6_H
 #define _NET_MIP6_H
 
+#define MIP6_OPT_PAD_1	0
+#define MIP6_OPT_PAD_N	1
+
 extern int mip6_init(void);
 extern void mip6_fini(void);
 
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 63e548b6f81..a8adf891fe0 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -35,6 +35,165 @@ static xfrm_address_t *mip6_xfrm_addr(struct xfrm_state *x, xfrm_address_t *addr
 	return x->coaddr;
 }
 
+static inline unsigned int calc_padlen(unsigned int len, unsigned int n)
+{
+	return (n - len + 16) & 0x7;
+}
+
+static inline void *mip6_padn(__u8 *data, __u8 padlen)
+{
+	if (!data)
+		return NULL;
+	if (padlen == 1) {
+		data[0] = MIP6_OPT_PAD_1;
+	} else if (padlen > 1) {
+		data[0] = MIP6_OPT_PAD_N;
+		data[1] = padlen - 2;
+		if (padlen > 2)
+			memset(data+2, 0, data[1]);
+	}
+	return data + padlen;
+}
+
+static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph = skb->nh.ipv6h;
+	struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data;
+
+	if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		return -ENOENT;
+
+	return destopt->nexthdr;
+}
+
+/* Destination Option Header is inserted.
+ * IP Header's src address is replaced with Home Address Option in
+ * Destination Option Header.
+ */
+static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct ipv6_destopt_hdr *dstopt;
+	struct ipv6_destopt_hao *hao;
+	u8 nexthdr;
+	int len;
+
+	iph = (struct ipv6hdr *)skb->data;
+	iph->payload_len = htons(skb->len - sizeof(*iph));
+
+	nexthdr = *skb->nh.raw;
+	*skb->nh.raw = IPPROTO_DSTOPTS;
+
+	dstopt = (struct ipv6_destopt_hdr *)skb->h.raw;
+	dstopt->nexthdr = nexthdr;
+
+	hao = mip6_padn((char *)(dstopt + 1),
+			calc_padlen(sizeof(*dstopt), 6));
+
+	hao->type = IPV6_TLV_HAO;
+	hao->length = sizeof(*hao) - 2;
+	BUG_TRAP(hao->length == 16);
+
+	len = ((char *)hao - (char *)dstopt) + sizeof(*hao);
+
+	memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr));
+	memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+
+	BUG_TRAP(len == x->props.header_len);
+	dstopt->hdrlen = (x->props.header_len >> 3) - 1;
+
+	return 0;
+}
+
+static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
+			       u8 **nexthdr)
+{
+	u16 offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
+	unsigned int packet_len = skb->tail - skb->nh.raw;
+	int found_rhdr = 0;
+
+	*nexthdr = &skb->nh.ipv6h->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			/*
+			 * HAO MUST NOT appear more than once.
+			 * XXX: It is better to try to find by the end of
+			 * XXX: packet if HAO exists.
+			 */
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
+				LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n");
+				return offset;
+			}
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+		exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
+	}
+
+	return offset;
+}
+
+static int mip6_destopt_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __FUNCTION__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __FUNCTION__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	x->props.header_len = sizeof(struct ipv6_destopt_hdr) +
+		calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) +
+		sizeof(struct ipv6_destopt_hao);
+	BUG_TRAP(x->props.header_len == 24);
+
+	return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for
+ * destination options header unlike IPsec protocols.
+ */
+static void mip6_destopt_destroy(struct xfrm_state *x)
+{
+}
+
+static struct xfrm_type mip6_destopt_type =
+{
+	.description	= "MIP6DESTOPT",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_DSTOPTS,
+	.flags		= XFRM_TYPE_NON_FRAGMENT,
+	.init_state	= mip6_destopt_init_state,
+	.destructor	= mip6_destopt_destroy,
+	.input		= mip6_destopt_input,
+	.output		= mip6_destopt_output,
+	.hdr_offset	= mip6_destopt_offset,
+	.local_addr	= mip6_xfrm_addr,
+};
+
 static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
@@ -164,6 +323,10 @@ int __init mip6_init(void)
 {
 	printk(KERN_INFO "Mobile IPv6\n");
 
+	if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __FUNCTION__);
+		goto mip6_destopt_xfrm_fail;
+	}
 	if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
 		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __FUNCTION__);
 		goto mip6_rthdr_xfrm_fail;
@@ -171,6 +334,8 @@ int __init mip6_init(void)
 	return 0;
 
  mip6_rthdr_xfrm_fail:
+	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
+ mip6_destopt_xfrm_fail:
 	return -EAGAIN;
 }
 
@@ -178,4 +343,6 @@ void __exit mip6_fini(void)
 {
 	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
 		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __FUNCTION__);
+	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __FUNCTION__);
 }
-- 
cgit v1.2.3-70-g09d2


From 2b741653b6c824fe7520ee92b6795f11c5f24b24 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 20:34:26 -0700
Subject: [IPV6] MIP6: Add Mobility header definition.

Add Mobility header definition for Mobile IPv6.
Based on MIPL2 kernel patch.

This patch was also written by: Antti Tuominen <anttit@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in6.h |  1 +
 include/net/flow.h  |  9 +++++++++
 include/net/ipv6.h  |  1 +
 include/net/mip6.h  | 23 +++++++++++++++++++++++
 4 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/in6.h b/include/linux/in6.h
index 086ec2ac8c5..d776829b443 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -134,6 +134,7 @@ struct in6_flowlabel_req
 #define IPPROTO_ICMPV6		58	/* ICMPv6			*/
 #define IPPROTO_NONE		59	/* IPv6 no next header		*/
 #define IPPROTO_DSTOPTS		60	/* IPv6 destination options	*/
+#define IPPROTO_MH		135	/* IPv6 mobility header		*/
 
 /*
  *	IPv6 TLV options.
diff --git a/include/net/flow.h b/include/net/flow.h
index 21d988b2058..e0522914316 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -72,12 +72,21 @@ struct flowi {
 		} dnports;
 
 		__u32		spi;
+
+#ifdef CONFIG_IPV6_MIP6
+		struct {
+			__u8	type;
+		} mht;
+#endif
 	} uli_u;
 #define fl_ip_sport	uli_u.ports.sport
 #define fl_ip_dport	uli_u.ports.dport
 #define fl_icmp_type	uli_u.icmpt.type
 #define fl_icmp_code	uli_u.icmpt.code
 #define fl_ipsec_spi	uli_u.spi
+#ifdef CONFIG_IPV6_MIP6
+#define fl_mh_type	uli_u.mht.type
+#endif
 	__u32           secid;	/* used by xfrm; see secid.txt */
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8e6ec6063f8..72bf47b2a4e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -40,6 +40,7 @@
 #define NEXTHDR_ICMP		58	/* ICMP for IPv6. */
 #define NEXTHDR_NONE		59	/* No next header */
 #define NEXTHDR_DEST		60	/* Destination options header. */
+#define NEXTHDR_MOBILITY	135	/* Mobility header. */
 
 #define NEXTHDR_MAX		255
 
diff --git a/include/net/mip6.h b/include/net/mip6.h
index 42b65bace12..fd43178faac 100644
--- a/include/net/mip6.h
+++ b/include/net/mip6.h
@@ -28,6 +28,29 @@
 #define MIP6_OPT_PAD_1	0
 #define MIP6_OPT_PAD_N	1
 
+/*
+ * Mobility Header
+ */
+struct ip6_mh {
+	__u8	ip6mh_proto;
+	__u8	ip6mh_hdrlen;
+	__u8	ip6mh_type;
+	__u8	ip6mh_reserved;
+	__u16	ip6mh_cksum;
+	/* Followed by type specific messages */
+	__u8	data[0];
+} __attribute__ ((__packed__));
+
+#define IP6_MH_TYPE_BRR		0   /* Binding Refresh Request */
+#define IP6_MH_TYPE_HOTI	1   /* HOTI Message   */
+#define IP6_MH_TYPE_COTI	2   /* COTI Message  */
+#define IP6_MH_TYPE_HOT		3   /* HOT Message   */
+#define IP6_MH_TYPE_COT		4   /* COT Message  */
+#define IP6_MH_TYPE_BU		5   /* Binding Update */
+#define IP6_MH_TYPE_BACK	6   /* Binding ACK */
+#define IP6_MH_TYPE_BERROR	7   /* Binding Error */
+#define IP6_MH_TYPE_MAX		IP6_MH_TYPE_BERROR
+
 extern int mip6_init(void);
 extern void mip6_fini(void);
 
-- 
cgit v1.2.3-70-g09d2


From 7be96f7628469e56f91d51f13b03e9bcff113c7f Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 20:35:31 -0700
Subject: [IPV6] MIP6: Add receiving mobility header functions through raw
 socket.

Like ICMPv6, mobility header is handled through raw socket.
In inbound case, check only whether ICMPv6 error should be sent as a reply
or not by kernel.
Based on MIPL2 kernel patch.

This patch was also written by: Ville Nuorvala <vnuorval@tcs.hut.fi>
This patch was also written by: Antti Tuominen <anttit@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mip6.h |  4 +++
 net/ipv6/mip6.c    | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/raw.c     | 29 ++++++++++++++++++-
 3 files changed, 115 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mip6.h b/include/net/mip6.h
index fd43178faac..68263c6d999 100644
--- a/include/net/mip6.h
+++ b/include/net/mip6.h
@@ -25,6 +25,9 @@
 #ifndef _NET_MIP6_H
 #define _NET_MIP6_H
 
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
 #define MIP6_OPT_PAD_1	0
 #define MIP6_OPT_PAD_N	1
 
@@ -53,5 +56,6 @@ struct ip6_mh {
 
 extern int mip6_init(void);
 extern void mip6_fini(void);
+extern int mip6_mh_filter(struct sock *sk, struct sk_buff *skb);
 
 #endif
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index a8adf891fe0..7b5f8932148 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -26,7 +26,10 @@
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <net/sock.h>
 #include <net/ipv6.h>
+#include <net/ip6_checksum.h>
 #include <net/xfrm.h>
 #include <net/mip6.h>
 
@@ -55,6 +58,86 @@ static inline void *mip6_padn(__u8 *data, __u8 padlen)
 	return data + padlen;
 }
 
+static inline void mip6_param_prob(struct sk_buff *skb, int code, int pos)
+{
+	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev);
+}
+
+static int mip6_mh_len(int type)
+{
+	int len = 0;
+
+	switch (type) {
+	case IP6_MH_TYPE_BRR:
+		len = 0;
+		break;
+	case IP6_MH_TYPE_HOTI:
+	case IP6_MH_TYPE_COTI:
+	case IP6_MH_TYPE_BU:
+	case IP6_MH_TYPE_BACK:
+		len = 1;
+		break;
+	case IP6_MH_TYPE_HOT:
+	case IP6_MH_TYPE_COT:
+	case IP6_MH_TYPE_BERROR:
+		len = 2;
+		break;
+	}
+	return len;
+}
+
+int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct ip6_mh *mh;
+	int mhlen;
+
+	if (!pskb_may_pull(skb, (skb->h.raw - skb->data) + 8) ||
+	    !pskb_may_pull(skb, (skb->h.raw - skb->data) + ((skb->h.raw[1] + 1) << 3)))
+		return -1;
+
+	mh = (struct ip6_mh *)skb->h.raw;
+
+	if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n",
+			       mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type));
+		mip6_param_prob(skb, 0, (&mh->ip6mh_hdrlen) - skb->nh.raw);
+		return -1;
+	}
+	mhlen = (mh->ip6mh_hdrlen + 1) << 3;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+				    &skb->nh.ipv6h->daddr,
+				    mhlen, IPPROTO_MH,
+				    skb->csum)) {
+			LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH hw checksum failed\n");
+			skb->ip_summed = CHECKSUM_NONE;
+		}
+	}
+	if (skb->ip_summed == CHECKSUM_NONE) {
+		if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
+				    &skb->nh.ipv6h->daddr,
+				    mhlen, IPPROTO_MH,
+				    skb_checksum(skb, 0, mhlen, 0))) {
+			LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
+				       NIP6(skb->nh.ipv6h->saddr),
+				       NIP6(skb->nh.ipv6h->daddr));
+			return -1;
+		}
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	if (mh->ip6mh_proto != IPPROTO_NONE) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n",
+			       mh->ip6mh_proto);
+		mip6_param_prob(skb, 0, (&mh->ip6mh_proto) - skb->nh.raw);
+		return -1;
+	}
+
+	return 0;
+}
+
 static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct ipv6hdr *iph = skb->nh.ipv6h;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index d4af1cb5e19..ecca8aae3c4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -50,6 +50,9 @@
 #include <net/udp.h>
 #include <net/inet_common.h>
 #include <net/tcp_states.h>
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
 
 #include <net/rawv6.h>
 #include <net/xfrm.h>
@@ -169,8 +172,32 @@ int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 	sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
 
 	while (sk) {
+		int filtered;
+
 		delivered = 1;
-		if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) {
+		switch (nexthdr) {
+		case IPPROTO_ICMPV6:
+			filtered = icmpv6_filter(sk, skb);
+			break;
+#ifdef CONFIG_IPV6_MIP6
+		case IPPROTO_MH:
+			/* XXX: To validate MH only once for each packet,
+			 * this is placed here. It should be after checking
+			 * xfrm policy, however it doesn't. The checking xfrm
+			 * policy is placed in rawv6_rcv() because it is
+			 * required for each socket.
+			 */
+			filtered = mip6_mh_filter(sk, skb);
+			break;
+#endif
+		default:
+			filtered = 0;
+			break;
+		}
+
+		if (filtered < 0)
+			break;
+		if (filtered == 0) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 
 			/* Not releasing hash table! */
-- 
cgit v1.2.3-70-g09d2


From 2ce4272a699c731b9736d76126dc742353e381db Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 20:39:03 -0700
Subject: [IPV6] MIP6: Transformation support mobility header.

Transformation support mobility header.
Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      |  5 +++++
 net/ipv6/xfrm6_policy.c | 15 +++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 7f1630630dc..13488e7ba68 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -546,6 +546,11 @@ u16 xfrm_flowi_sport(struct flowi *fl)
 	case IPPROTO_ICMPV6:
 		port = htons(fl->fl_icmp_type);
 		break;
+#ifdef CONFIG_IPV6_MIP6
+	case IPPROTO_MH:
+		port = htons(fl->fl_mh_type);
+		break;
+#endif
 	default:
 		port = 0;	/*XXX*/
 	}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 729b4748d6d..98c2fe449b3 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -18,6 +18,9 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
 
 static struct dst_ops xfrm6_dst_ops;
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
@@ -270,6 +273,18 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl)
 			fl->proto = nexthdr;
 			return;
 
+#ifdef CONFIG_IPV6_MIP6
+		case IPPROTO_MH:
+			if (pskb_may_pull(skb, skb->nh.raw + offset + 3 - skb->data)) {
+				struct ip6_mh *mh;
+				mh = (struct ip6_mh *)exthdr;
+
+				fl->fl_mh_type = mh->ip6mh_type;
+			}
+			fl->proto = nexthdr;
+			return;
+#endif
+
 		/* XXX Why are there these headers? */
 		case IPPROTO_AH:
 		case IPPROTO_ESP:
-- 
cgit v1.2.3-70-g09d2


From df0ba92a99ca757039dfa84a929281ea3f7a50e8 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 20:41:00 -0700
Subject: [XFRM]: Trace which secpath state is reject factor.

For Mobile IPv6 usage, it is required to trace which secpath state is
reject factor in order to notify it to user space (to know the address
which cannot be used route optimized communication).

Based on MIPL2 kernel patch.

This patch was also written by: Henrik Petander <petander@tcs.hut.fi>

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  1 +
 net/xfrm/xfrm_policy.c | 55 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 13488e7ba68..9ebbdc1dd47 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -273,6 +273,7 @@ struct xfrm_type
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
+	int			(*reject)(struct xfrm_state *, struct sk_buff *, struct flowi *);
 	int			(*hdr_offset)(struct xfrm_state *, struct sk_buff *, u8 **);
 	xfrm_address_t		*(*local_addr)(struct xfrm_state *, xfrm_address_t *);
 	xfrm_address_t		*(*remote_addr)(struct xfrm_state *, xfrm_address_t *);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ad2a5cba1f5..d125a264903 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -988,6 +988,23 @@ error:
 }
 EXPORT_SYMBOL(xfrm_lookup);
 
+static inline int
+xfrm_secpath_reject(int idx, struct sk_buff *skb, struct flowi *fl)
+{
+	struct xfrm_state *x;
+	int err;
+
+	if (!skb->sp || idx < 0 || idx >= skb->sp->len)
+		return 0;
+	x = skb->sp->xvec[idx];
+	if (!x->type->reject)
+		return 0;
+	xfrm_state_hold(x);
+	err = x->type->reject(x, skb, fl);
+	xfrm_state_put(x);
+	return err;
+}
+
 /* When skb is transformed back to its "native" form, we have to
  * check policy restrictions. At the moment we make this in maximally
  * stupid way. Shame on me. :-) Of course, connected sockets must
@@ -1010,6 +1027,13 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x,
 		  xfrm_state_addr_cmp(tmpl, x, family));
 }
 
+/*
+ * 0 or more than 0 is returned when validation is succeeded (either bypass
+ * because of optional transport mode, or next index of the mathced secpath
+ * state with the template.
+ * -1 is returned when no matching template is found.
+ * Otherwise "-2 - errored_index" is returned.
+ */
 static inline int
 xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
 	       unsigned short family)
@@ -1024,8 +1048,11 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start,
 	for (; idx < sp->len; idx++) {
 		if (xfrm_state_ok(tmpl, sp->xvec[idx], family))
 			return ++idx;
-		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT)
+		if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) {
+			if (start == -1)
+				start = -2-idx;
 			break;
+		}
 	}
 	return start;
 }
@@ -1046,11 +1073,14 @@ xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family
 }
 EXPORT_SYMBOL(xfrm_decode_session);
 
-static inline int secpath_has_nontransport(struct sec_path *sp, int k)
+static inline int secpath_has_nontransport(struct sec_path *sp, int k, int *idxp)
 {
 	for (; k < sp->len; k++) {
-		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT)
+		if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) {
+			if (idxp)
+				*idxp = k;
 			return 1;
+		}
 	}
 
 	return 0;
@@ -1062,6 +1092,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	struct xfrm_policy *pol;
 	struct flowi fl;
 	u8 fl_dir = policy_to_flow_dir(dir);
+	int xerr_idx = -1;
+	int *xerr_idxp = &xerr_idx;
 
 	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
@@ -1086,8 +1118,13 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		pol = flow_cache_lookup(&fl, family, fl_dir,
 					xfrm_policy_lookup);
 
-	if (!pol)
-		return !skb->sp || !secpath_has_nontransport(skb->sp, 0);
+	if (!pol) {
+		if (skb->sp && secpath_has_nontransport(skb->sp, 0, xerr_idxp)) {
+			xfrm_secpath_reject(xerr_idx, skb, &fl);
+			return 0;
+		}
+		return 1;
+	}
 
 	pol->curlft.use_time = (unsigned long)xtime.tv_sec;
 
@@ -1107,11 +1144,14 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		 */
 		for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
 			k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
-			if (k < 0)
+			if (k < 0) {
+				if (k < -1 && xerr_idxp)
+					*xerr_idxp = -(2+k);
 				goto reject;
+			}
 		}
 
-		if (secpath_has_nontransport(sp, k))
+		if (secpath_has_nontransport(sp, k, xerr_idxp))
 			goto reject;
 
 		xfrm_pol_put(pol);
@@ -1119,6 +1159,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	}
 
 reject:
+	xfrm_secpath_reject(xerr_idx, skb, &fl);
 	xfrm_pol_put(pol);
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 97a64b4577ae2bc5599dbd008a3cd9e25de9b9f5 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 20:44:06 -0700
Subject: [XFRM]: Introduce XFRM_MSG_REPORT.

XFRM_MSG_REPORT is a message as notification of state protocol and
selector from kernel to user-space.

Mobile IPv6 will use it when inbound reject is occurred at route
optimization to make user-space know a binding error requirement.

Based on MIPL2 kernel patch.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h  | 12 ++++++++++++
 include/net/xfrm.h    |  2 ++
 net/xfrm/xfrm_state.c | 19 +++++++++++++++++++
 net/xfrm/xfrm_user.c  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 1d8c1f22c12..4009f4445fa 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -166,6 +166,10 @@ enum {
 #define XFRM_MSG_NEWAE XFRM_MSG_NEWAE
 	XFRM_MSG_GETAE,
 #define XFRM_MSG_GETAE XFRM_MSG_GETAE
+
+	XFRM_MSG_REPORT,
+#define XFRM_MSG_REPORT XFRM_MSG_REPORT
+
 	__XFRM_MSG_MAX
 };
 #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -325,12 +329,18 @@ struct xfrm_usersa_flush {
 	__u8				proto;
 };
 
+struct xfrm_user_report {
+	__u8				proto;
+	struct xfrm_selector		sel;
+};
+
 #ifndef __KERNEL__
 /* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
 #define XFRMGRP_EXPIRE		2
 #define XFRMGRP_SA		4
 #define XFRMGRP_POLICY		8
+#define XFRMGRP_REPORT		0x10
 #endif
 
 enum xfrm_nlgroups {
@@ -346,6 +356,8 @@ enum xfrm_nlgroups {
 #define XFRMNLGRP_POLICY	XFRMNLGRP_POLICY
 	XFRMNLGRP_AEVENTS,
 #define XFRMNLGRP_AEVENTS	XFRMNLGRP_AEVENTS
+	XFRMNLGRP_REPORT,
+#define XFRMNLGRP_REPORT	XFRMNLGRP_REPORT
 	__XFRMNLGRP_MAX
 };
 #define XFRMNLGRP_MAX	(__XFRMNLGRP_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9ebbdc1dd47..0b223eed4c9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -381,6 +381,7 @@ struct xfrm_mgr
 	struct xfrm_policy	*(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir);
 	int			(*new_mapping)(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
 	int			(*notify_policy)(struct xfrm_policy *x, int dir, struct km_event *c);
+	int			(*report)(u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr);
 };
 
 extern int xfrm_register_km(struct xfrm_mgr *km);
@@ -1043,6 +1044,7 @@ extern void xfrm_init_pmtu(struct dst_entry *dst);
 extern wait_queue_head_t km_waitq;
 extern int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport);
 extern void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid);
+extern int km_report(u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr);
 
 extern void xfrm_input_init(void);
 extern int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 3da89c01ea7..a26ef6952c3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1055,6 +1055,25 @@ void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid)
 }
 EXPORT_SYMBOL(km_policy_expired);
 
+int km_report(u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+	int err = -EINVAL;
+	int ret;
+	struct xfrm_mgr *km;
+
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list) {
+		if (km->report) {
+			ret = km->report(proto, sel, addr);
+			if (!ret)
+				err = ret;
+		}
+	}
+	read_unlock(&xfrm_km_lock);
+	return err;
+}
+EXPORT_SYMBOL(km_report);
+
 int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen)
 {
 	int err;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 770bd241074..7303b820bea 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1491,6 +1491,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
 	[XFRM_MSG_FLUSHPOLICY - XFRM_MSG_BASE] = NLMSG_LENGTH(0),
 	[XFRM_MSG_NEWAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
 	[XFRM_MSG_GETAE       - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
+	[XFRM_MSG_REPORT      - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
 };
 
 #undef XMSGSIZE
@@ -2058,12 +2059,57 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_ev
 
 }
 
+static int build_report(struct sk_buff *skb, u8 proto,
+			struct xfrm_selector *sel, xfrm_address_t *addr)
+{
+	struct xfrm_user_report *ur;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_REPORT, sizeof(*ur));
+	ur = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = 0;
+
+	ur->proto = proto;
+	memcpy(&ur->sel, sel, sizeof(ur->sel));
+
+	if (addr)
+		RTA_PUT(skb, XFRMA_COADDR, sizeof(*addr), addr);
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int xfrm_send_report(u8 proto, struct xfrm_selector *sel,
+			    xfrm_address_t *addr)
+{
+	struct sk_buff *skb;
+	size_t len;
+
+	len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(struct xfrm_user_report)));
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (build_report(skb, proto, sel, addr) < 0)
+		BUG();
+
+	NETLINK_CB(skb).dst_group = XFRMNLGRP_REPORT;
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_REPORT, GFP_ATOMIC);
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id		= "netlink",
 	.notify		= xfrm_send_state_notify,
 	.acquire	= xfrm_send_acquire,
 	.compile_policy	= xfrm_compile_policy,
 	.notify_policy	= xfrm_send_policy_notify,
+	.report		= xfrm_send_report,
 };
 
 static int __init xfrm_user_init(void)
-- 
cgit v1.2.3-70-g09d2


From 4e81bb8336a0ac50289d4d4c7a55e559b994ee8f Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 22:43:30 -0700
Subject: [XFRM] POLICY: sub policy support.

Sub policy is introduced. Main and sub policy are applied the same flow.
(Policy that current kernel uses is named as main.)
It is required another transformation policy management to keep IPsec
and Mobile IPv6 lives separate.
Policy which lives shorter time in kernel should be a sub i.e. normally
main is for IPsec and sub is for Mobile IPv6.
(Such usage as two IPsec policies on different database can be used, too.)

Limitation or TODOs:
 - Sub policy is not supported for per socket one (it is always inserted as main).
 - Current kernel makes cached outbound with flowi to skip searching database.
   However this patch makes it disabled only when "two policies are used and
   the first matched one is bypass case" because neither flowi nor bundle
   information knows about transformation template size.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/linux/xfrm.h   |   7 ++
 include/net/xfrm.h     |  45 +++++++--
 net/xfrm/xfrm_policy.c | 252 ++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 260 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 4009f4445fa..492fb981874 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -102,6 +102,13 @@ struct xfrm_stats {
 	__u32	integrity_failed;
 };
 
+enum
+{
+	XFRM_POLICY_TYPE_MAIN	= 0,
+	XFRM_POLICY_TYPE_SUB	= 1,
+	XFRM_POLICY_TYPE_MAX	= 2
+};
+
 enum
 {
 	XFRM_POLICY_IN	= 0,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0b223eed4c9..4655ca25f80 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -341,6 +341,7 @@ struct xfrm_policy
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	u8			type;
 	u32			priority;
 	u32			index;
 	struct xfrm_selector	selector;
@@ -389,6 +390,19 @@ extern int xfrm_unregister_km(struct xfrm_mgr *km);
 
 
 extern struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
+#ifdef CONFIG_XFRM_SUB_POLICY
+extern struct xfrm_policy *xfrm_policy_list_sub[XFRM_POLICY_MAX*2];
+
+static inline int xfrm_policy_lists_empty(int dir)
+{
+	return (!xfrm_policy_list[dir] && !xfrm_policy_list_sub[dir]);
+}
+#else
+static inline int xfrm_policy_lists_empty(int dir)
+{
+	return (!xfrm_policy_list[dir]);
+}
+#endif
 
 static inline void xfrm_pol_hold(struct xfrm_policy *policy)
 {
@@ -404,6 +418,20 @@ static inline void xfrm_pol_put(struct xfrm_policy *policy)
 		__xfrm_policy_destroy(policy);
 }
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
+{
+	int i;
+	for (i = npols - 1; i >= 0; --i)
+		xfrm_pol_put(pols[i]);
+}
+#else
+static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
+{
+	xfrm_pol_put(pols[0]);
+}
+#endif
+
 #define XFRM_DST_HSIZE		1024
 
 static __inline__
@@ -737,8 +765,8 @@ static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *sk
 {
 	if (sk && sk->sk_policy[XFRM_POLICY_IN])
 		return __xfrm_policy_check(sk, dir, skb, family);
-		
-	return	(!xfrm_policy_list[dir] && !skb->sp) ||
+
+	return	(xfrm_policy_lists_empty(dir) && !skb->sp) ||
 		(skb->dst->flags & DST_NOPOLICY) ||
 		__xfrm_policy_check(sk, dir, skb, family);
 }
@@ -758,7 +786,7 @@ extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
 
 static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
-	return	!xfrm_policy_list[XFRM_POLICY_OUT] ||
+	return	xfrm_policy_lists_empty(XFRM_POLICY_OUT) ||
 		(skb->dst->flags & DST_NOXFRM) ||
 		__xfrm_route_forward(skb, family);
 }
@@ -1023,18 +1051,19 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
 #endif
 
 struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
-extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
+extern int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
-struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
+					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete);
-struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete);
-void xfrm_policy_flush(void);
+struct xfrm_policy *xfrm_policy_byid(u8, int dir, u32 id, int delete);
+void xfrm_policy_flush(u8 type);
 u32 xfrm_get_acqseq(void);
 void xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi);
 struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto, 
 				  xfrm_address_t *daddr, xfrm_address_t *saddr, 
 				  int create, unsigned short family);
-extern void xfrm_policy_flush(void);
+extern void xfrm_policy_flush(u8 type);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_flush_bundles(void);
 extern void xfrm_flush_all_bundles(void);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d125a264903..96de6c76ed5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -32,6 +32,24 @@ static DEFINE_RWLOCK(xfrm_policy_lock);
 
 struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
 EXPORT_SYMBOL(xfrm_policy_list);
+#ifdef CONFIG_XFRM_SUB_POLICY
+struct xfrm_policy *xfrm_policy_list_sub[XFRM_POLICY_MAX*2];
+EXPORT_SYMBOL(xfrm_policy_list_sub);
+
+#define XFRM_POLICY_LISTS(type) \
+	((type == XFRM_POLICY_TYPE_SUB) ? xfrm_policy_list_sub : \
+	 xfrm_policy_list)
+#define XFRM_POLICY_LISTHEAD(type, dir) \
+	((type == XFRM_POLICY_TYPE_SUB) ? xfrm_policy_list_sub[dir] : \
+	 xfrm_policy_list[dir])
+#define XFRM_POLICY_LISTHEADP(type, dir) \
+	((type == XFRM_POLICY_TYPE_SUB) ? &xfrm_policy_list_sub[dir] : \
+	 &xfrm_policy_list[dir])
+#else
+#define XFRM_POLICY_LISTS(type)              xfrm_policy_list
+#define XFRM_POLICY_LISTHEAD(type, dif)      xfrm_policy_list[dir]
+#define XFRM_POLICY_LISTHEADP(type, dif)     &xfrm_policy_list[dir]
+#endif
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
@@ -397,7 +415,7 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 
 /* Generate new index... KAME seems to generate them ordered by cost
  * of an absolute inpredictability of ordering of rules. This will not pass. */
-static u32 xfrm_gen_index(int dir)
+static u32 xfrm_gen_index(u8 type, int dir)
 {
 	u32 idx;
 	struct xfrm_policy *p;
@@ -408,7 +426,7 @@ static u32 xfrm_gen_index(int dir)
 		idx_generator += 8;
 		if (idx == 0)
 			idx = 8;
-		for (p = xfrm_policy_list[dir]; p; p = p->next) {
+		for (p = XFRM_POLICY_LISTHEAD(type, dir); p; p = p->next) {
 			if (p->index == idx)
 				break;
 		}
@@ -425,7 +443,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct dst_entry *gc_list;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) {
+	for (p = XFRM_POLICY_LISTHEADP(policy->type, dir); (pol=*p)!=NULL;) {
 		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
 		    xfrm_sec_ctx_match(pol->security, policy->security)) {
 			if (excl) {
@@ -452,7 +470,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	policy->next = *p;
 	*p = policy;
 	atomic_inc(&flow_cache_genid);
-	policy->index = delpol ? delpol->index : xfrm_gen_index(dir);
+	policy->index = delpol ? delpol->index : xfrm_gen_index(policy->type, dir);
 	policy->curlft.add_time = (unsigned long)xtime.tv_sec;
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
@@ -493,13 +511,14 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 }
 EXPORT_SYMBOL(xfrm_policy_insert);
 
-struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
+					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete)
 {
 	struct xfrm_policy *pol, **p;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
+	for (p = XFRM_POLICY_LISTHEADP(type, dir); (pol=*p)!=NULL; p = &pol->next) {
 		if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
 		    (xfrm_sec_ctx_match(ctx, pol->security))) {
 			xfrm_pol_hold(pol);
@@ -518,12 +537,12 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel,
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
-struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
+struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete)
 {
 	struct xfrm_policy *pol, **p;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) {
+	for (p = XFRM_POLICY_LISTHEADP(type, dir); (pol=*p)!=NULL; p = &pol->next) {
 		if (pol->index == id) {
 			xfrm_pol_hold(pol);
 			if (delete)
@@ -541,15 +560,16 @@ struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete)
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
 
-void xfrm_policy_flush(void)
+void xfrm_policy_flush(u8 type)
 {
 	struct xfrm_policy *xp;
+	struct xfrm_policy **p_list = XFRM_POLICY_LISTS(type);
 	int dir;
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		while ((xp = xfrm_policy_list[dir]) != NULL) {
-			xfrm_policy_list[dir] = xp->next;
+		while ((xp = p_list[dir]) != NULL) {
+			p_list[dir] = xp->next;
 			write_unlock_bh(&xfrm_policy_lock);
 
 			xfrm_policy_kill(xp);
@@ -562,7 +582,7 @@ void xfrm_policy_flush(void)
 }
 EXPORT_SYMBOL(xfrm_policy_flush);
 
-int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
+int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*),
 		     void *data)
 {
 	struct xfrm_policy *xp;
@@ -572,7 +592,7 @@ int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
 
 	read_lock_bh(&xfrm_policy_lock);
 	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
-		for (xp = xfrm_policy_list[dir]; xp; xp = xp->next)
+		for (xp = XFRM_POLICY_LISTHEAD(type, dir); xp; xp = xp->next)
 			count++;
 	}
 
@@ -582,7 +602,7 @@ int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*),
 	}
 
 	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
-		for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) {
+		for (xp = XFRM_POLICY_LISTHEAD(type, dir); xp; xp = xp->next) {
 			error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
 			if (error)
 				goto out;
@@ -597,13 +617,13 @@ EXPORT_SYMBOL(xfrm_policy_walk);
 
 /* Find policy to apply to this flow. */
 
-static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
-			       void **objp, atomic_t **obj_refp)
+static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl,
+						     u16 family, u8 dir)
 {
 	struct xfrm_policy *pol;
 
 	read_lock_bh(&xfrm_policy_lock);
-	for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) {
+	for (pol = XFRM_POLICY_LISTHEAD(type, dir); pol; pol = pol->next) {
 		struct xfrm_selector *sel = &pol->selector;
 		int match;
 
@@ -620,6 +640,25 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 		}
 	}
 	read_unlock_bh(&xfrm_policy_lock);
+
+	return pol;
+}
+
+static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
+			       void **objp, atomic_t **obj_refp)
+{
+	struct xfrm_policy *pol;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_SUB, fl, family, dir);
+	if (pol)
+		goto end;
+#endif
+	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+ end:
+#endif
 	if ((*objp = (void *) pol) != NULL)
 		*obj_refp = &pol->refcnt;
 }
@@ -665,8 +704,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc
 
 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
 {
-	pol->next = xfrm_policy_list[dir];
-	xfrm_policy_list[dir] = pol;
+	struct xfrm_policy **p_list = XFRM_POLICY_LISTS(pol->type);
+
+	pol->next = p_list[dir];
+	p_list[dir] = pol;
 	xfrm_pol_hold(pol);
 }
 
@@ -675,7 +716,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 {
 	struct xfrm_policy **polp;
 
-	for (polp = &xfrm_policy_list[dir];
+	for (polp = XFRM_POLICY_LISTHEADP(pol->type, dir);
 	     *polp != NULL; polp = &(*polp)->next) {
 		if (*polp == pol) {
 			*polp = pol->next;
@@ -704,12 +745,17 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 {
 	struct xfrm_policy *old_pol;
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pol && pol->type != XFRM_POLICY_TYPE_MAIN)
+		return -EINVAL;
+#endif
+
 	write_lock_bh(&xfrm_policy_lock);
 	old_pol = sk->sk_policy[dir];
 	sk->sk_policy[dir] = pol;
 	if (pol) {
 		pol->curlft.add_time = (unsigned long)xtime.tv_sec;
-		pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir);
+		pol->index = xfrm_gen_index(pol->type, XFRM_POLICY_MAX+dir);
 		__xfrm_policy_link(pol, XFRM_POLICY_MAX+dir);
 	}
 	if (old_pol)
@@ -738,6 +784,7 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir)
 		newp->flags = old->flags;
 		newp->xfrm_nr = old->xfrm_nr;
 		newp->index = old->index;
+		newp->type = old->type;
 		memcpy(newp->xfrm_vec, old->xfrm_vec,
 		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
 		write_lock_bh(&xfrm_policy_lock);
@@ -764,9 +811,9 @@ int __xfrm_sk_clone_policy(struct sock *sk)
 /* Resolve list of templates for the flow, given policy. */
 
 static int
-xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl,
-		  struct xfrm_state **xfrm,
-		  unsigned short family)
+xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
+		      struct xfrm_state **xfrm,
+		      unsigned short family)
 {
 	int nx;
 	int i, error;
@@ -809,6 +856,38 @@ fail:
 	return error;
 }
 
+static int
+xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
+		  struct xfrm_state **xfrm,
+		  unsigned short family)
+{
+	int cnx = 0;
+	int error;
+	int ret;
+	int i;
+
+	for (i = 0; i < npols; i++) {
+		if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) {
+			error = -ENOBUFS;
+			goto fail;
+		}
+		ret = xfrm_tmpl_resolve_one(pols[i], fl, &xfrm[cnx], family);
+		if (ret < 0) {
+			error = ret;
+			goto fail;
+		} else
+			cnx += ret;
+	}
+
+	return cnx;
+
+ fail:
+	for (cnx--; cnx>=0; cnx--)
+		xfrm_state_put(xfrm[cnx]);
+	return error;
+
+}
+
 /* Check that the bundle accepts the flow and its components are
  * still valid.
  */
@@ -855,6 +934,11 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 		struct sock *sk, int flags)
 {
 	struct xfrm_policy *policy;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int npols;
+	int pol_dead;
+	int xfrm_nr;
+	int pi;
 	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
 	struct dst_entry *dst, *dst_orig = *dst_p;
 	int nx = 0;
@@ -866,12 +950,18 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
 restart:
 	genid = atomic_read(&flow_cache_genid);
 	policy = NULL;
+	for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
+		pols[pi] = NULL;
+	npols = 0;
+	pol_dead = 0;
+	xfrm_nr = 0;
+
 	if (sk && sk->sk_policy[1])
 		policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
 
 	if (!policy) {
 		/* To accelerate a bit...  */
-		if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT])
+		if ((dst_orig->flags & DST_NOXFRM) || xfrm_policy_lists_empty(XFRM_POLICY_OUT))
 			return 0;
 
 		policy = flow_cache_lookup(fl, dst_orig->ops->family,
@@ -883,6 +973,9 @@ restart:
 
 	family = dst_orig->ops->family;
 	policy->curlft.use_time = (unsigned long)xtime.tv_sec;
+	pols[0] = policy;
+	npols ++;
+	xfrm_nr += pols[0]->xfrm_nr;
 
 	switch (policy->action) {
 	case XFRM_POLICY_BLOCK:
@@ -891,11 +984,13 @@ restart:
 		goto error;
 
 	case XFRM_POLICY_ALLOW:
+#ifndef CONFIG_XFRM_SUB_POLICY
 		if (policy->xfrm_nr == 0) {
 			/* Flow passes not transformed. */
 			xfrm_pol_put(policy);
 			return 0;
 		}
+#endif
 
 		/* Try to find matching bundle.
 		 *
@@ -911,7 +1006,36 @@ restart:
 		if (dst)
 			break;
 
-		nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);
+#ifdef CONFIG_XFRM_SUB_POLICY
+		if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+			pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
+							    fl, family,
+							    XFRM_POLICY_OUT);
+			if (pols[1]) {
+				if (pols[1]->action == XFRM_POLICY_BLOCK) {
+					err = -EPERM;
+					goto error;
+				}
+				npols ++;
+				xfrm_nr += pols[1]->xfrm_nr;
+			}
+		}
+
+		/*
+		 * Because neither flowi nor bundle information knows about
+		 * transformation template size. On more than one policy usage
+		 * we can realize whether all of them is bypass or not after
+		 * they are searched. See above not-transformed bypass
+		 * is surrounded by non-sub policy configuration, too.
+		 */
+		if (xfrm_nr == 0) {
+			/* Flow passes not transformed. */
+			xfrm_pols_put(pols, npols);
+			return 0;
+		}
+
+#endif
+		nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
 
 		if (unlikely(nx<0)) {
 			err = nx;
@@ -924,7 +1048,7 @@ restart:
 				set_current_state(TASK_RUNNING);
 				remove_wait_queue(&km_waitq, &wait);
 
-				nx = xfrm_tmpl_resolve(policy, fl, xfrm, family);
+				nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
 
 				if (nx == -EAGAIN && signal_pending(current)) {
 					err = -ERESTART;
@@ -932,7 +1056,7 @@ restart:
 				}
 				if (nx == -EAGAIN ||
 				    genid != atomic_read(&flow_cache_genid)) {
-					xfrm_pol_put(policy);
+					xfrm_pols_put(pols, npols);
 					goto restart;
 				}
 				err = nx;
@@ -942,7 +1066,7 @@ restart:
 		}
 		if (nx == 0) {
 			/* Flow passes not transformed. */
-			xfrm_pol_put(policy);
+			xfrm_pols_put(pols, npols);
 			return 0;
 		}
 
@@ -956,8 +1080,14 @@ restart:
 			goto error;
 		}
 
+		for (pi = 0; pi < npols; pi++) {
+			read_lock_bh(&pols[pi]->lock);
+			pol_dead |= pols[pi]->dead;
+			read_unlock_bh(&pols[pi]->lock);
+		}
+
 		write_lock_bh(&policy->lock);
-		if (unlikely(policy->dead || stale_bundle(dst))) {
+		if (unlikely(pol_dead || stale_bundle(dst))) {
 			/* Wow! While we worked on resolving, this
 			 * policy has gone. Retry. It is not paranoia,
 			 * we just cannot enlist new bundle to dead object.
@@ -977,12 +1107,12 @@ restart:
 	}
 	*dst_p = dst;
 	dst_release(dst_orig);
-	xfrm_pol_put(policy);
+ 	xfrm_pols_put(pols, npols);
 	return 0;
 
 error:
 	dst_release(dst_orig);
-	xfrm_pol_put(policy);
+	xfrm_pols_put(pols, npols);
 	*dst_p = NULL;
 	return err;
 }
@@ -1090,6 +1220,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 			unsigned short family)
 {
 	struct xfrm_policy *pol;
+	struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+	int npols = 0;
+	int xfrm_nr;
+	int pi;
 	struct flowi fl;
 	u8 fl_dir = policy_to_flow_dir(dir);
 	int xerr_idx = -1;
@@ -1128,22 +1262,50 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 
 	pol->curlft.use_time = (unsigned long)xtime.tv_sec;
 
+	pols[0] = pol;
+	npols ++;
+#ifdef CONFIG_XFRM_SUB_POLICY
+	if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+		pols[1] = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN,
+						    &fl, family,
+						    XFRM_POLICY_IN);
+		if (pols[1]) {
+			pols[1]->curlft.use_time = (unsigned long)xtime.tv_sec;
+			npols ++;
+		}
+	}
+#endif
+
 	if (pol->action == XFRM_POLICY_ALLOW) {
 		struct sec_path *sp;
 		static struct sec_path dummy;
+		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
+		struct xfrm_tmpl **tpp = tp;
+		int ti = 0;
 		int i, k;
 
 		if ((sp = skb->sp) == NULL)
 			sp = &dummy;
 
+		for (pi = 0; pi < npols; pi++) {
+			if (pols[pi] != pol &&
+			    pols[pi]->action != XFRM_POLICY_ALLOW)
+				goto reject;
+			if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH)
+				goto reject_error;
+			for (i = 0; i < pols[pi]->xfrm_nr; i++)
+				tpp[ti++] = &pols[pi]->xfrm_vec[i];
+		}
+		xfrm_nr = ti;
+
 		/* For each tunnel xfrm, find the first matching tmpl.
 		 * For each tmpl before that, find corresponding xfrm.
 		 * Order is _important_. Later we will implement
 		 * some barriers, but at the moment barriers
 		 * are implied between each two transformations.
 		 */
-		for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) {
-			k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family);
+		for (i = xfrm_nr-1, k = 0; i >= 0; i--) {
+			k = xfrm_policy_ok(tpp[i], sp, k, family);
 			if (k < 0) {
 				if (k < -1 && xerr_idxp)
 					*xerr_idxp = -(2+k);
@@ -1154,13 +1316,14 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		if (secpath_has_nontransport(sp, k, xerr_idxp))
 			goto reject;
 
-		xfrm_pol_put(pol);
+		xfrm_pols_put(pols, npols);
 		return 1;
 	}
 
 reject:
 	xfrm_secpath_reject(xerr_idx, skb, &fl);
-	xfrm_pol_put(pol);
+reject_error:
+	xfrm_pols_put(pols, npols);
 	return 0;
 }
 EXPORT_SYMBOL(__xfrm_policy_check);
@@ -1246,6 +1409,23 @@ static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
 
 	read_lock_bh(&xfrm_policy_lock);
 	for (i=0; i<2*XFRM_POLICY_MAX; i++) {
+#ifdef CONFIG_XFRM_SUB_POLICY
+		for (pol = xfrm_policy_list_sub[i]; pol; pol = pol->next) {
+			write_lock(&pol->lock);
+			dstp = &pol->bundles;
+			while ((dst=*dstp) != NULL) {
+				if (func(dst)) {
+					*dstp = dst->next;
+					dst->next = gc_list;
+					gc_list = dst;
+				} else {
+					dstp = &dst->next;
+				}
+			}
+			write_unlock(&pol->lock);
+		}
+
+#endif
 		for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
 			write_lock(&pol->lock);
 			dstp = &pol->bundles;
-- 
cgit v1.2.3-70-g09d2


From 41a49cc3c02ace59d4dddae91ea211c330970ee3 Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 22:48:31 -0700
Subject: [XFRM]: Add sorting interface for state and template.

Under two transformation policies it is required to merge them.
This is a platform to sort state for outbound and templates
for inbound respectively.
It will be used when Mobile IPv6 and IPsec are used at the same time.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     | 20 ++++++++++++++++++++
 net/xfrm/xfrm_policy.c | 16 ++++++++++++++--
 net/xfrm/xfrm_state.c  | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4655ca25f80..d341603e4ba 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -254,6 +254,8 @@ struct xfrm_state_afinfo {
 	struct xfrm_state	*(*find_acq)(u8 mode, u32 reqid, u8 proto, 
 					     xfrm_address_t *daddr, xfrm_address_t *saddr, 
 					     int create);
+	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
+	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 };
 
 extern int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -1002,6 +1004,24 @@ extern int xfrm_state_add(struct xfrm_state *x);
 extern int xfrm_state_update(struct xfrm_state *x);
 extern struct xfrm_state *xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family);
 extern struct xfrm_state *xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family);
+#ifdef CONFIG_XFRM_SUB_POLICY
+extern int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src,
+			  int n, unsigned short family);
+extern int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src,
+			   int n, unsigned short family);
+#else
+static inline int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src,
+				 int n, unsigned short family)
+{
+	return -ENOSYS;
+}
+
+static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src,
+				  int n, unsigned short family)
+{
+	return -ENOSYS;
+}
+#endif
 extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
 extern int xfrm_state_delete(struct xfrm_state *x);
 extern void xfrm_state_flush(u8 proto);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 96de6c76ed5..1732159ffd0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -861,6 +861,8 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
 		  struct xfrm_state **xfrm,
 		  unsigned short family)
 {
+	struct xfrm_state *tp[XFRM_MAX_DEPTH];
+	struct xfrm_state **tpp = (npols > 1) ? tp : xfrm;
 	int cnx = 0;
 	int error;
 	int ret;
@@ -871,7 +873,8 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
 			error = -ENOBUFS;
 			goto fail;
 		}
-		ret = xfrm_tmpl_resolve_one(pols[i], fl, &xfrm[cnx], family);
+
+		ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family);
 		if (ret < 0) {
 			error = ret;
 			goto fail;
@@ -879,11 +882,15 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
 			cnx += ret;
 	}
 
+	/* found states are sorted for outbound processing */
+	if (npols > 1)
+		xfrm_state_sort(xfrm, tpp, cnx, family);
+
 	return cnx;
 
  fail:
 	for (cnx--; cnx>=0; cnx--)
-		xfrm_state_put(xfrm[cnx]);
+		xfrm_state_put(tpp[cnx]);
 	return error;
 
 }
@@ -1280,6 +1287,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		struct sec_path *sp;
 		static struct sec_path dummy;
 		struct xfrm_tmpl *tp[XFRM_MAX_DEPTH];
+		struct xfrm_tmpl *stp[XFRM_MAX_DEPTH];
 		struct xfrm_tmpl **tpp = tp;
 		int ti = 0;
 		int i, k;
@@ -1297,6 +1305,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 				tpp[ti++] = &pols[pi]->xfrm_vec[i];
 		}
 		xfrm_nr = ti;
+		if (npols > 1) {
+			xfrm_tmpl_sort(stp, tpp, xfrm_nr, family);
+			tpp = stp;
+		}
 
 		/* For each tunnel xfrm, find the first matching tmpl.
 		 * For each tmpl before that, find corresponding xfrm.
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a26ef6952c3..622e92a08d0 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -728,6 +728,44 @@ xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
 }
 EXPORT_SYMBOL(xfrm_find_acq);
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+int
+xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
+	       unsigned short family)
+{
+	int err = 0;
+	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		return -EAFNOSUPPORT;
+
+	spin_lock_bh(&xfrm_state_lock);
+	if (afinfo->tmpl_sort)
+		err = afinfo->tmpl_sort(dst, src, n);
+	spin_unlock_bh(&xfrm_state_lock);
+	xfrm_state_put_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_tmpl_sort);
+
+int
+xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
+		unsigned short family)
+{
+	int err = 0;
+	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		return -EAFNOSUPPORT;
+
+	spin_lock_bh(&xfrm_state_lock);
+	if (afinfo->state_sort)
+		err = afinfo->state_sort(dst, src, n);
+	spin_unlock_bh(&xfrm_state_lock);
+	xfrm_state_put_afinfo(afinfo);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_state_sort);
+#endif
+
 /* Silly enough, but I'm lazy to build resolution list */
 
 static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq)
-- 
cgit v1.2.3-70-g09d2


From f7b6983f0feeefcd2a594138adcffe640593d8de Mon Sep 17 00:00:00 2001
From: Masahide NAKAMURA <nakam@linux-ipv6.org>
Date: Wed, 23 Aug 2006 22:49:28 -0700
Subject: [XFRM] POLICY: Support netlink socket interface for sub policy.

Sub policy can be used through netlink socket.
PF_KEY uses main only and it is TODO to support sub.

Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/xfrm.h |   7 +++
 include/net/xfrm.h   |   1 +
 net/key/af_key.c     |  18 +++++--
 net/xfrm/xfrm_user.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 142 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 492fb981874..14ecd19f4cd 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -230,6 +230,12 @@ enum xfrm_ae_ftype_t {
 #define XFRM_AE_MAX (__XFRM_AE_MAX - 1)
 };
 
+struct xfrm_userpolicy_type {
+	__u8		type;
+	__u16		reserved1;
+	__u8		reserved2;
+};
+
 /* Netlink message attributes.  */
 enum xfrm_attr_type_t {
 	XFRMA_UNSPEC,
@@ -248,6 +254,7 @@ enum xfrm_attr_type_t {
 	XFRMA_SRCADDR,		/* xfrm_address_t */
 	XFRMA_COADDR,		/* xfrm_address_t */
 	XFRMA_LASTUSED,
+	XFRMA_POLICY_TYPE,	/* struct xfrm_userpolicy_type */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d341603e4ba..c75b3287d8f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -203,6 +203,7 @@ struct km_event
 		u32 proto;
 		u32 byid;
 		u32 aevent;
+		u32 type;
 	} data;
 
 	u32	seq;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 19e047b0e67..83b443ddc72 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1731,7 +1731,8 @@ static u32 gen_reqid(void)
 		++reqid;
 		if (reqid == 0)
 			reqid = IPSEC_MANUAL_REQID_MAX+1;
-		if (xfrm_policy_walk(check_reqid, (void*)&reqid) != -EEXIST)
+		if (xfrm_policy_walk(XFRM_POLICY_TYPE_MAIN, check_reqid,
+				     (void*)&reqid) != -EEXIST)
 			return reqid;
 	} while (reqid != start);
 	return 0;
@@ -2268,7 +2269,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 			return err;
 	}
 
-	xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1);
+	xp = xfrm_policy_bysel_ctx(XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir-1,
+				   &sel, tmp.security, 1);
 	security_xfrm_policy_free(&tmp);
 	if (xp == NULL)
 		return -ENOENT;
@@ -2330,7 +2332,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	if (dir >= XFRM_POLICY_MAX)
 		return -EINVAL;
 
-	xp = xfrm_policy_byid(dir, pol->sadb_x_policy_id,
+	xp = xfrm_policy_byid(XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id,
 			      hdr->sadb_msg_type == SADB_X_SPDDELETE2);
 	if (xp == NULL)
 		return -ENOENT;
@@ -2378,7 +2380,7 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
 {
 	struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk };
 
-	return xfrm_policy_walk(dump_sp, &data);
+	return xfrm_policy_walk(XFRM_POLICY_TYPE_MAIN, dump_sp, &data);
 }
 
 static int key_notify_policy_flush(struct km_event *c)
@@ -2405,7 +2407,8 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 {
 	struct km_event c;
 
-	xfrm_policy_flush();
+	xfrm_policy_flush(XFRM_POLICY_TYPE_MAIN);
+	c.data.type = XFRM_POLICY_TYPE_MAIN;
 	c.event = XFRM_MSG_FLUSHPOLICY;
 	c.pid = hdr->sadb_msg_pid;
 	c.seq = hdr->sadb_msg_seq;
@@ -2667,6 +2670,9 @@ static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
 
 static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
+	if (xp && xp->type != XFRM_POLICY_TYPE_MAIN)
+		return 0;
+
 	switch (c->event) {
 	case XFRM_MSG_POLEXPIRE:
 		return key_notify_policy_expire(xp, c);
@@ -2675,6 +2681,8 @@ static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_e
 	case XFRM_MSG_UPDPOLICY:
 		return key_notify_policy(xp, dir, c);
 	case XFRM_MSG_FLUSHPOLICY:
+		if (c->data.type != XFRM_POLICY_TYPE_MAIN)
+			break;
 		return key_notify_policy_flush(c);
 	default:
 		printk("pfkey: Unknown policy event %d\n", c->event);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 7303b820bea..c59a78d2923 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -786,6 +786,22 @@ static int verify_policy_dir(__u8 dir)
 	return 0;
 }
 
+static int verify_policy_type(__u8 type)
+{
+	switch (type) {
+	case XFRM_POLICY_TYPE_MAIN:
+#ifdef CONFIG_XFRM_SUB_POLICY
+	case XFRM_POLICY_TYPE_SUB:
+#endif
+		break;
+
+	default:
+		return -EINVAL;
+	};
+
+	return 0;
+}
+
 static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 {
 	switch (p->share) {
@@ -879,6 +895,29 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma)
 	return 0;
 }
 
+static int copy_from_user_policy_type(u8 *tp, struct rtattr **xfrma)
+{
+	struct rtattr *rt = xfrma[XFRMA_POLICY_TYPE-1];
+	struct xfrm_userpolicy_type *upt;
+	__u8 type = XFRM_POLICY_TYPE_MAIN;
+	int err;
+
+	if (rt) {
+		if (rt->rta_len < sizeof(*upt))
+			return -EINVAL;
+
+		upt = RTA_DATA(rt);
+		type = upt->type;
+	}
+
+	err = verify_policy_type(type);
+	if (err)
+		return err;
+
+	*tp = type;
+	return 0;
+}
+
 static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p)
 {
 	xp->priority = p->priority;
@@ -917,16 +956,20 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p,
 
 	copy_from_user_policy(xp, p);
 
+	err = copy_from_user_policy_type(&xp->type, xfrma);
+	if (err)
+		goto error;
+
 	if (!(err = copy_from_user_tmpl(xp, xfrma)))
 		err = copy_from_user_sec_ctx(xp, xfrma);
-
-	if (err) {
-		*errp = err;
-		kfree(xp);
-		xp = NULL;
-	}
+	if (err)
+		goto error;
 
 	return xp;
+ error:
+	*errp = err;
+	kfree(xp);
+	return NULL;
 }
 
 static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
@@ -1037,6 +1080,29 @@ static inline int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *s
 	return 0;
 }
 
+#ifdef CONFIG_XFRM_SUB_POLICY
+static int copy_to_user_policy_type(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	struct xfrm_userpolicy_type upt;
+
+	memset(&upt, 0, sizeof(upt));
+	upt.type = xp->type;
+
+	RTA_PUT(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
+
+	return 0;
+
+rtattr_failure:
+	return -1;
+}
+
+#else
+static inline int copy_to_user_policy_type(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	return 0;
+}
+#endif
+
 static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
 {
 	struct xfrm_dump_info *sp = ptr;
@@ -1060,6 +1126,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
 		goto nlmsg_failure;
 	if (copy_to_user_sec_ctx(xp, skb))
 		goto nlmsg_failure;
+	if (copy_to_user_policy_type(xp, skb) < 0)
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 out:
@@ -1081,7 +1149,10 @@ static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
 	info.nlmsg_flags = NLM_F_MULTI;
 	info.this_idx = 0;
 	info.start_idx = cb->args[0];
-	(void) xfrm_policy_walk(dump_one_policy, &info);
+	(void) xfrm_policy_walk(XFRM_POLICY_TYPE_MAIN, dump_one_policy, &info);
+#ifdef CONFIG_XFRM_SUB_POLICY
+	(void) xfrm_policy_walk(XFRM_POLICY_TYPE_SUB, dump_one_policy, &info);
+#endif
 	cb->args[0] = info.this_idx;
 
 	return skb->len;
@@ -1117,6 +1188,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 {
 	struct xfrm_policy *xp;
 	struct xfrm_userpolicy_id *p;
+	__u8 type = XFRM_POLICY_TYPE_MAIN;
 	int err;
 	struct km_event c;
 	int delete;
@@ -1124,12 +1196,16 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	p = NLMSG_DATA(nlh);
 	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
 
+	err = copy_from_user_policy_type(&type, (struct rtattr **)xfrma);
+	if (err)
+		return err;
+
 	err = verify_policy_dir(p->dir);
 	if (err)
 		return err;
 
 	if (p->index)
-		xp = xfrm_policy_byid(p->dir, p->index, delete);
+		xp = xfrm_policy_byid(type, p->dir, p->index, delete);
 	else {
 		struct rtattr **rtattrs = (struct rtattr **)xfrma;
 		struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
@@ -1146,7 +1222,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 			if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete);
+		xp = xfrm_policy_bysel_ctx(type, p->dir, &p->sel, tmp.security, delete);
 		security_xfrm_policy_free(&tmp);
 	}
 	if (xp == NULL)
@@ -1329,9 +1405,16 @@ out:
 
 static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
-struct km_event c;
+	struct km_event c;
+	__u8 type = XFRM_POLICY_TYPE_MAIN;
+	int err;
+
+	err = copy_from_user_policy_type(&type, (struct rtattr **)xfrma);
+	if (err)
+		return err;
 
-	xfrm_policy_flush();
+	xfrm_policy_flush(type);
+	c.data.type = type;
 	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
@@ -1344,10 +1427,15 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, void *
 	struct xfrm_policy *xp;
 	struct xfrm_user_polexpire *up = NLMSG_DATA(nlh);
 	struct xfrm_userpolicy_info *p = &up->pol;
+	__u8 type = XFRM_POLICY_TYPE_MAIN;
 	int err = -ENOENT;
 
+	err = copy_from_user_policy_type(&type, (struct rtattr **)xfrma);
+	if (err)
+		return err;
+
 	if (p->index)
-		xp = xfrm_policy_byid(p->dir, p->index, 0);
+		xp = xfrm_policy_byid(type, p->dir, p->index, 0);
 	else {
 		struct rtattr **rtattrs = (struct rtattr **)xfrma;
 		struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1];
@@ -1364,7 +1452,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, void *
 			if ((err = security_xfrm_policy_alloc(&tmp, uctx)))
 				return err;
 		}
-		xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, 0);
+		xp = xfrm_policy_bysel_ctx(type, p->dir, &p->sel, tmp.security, 0);
 		security_xfrm_policy_free(&tmp);
 	}
 
@@ -1818,6 +1906,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 		goto nlmsg_failure;
 	if (copy_to_user_state_sec_ctx(x, skb))
 		goto nlmsg_failure;
+	if (copy_to_user_policy_type(xp, skb) < 0)
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
@@ -1898,6 +1988,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 	}
 
 	copy_from_user_policy(xp, p);
+	xp->type = XFRM_POLICY_TYPE_MAIN;
 	copy_templates(xp, ut, nr);
 
 	if (!xp->security) {
@@ -1931,6 +2022,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
 		goto nlmsg_failure;
 	if (copy_to_user_sec_ctx(xp, skb))
 		goto nlmsg_failure;
+	if (copy_to_user_policy_type(xp, skb) < 0)
+		goto nlmsg_failure;
 	upe->hard = !!hard;
 
 	nlh->nlmsg_len = skb->tail - b;
@@ -2002,6 +2095,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 	copy_to_user_policy(xp, p, dir);
 	if (copy_to_user_tmpl(xp, skb) < 0)
 		goto nlmsg_failure;
+	if (copy_to_user_policy_type(xp, skb) < 0)
+		goto nlmsg_failure;
 
 	nlh->nlmsg_len = skb->tail - b;
 
@@ -2019,6 +2114,9 @@ static int xfrm_notify_policy_flush(struct km_event *c)
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned char *b;
+#ifdef CONFIG_XFRM_SUB_POLICY
+	struct xfrm_userpolicy_type upt;
+#endif
 	int len = NLMSG_LENGTH(0);
 
 	skb = alloc_skb(len, GFP_ATOMIC);
@@ -2028,6 +2126,13 @@ static int xfrm_notify_policy_flush(struct km_event *c)
 
 
 	nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0);
+	nlh->nlmsg_flags = 0;
+
+#ifdef CONFIG_XFRM_SUB_POLICY
+	memset(&upt, 0, sizeof(upt));
+	upt.type = c->data.type;
+	RTA_PUT(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
+#endif
 
 	nlh->nlmsg_len = skb->tail - b;
 
@@ -2035,6 +2140,9 @@ static int xfrm_notify_policy_flush(struct km_event *c)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMNLGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
+#ifdef CONFIG_XFRM_SUB_POLICY
+rtattr_failure:
+#endif
 	kfree_skb(skb);
 	return -1;
 }
-- 
cgit v1.2.3-70-g09d2


From 2770834c9f44afd1bfa13914c7285470775af657 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 00:13:10 -0700
Subject: [XFRM]: Pull xfrm_state_bydst hash table knowledge out of afinfo.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  16 -------
 net/ipv4/xfrm4_state.c |  53 ------------------------
 net/ipv6/xfrm6_state.c |  56 -------------------------
 net/xfrm/xfrm_state.c  | 110 ++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 100 insertions(+), 135 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c75b3287d8f..cc83443f301 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -243,7 +243,6 @@ extern int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
 	unsigned short		family;
-	struct list_head	*state_bydst;
 	struct list_head	*state_bysrc;
 	struct list_head	*state_byspi;
 	int			(*init_flags)(struct xfrm_state *x);
@@ -252,9 +251,6 @@ struct xfrm_state_afinfo {
 						xfrm_address_t *daddr, xfrm_address_t *saddr);
 	struct xfrm_state	*(*state_lookup)(xfrm_address_t *daddr, u32 spi, u8 proto);
 	struct xfrm_state	*(*state_lookup_byaddr)(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto);
-	struct xfrm_state	*(*find_acq)(u8 mode, u32 reqid, u8 proto, 
-					     xfrm_address_t *daddr, xfrm_address_t *saddr, 
-					     int create);
 	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
 	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 };
@@ -455,18 +451,6 @@ unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
 	return h;
 }
 
-static __inline__
-unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
-{
-	switch (family) {
-	case AF_INET:
-		return __xfrm4_dst_hash(addr);
-	case AF_INET6:
-		return __xfrm6_dst_hash(addr);
-	}
-	return 0;
-}
-
 static __inline__
 unsigned __xfrm4_src_hash(xfrm_address_t *addr)
 {
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 616be131b4e..9dc1afc17b6 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -88,65 +88,12 @@ __xfrm4_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	return NULL;
 }
 
-static struct xfrm_state *
-__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, 
-		 xfrm_address_t *daddr, xfrm_address_t *saddr, 
-		 int create)
-{
-	struct xfrm_state *x, *x0;
-	unsigned h = __xfrm4_dst_hash(daddr);
-
-	x0 = NULL;
-
-	list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
-		if (x->props.family == AF_INET &&
-		    daddr->a4 == x->id.daddr.a4 &&
-		    mode == x->props.mode &&
-		    proto == x->id.proto &&
-		    saddr->a4 == x->props.saddr.a4 &&
-		    reqid == x->props.reqid &&
-		    x->km.state == XFRM_STATE_ACQ &&
-		    !x->id.spi) {
-			    x0 = x;
-			    break;
-		    }
-	}
-	if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
-		x0->sel.daddr.a4 = daddr->a4;
-		x0->sel.saddr.a4 = saddr->a4;
-		x0->sel.prefixlen_d = 32;
-		x0->sel.prefixlen_s = 32;
-		x0->props.saddr.a4 = saddr->a4;
-		x0->km.state = XFRM_STATE_ACQ;
-		x0->id.daddr.a4 = daddr->a4;
-		x0->id.proto = proto;
-		x0->props.family = AF_INET;
-		x0->props.mode = mode;
-		x0->props.reqid = reqid;
-		x0->props.family = AF_INET;
-		x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
-		xfrm_state_hold(x0);
-		x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
-		add_timer(&x0->timer);
-		xfrm_state_hold(x0);
-		list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
-		h = __xfrm4_src_hash(saddr);
-		xfrm_state_hold(x0);
-		list_add_tail(&x0->bysrc, xfrm4_state_afinfo.state_bysrc+h);
-		wake_up(&km_waitq);
-	}
-	if (x0)
-		xfrm_state_hold(x0);
-	return x0;
-}
-
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
 	.state_lookup_byaddr	= __xfrm4_state_lookup_byaddr,
-	.find_acq		= __xfrm4_find_acq,
 };
 
 void __init xfrm4_state_init(void)
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 6269584e610..40fcaab7e02 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -101,61 +101,6 @@ __xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
 	return NULL;
 }
 
-static struct xfrm_state *
-__xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, 
-		 xfrm_address_t *daddr, xfrm_address_t *saddr, 
-		 int create)
-{
-	struct xfrm_state *x, *x0;
-	unsigned h = __xfrm6_dst_hash(daddr);
-
-	x0 = NULL;
-
-	list_for_each_entry(x, xfrm6_state_afinfo.state_bydst+h, bydst) {
-		if (x->props.family == AF_INET6 &&
-		    ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
-		    mode == x->props.mode &&
-		    proto == x->id.proto &&
-		    ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) &&
-		    reqid == x->props.reqid &&
-		    x->km.state == XFRM_STATE_ACQ &&
-		    !x->id.spi) {
-			    x0 = x;
-			    break;
-		    }
-	}
-	if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
-		ipv6_addr_copy((struct in6_addr *)x0->sel.daddr.a6,
-			       (struct in6_addr *)daddr);
-		ipv6_addr_copy((struct in6_addr *)x0->sel.saddr.a6,
-			       (struct in6_addr *)saddr);
-		x0->sel.prefixlen_d = 128;
-		x0->sel.prefixlen_s = 128;
-		ipv6_addr_copy((struct in6_addr *)x0->props.saddr.a6,
-			       (struct in6_addr *)saddr);
-		x0->km.state = XFRM_STATE_ACQ;
-		ipv6_addr_copy((struct in6_addr *)x0->id.daddr.a6,
-			       (struct in6_addr *)daddr);
-		x0->id.proto = proto;
-		x0->props.family = AF_INET6;
-		x0->props.mode = mode;
-		x0->props.reqid = reqid;
-		x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
-		xfrm_state_hold(x0);
-		x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
-		add_timer(&x0->timer);
-		xfrm_state_hold(x0);
-		list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h);
-		h = __xfrm6_src_hash(saddr);
-		xfrm_state_hold(x0);
-		list_add_tail(&x0->bysrc, xfrm6_state_afinfo.state_bysrc+h);
-		wake_up(&km_waitq);
-	}
-	if (x0)
-		xfrm_state_hold(x0);
-	return x0;
-}
-
 static int
 __xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
 {
@@ -280,7 +225,6 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.init_tempsel		= __xfrm6_init_tempsel,
 	.state_lookup		= __xfrm6_state_lookup,
 	.state_lookup_byaddr	= __xfrm6_state_lookup_byaddr,
-	.find_acq		= __xfrm6_find_acq,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
 };
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 622e92a08d0..80f5f9dc2b9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -48,6 +48,18 @@ static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE];
 static struct list_head xfrm_state_bysrc[XFRM_DST_HSIZE];
 static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE];
 
+static __inline__
+unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return __xfrm4_dst_hash(addr);
+	case AF_INET6:
+		return __xfrm6_dst_hash(addr);
+	}
+	return 0;
+}
+
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
 EXPORT_SYMBOL(km_waitq);
 
@@ -489,6 +501,89 @@ void xfrm_state_insert(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_insert);
 
+/* xfrm_state_lock is held */
+static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 reqid, u8 proto, xfrm_address_t *daddr, xfrm_address_t *saddr, int create)
+{
+	unsigned int h = xfrm_dst_hash(daddr, family);
+	struct xfrm_state *x;
+
+	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
+		if (x->props.reqid  != reqid ||
+		    x->props.mode   != mode ||
+		    x->props.family != family ||
+		    x->km.state     != XFRM_STATE_ACQ ||
+		    x->id.spi       != 0)
+			continue;
+
+		switch (family) {
+		case AF_INET:
+			if (x->id.daddr.a4    != daddr->a4 ||
+			    x->props.saddr.a4 != saddr->a4)
+				continue;
+			break;
+		case AF_INET6:
+			if (!ipv6_addr_equal((struct in6_addr *)x->id.daddr.a6,
+					     (struct in6_addr *)daddr) ||
+			    !ipv6_addr_equal((struct in6_addr *)
+					     x->props.saddr.a6,
+					     (struct in6_addr *)saddr))
+				continue;
+			break;
+		};
+
+		xfrm_state_hold(x);
+		return x;
+	}
+
+	if (!create)
+		return NULL;
+
+	x = xfrm_state_alloc();
+	if (likely(x)) {
+		switch (family) {
+		case AF_INET:
+			x->sel.daddr.a4 = daddr->a4;
+			x->sel.saddr.a4 = saddr->a4;
+			x->sel.prefixlen_d = 32;
+			x->sel.prefixlen_s = 32;
+			x->props.saddr.a4 = saddr->a4;
+			x->id.daddr.a4 = daddr->a4;
+			break;
+
+		case AF_INET6:
+			ipv6_addr_copy((struct in6_addr *)x->sel.daddr.a6,
+				       (struct in6_addr *)daddr);
+			ipv6_addr_copy((struct in6_addr *)x->sel.saddr.a6,
+				       (struct in6_addr *)saddr);
+			x->sel.prefixlen_d = 128;
+			x->sel.prefixlen_s = 128;
+			ipv6_addr_copy((struct in6_addr *)x->props.saddr.a6,
+				       (struct in6_addr *)saddr);
+			ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6,
+				       (struct in6_addr *)daddr);
+			break;
+		};
+
+		x->km.state = XFRM_STATE_ACQ;
+		x->id.proto = proto;
+		x->props.family = family;
+		x->props.mode = mode;
+		x->props.reqid = reqid;
+		x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
+		xfrm_state_hold(x);
+		x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
+		add_timer(&x->timer);
+		xfrm_state_hold(x);
+		list_add_tail(&x->bydst, xfrm_state_bydst+h);
+		h = xfrm_src_hash(saddr, family);
+		xfrm_state_hold(x);
+		list_add_tail(&x->bysrc, xfrm_state_bysrc+h);
+		wake_up(&km_waitq);
+	}
+
+	return x;
+}
+
 static inline struct xfrm_state *
 __xfrm_state_locate(struct xfrm_state_afinfo *afinfo, struct xfrm_state *x,
 		    int use_spi)
@@ -533,9 +628,9 @@ int xfrm_state_add(struct xfrm_state *x)
 	}
 
 	if (use_spi && !x1)
-		x1 = afinfo->find_acq(
-			x->props.mode, x->props.reqid, x->id.proto,
-			&x->id.daddr, &x->props.saddr, 0);
+		x1 = __find_acq_core(family, x->props.mode, x->props.reqid,
+				     x->id.proto,
+				     &x->id.daddr, &x->props.saddr, 0);
 
 	__xfrm_state_insert(x);
 	err = 0;
@@ -716,14 +811,11 @@ xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
 	      int create, unsigned short family)
 {
 	struct xfrm_state *x;
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return NULL;
 
 	spin_lock_bh(&xfrm_state_lock);
-	x = afinfo->find_acq(mode, reqid, proto, daddr, saddr, create);
+	x = __find_acq_core(family, mode, reqid, proto, daddr, saddr, create);
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
+
 	return x;
 }
 EXPORT_SYMBOL(xfrm_find_acq);
@@ -1181,7 +1273,6 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
 	if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
 		err = -ENOBUFS;
 	else {
-		afinfo->state_bydst = xfrm_state_bydst;
 		afinfo->state_bysrc = xfrm_state_bysrc;
 		afinfo->state_byspi = xfrm_state_byspi;
 		xfrm_state_afinfo[afinfo->family] = afinfo;
@@ -1206,7 +1297,6 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 			xfrm_state_afinfo[afinfo->family] = NULL;
 			afinfo->state_byspi = NULL;
 			afinfo->state_bysrc = NULL;
-			afinfo->state_bydst = NULL;
 		}
 	}
 	write_unlock_bh(&xfrm_state_afinfo_lock);
-- 
cgit v1.2.3-70-g09d2


From edcd582152090bfb0ccb4ad444c151798a73eda8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 00:42:45 -0700
Subject: [XFRM]: Pull xfrm_state_by{spi,src} hash table knowledge out of
 afinfo.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  78 ------------------
 net/ipv4/xfrm4_state.c |  28 -------
 net/ipv6/xfrm6_state.c |  40 ----------
 net/xfrm/xfrm_state.c  | 210 +++++++++++++++++++++++++++++++++++++------------
 4 files changed, 159 insertions(+), 197 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index cc83443f301..dd3b84b9c04 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -243,14 +243,10 @@ extern int __xfrm_state_delete(struct xfrm_state *x);
 
 struct xfrm_state_afinfo {
 	unsigned short		family;
-	struct list_head	*state_bysrc;
-	struct list_head	*state_byspi;
 	int			(*init_flags)(struct xfrm_state *x);
 	void			(*init_tempsel)(struct xfrm_state *x, struct flowi *fl,
 						struct xfrm_tmpl *tmpl,
 						xfrm_address_t *daddr, xfrm_address_t *saddr);
-	struct xfrm_state	*(*state_lookup)(xfrm_address_t *daddr, u32 spi, u8 proto);
-	struct xfrm_state	*(*state_lookup_byaddr)(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto);
 	int			(*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n);
 	int			(*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n);
 };
@@ -431,80 +427,6 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
 }
 #endif
 
-#define XFRM_DST_HSIZE		1024
-
-static __inline__
-unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
-{
-	unsigned h;
-	h = ntohl(addr->a4);
-	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
-	return h;
-}
-
-static __inline__
-unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
-{
-	unsigned h;
-	h = ntohl(addr->a6[2]^addr->a6[3]);
-	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
-	return h;
-}
-
-static __inline__
-unsigned __xfrm4_src_hash(xfrm_address_t *addr)
-{
-	return __xfrm4_dst_hash(addr);
-}
-
-static __inline__
-unsigned __xfrm6_src_hash(xfrm_address_t *addr)
-{
-	return __xfrm6_dst_hash(addr);
-}
-
-static __inline__
-unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
-{
-	switch (family) {
-	case AF_INET:
-		return __xfrm4_src_hash(addr);
-	case AF_INET6:
-		return __xfrm6_src_hash(addr);
-	}
-	return 0;
-}
-
-static __inline__
-unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
-{
-	unsigned h;
-	h = ntohl(addr->a4^spi^proto);
-	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
-	return h;
-}
-
-static __inline__
-unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
-{
-	unsigned h;
-	h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
-	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
-	return h;
-}
-
-static __inline__
-unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
-{
-	switch (family) {
-	case AF_INET:
-		return __xfrm4_spi_hash(addr, spi, proto);
-	case AF_INET6:
-		return __xfrm6_spi_hash(addr, spi, proto);
-	}
-	return 0;	/*XXX*/
-}
-
 extern void __xfrm_state_destroy(struct xfrm_state *);
 
 static inline void __xfrm_state_put(struct xfrm_state *x)
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 9dc1afc17b6..6a2a4ab4277 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -62,38 +62,10 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	x->props.family = AF_INET;
 }
 
-static struct xfrm_state *
-__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
-{
-	unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
-	struct xfrm_state *x;
-
-	list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
-		if (x->props.family == AF_INET &&
-		    spi == x->id.spi &&
-		    daddr->a4 == x->id.daddr.a4 &&
-		    proto == x->id.proto) {
-			xfrm_state_hold(x);
-			return x;
-		}
-	}
-	return NULL;
-}
-
-/* placeholder until ipv4's code is written */
-static struct xfrm_state *
-__xfrm4_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
-			    u8 proto)
-{
-	return NULL;
-}
-
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
-	.state_lookup		= __xfrm4_state_lookup,
-	.state_lookup_byaddr	= __xfrm4_state_lookup_byaddr,
 };
 
 void __init xfrm4_state_init(void)
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 40fcaab7e02..d88cd92c864 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -63,44 +63,6 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	x->props.family = AF_INET6;
 }
 
-static struct xfrm_state *
-__xfrm6_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
-			    u8 proto)
-{
-	struct xfrm_state *x = NULL;
-	unsigned h;
-
-	h = __xfrm6_src_hash(saddr);
-	list_for_each_entry(x, xfrm6_state_afinfo.state_bysrc+h, bysrc) {
-		if (x->props.family == AF_INET6 &&
-		    ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
-		    ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) &&
-		    proto == x->id.proto) {
-			xfrm_state_hold(x);
-			return x;
-		}
-	}
-	return NULL;
-}
-
-static struct xfrm_state *
-__xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
-{
-	unsigned h = __xfrm6_spi_hash(daddr, spi, proto);
-	struct xfrm_state *x;
-
-	list_for_each_entry(x, xfrm6_state_afinfo.state_byspi+h, byspi) {
-		if (x->props.family == AF_INET6 &&
-		    spi == x->id.spi &&
-		    ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) &&
-		    proto == x->id.proto) {
-			xfrm_state_hold(x);
-			return x;
-		}
-	}
-	return NULL;
-}
-
 static int
 __xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
 {
@@ -223,8 +185,6 @@ __xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
 	.init_tempsel		= __xfrm6_init_tempsel,
-	.state_lookup		= __xfrm6_state_lookup,
-	.state_lookup_byaddr	= __xfrm6_state_lookup_byaddr,
 	.tmpl_sort		= __xfrm6_tmpl_sort,
 	.state_sort		= __xfrm6_state_sort,
 };
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 80f5f9dc2b9..4a3832f81c3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -38,6 +38,8 @@ EXPORT_SYMBOL(sysctl_xfrm_aevent_rseqth);
 
 static DEFINE_SPINLOCK(xfrm_state_lock);
 
+#define XFRM_DST_HSIZE		1024
+
 /* Hash table to find appropriate SA towards given target (endpoint
  * of tunnel or destination of transport mode) allowed by selector.
  *
@@ -48,6 +50,48 @@ static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE];
 static struct list_head xfrm_state_bysrc[XFRM_DST_HSIZE];
 static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE];
 
+static __inline__
+unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
+{
+	unsigned h;
+	h = ntohl(addr->a4);
+	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+	return h;
+}
+
+static __inline__
+unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
+{
+	unsigned h;
+	h = ntohl(addr->a6[2]^addr->a6[3]);
+	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+	return h;
+}
+
+static __inline__
+unsigned __xfrm4_src_hash(xfrm_address_t *addr)
+{
+	return __xfrm4_dst_hash(addr);
+}
+
+static __inline__
+unsigned __xfrm6_src_hash(xfrm_address_t *addr)
+{
+	return __xfrm6_dst_hash(addr);
+}
+
+static __inline__
+unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return __xfrm4_src_hash(addr);
+	case AF_INET6:
+		return __xfrm6_src_hash(addr);
+	}
+	return 0;
+}
+
 static __inline__
 unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
 {
@@ -60,6 +104,36 @@ unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
 	return 0;
 }
 
+static __inline__
+unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+{
+	unsigned h;
+	h = ntohl(addr->a4^spi^proto);
+	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+	return h;
+}
+
+static __inline__
+unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+{
+	unsigned h;
+	h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
+	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+	return h;
+}
+
+static __inline__
+unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return __xfrm4_spi_hash(addr, spi, proto);
+	case AF_INET6:
+		return __xfrm6_spi_hash(addr, spi, proto);
+	}
+	return 0;	/*XXX*/
+}
+
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
 EXPORT_SYMBOL(km_waitq);
 
@@ -342,6 +416,83 @@ xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	return 0;
 }
 
+static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family)
+{
+	unsigned int h = xfrm_spi_hash(daddr, spi, proto, family);
+	struct xfrm_state *x;
+
+	list_for_each_entry(x, xfrm_state_byspi+h, byspi) {
+		if (x->props.family != family ||
+		    x->id.spi       != spi ||
+		    x->id.proto     != proto)
+			continue;
+
+		switch (family) {
+		case AF_INET:
+			if (x->id.daddr.a4 != daddr->a4)
+				continue;
+			break;
+		case AF_INET6:
+			if (!ipv6_addr_equal((struct in6_addr *)daddr,
+					     (struct in6_addr *)
+					     x->id.daddr.a6))
+				continue;
+			break;
+		};
+
+		xfrm_state_hold(x);
+		return x;
+	}
+
+	return NULL;
+}
+
+static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family)
+{
+	unsigned int h = xfrm_src_hash(saddr, family);
+	struct xfrm_state *x;
+
+	list_for_each_entry(x, xfrm_state_bysrc+h, bysrc) {
+		if (x->props.family != family ||
+		    x->id.proto     != proto)
+			continue;
+
+		switch (family) {
+		case AF_INET:
+			if (x->id.daddr.a4 != daddr->a4 ||
+			    x->props.saddr.a4 != saddr->a4)
+				continue;
+			break;
+		case AF_INET6:
+			if (!ipv6_addr_equal((struct in6_addr *)daddr,
+					     (struct in6_addr *)
+					     x->id.daddr.a6) ||
+			    !ipv6_addr_equal((struct in6_addr *)saddr,
+					     (struct in6_addr *)
+					     x->props.saddr.a6))
+				continue;
+			break;
+		};
+
+		xfrm_state_hold(x);
+		return x;
+	}
+
+	return NULL;
+}
+
+static inline struct xfrm_state *
+__xfrm_state_locate(struct xfrm_state *x, int use_spi, int family)
+{
+	if (use_spi)
+		return __xfrm_state_lookup(&x->id.daddr, x->id.spi,
+					   x->id.proto, family);
+	else
+		return __xfrm_state_lookup_byaddr(&x->id.daddr,
+						  &x->props.saddr,
+						  x->id.proto, family);
+}
+
 struct xfrm_state *
 xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, 
 		struct flowi *fl, struct xfrm_tmpl *tmpl,
@@ -353,14 +504,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	int acquire_in_progress = 0;
 	int error = 0;
 	struct xfrm_state *best = NULL;
-	struct xfrm_state_afinfo *afinfo;
 	
-	afinfo = xfrm_state_get_afinfo(family);
-	if (afinfo == NULL) {
-		*err = -EAFNOSUPPORT;
-		return NULL;
-	}
-
 	spin_lock_bh(&xfrm_state_lock);
 	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
 		if (x->props.family == family &&
@@ -406,8 +550,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 	x = best;
 	if (!x && !error && !acquire_in_progress) {
 		if (tmpl->id.spi &&
-		    (x0 = afinfo->state_lookup(daddr, tmpl->id.spi,
-		                               tmpl->id.proto)) != NULL) {
+		    (x0 = __xfrm_state_lookup(daddr, tmpl->id.spi,
+					      tmpl->id.proto, family)) != NULL) {
 			xfrm_state_put(x0);
 			error = -EEXIST;
 			goto out;
@@ -457,7 +601,6 @@ out:
 	else
 		*err = acquire_in_progress ? -EAGAIN : error;
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
 	return x;
 }
 
@@ -584,34 +727,20 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
 	return x;
 }
 
-static inline struct xfrm_state *
-__xfrm_state_locate(struct xfrm_state_afinfo *afinfo, struct xfrm_state *x,
-		    int use_spi)
-{
-	if (use_spi)
-		return afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto);
-	else
-		return afinfo->state_lookup_byaddr(&x->id.daddr, &x->props.saddr, x->id.proto);
-}
-
 static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq);
 
 int xfrm_state_add(struct xfrm_state *x)
 {
-	struct xfrm_state_afinfo *afinfo;
 	struct xfrm_state *x1;
 	int family;
 	int err;
 	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
 
 	family = x->props.family;
-	afinfo = xfrm_state_get_afinfo(family);
-	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
 
 	spin_lock_bh(&xfrm_state_lock);
 
-	x1 = __xfrm_state_locate(afinfo, x, use_spi);
+	x1 = __xfrm_state_locate(x, use_spi, family);
 	if (x1) {
 		xfrm_state_put(x1);
 		x1 = NULL;
@@ -637,7 +766,6 @@ int xfrm_state_add(struct xfrm_state *x)
 
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
 
 	if (!err)
 		xfrm_flush_all_bundles();
@@ -653,17 +781,12 @@ EXPORT_SYMBOL(xfrm_state_add);
 
 int xfrm_state_update(struct xfrm_state *x)
 {
-	struct xfrm_state_afinfo *afinfo;
 	struct xfrm_state *x1;
 	int err;
 	int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY);
 
-	afinfo = xfrm_state_get_afinfo(x->props.family);
-	if (unlikely(afinfo == NULL))
-		return -EAFNOSUPPORT;
-
 	spin_lock_bh(&xfrm_state_lock);
-	x1 = __xfrm_state_locate(afinfo, x, use_spi);
+	x1 = __xfrm_state_locate(x, use_spi, x->props.family);
 
 	err = -ESRCH;
 	if (!x1)
@@ -683,7 +806,6 @@ int xfrm_state_update(struct xfrm_state *x)
 
 out:
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
 
 	if (err)
 		return err;
@@ -776,14 +898,10 @@ xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto,
 		  unsigned short family)
 {
 	struct xfrm_state *x;
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return NULL;
 
 	spin_lock_bh(&xfrm_state_lock);
-	x = afinfo->state_lookup(daddr, spi, proto);
+	x = __xfrm_state_lookup(daddr, spi, proto, family);
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
 	return x;
 }
 EXPORT_SYMBOL(xfrm_state_lookup);
@@ -793,14 +911,10 @@ xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
 			 u8 proto, unsigned short family)
 {
 	struct xfrm_state *x;
-	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
-	if (!afinfo)
-		return NULL;
 
 	spin_lock_bh(&xfrm_state_lock);
-	x = afinfo->state_lookup_byaddr(daddr, saddr, proto);
+	x = __xfrm_state_lookup_byaddr(daddr, saddr, proto, family);
 	spin_unlock_bh(&xfrm_state_lock);
-	xfrm_state_put_afinfo(afinfo);
 	return x;
 }
 EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
@@ -1272,11 +1386,8 @@ int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo)
 	write_lock_bh(&xfrm_state_afinfo_lock);
 	if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL))
 		err = -ENOBUFS;
-	else {
-		afinfo->state_bysrc = xfrm_state_bysrc;
-		afinfo->state_byspi = xfrm_state_byspi;
+	else
 		xfrm_state_afinfo[afinfo->family] = afinfo;
-	}
 	write_unlock_bh(&xfrm_state_afinfo_lock);
 	return err;
 }
@@ -1293,11 +1404,8 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 	if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) {
 		if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo))
 			err = -EINVAL;
-		else {
+		else
 			xfrm_state_afinfo[afinfo->family] = NULL;
-			afinfo->state_byspi = NULL;
-			afinfo->state_bysrc = NULL;
-		}
 	}
 	write_unlock_bh(&xfrm_state_afinfo_lock);
 	return err;
-- 
cgit v1.2.3-70-g09d2


From 8f126e37c0b250310a48a609bedf92a19a5559ec Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 02:45:07 -0700
Subject: [XFRM]: Convert xfrm_state hash linkage to hlists.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h    |  6 ++--
 net/xfrm/xfrm_state.c | 92 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index dd3b84b9c04..3405e5d9d51 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -94,9 +94,9 @@ extern struct mutex xfrm_cfg_mutex;
 struct xfrm_state
 {
 	/* Note: bydst is re-used during gc */
-	struct list_head	bydst;
-	struct list_head	bysrc;
-	struct list_head	byspi;
+	struct hlist_node	bydst;
+	struct hlist_node	bysrc;
+	struct hlist_node	byspi;
 
 	atomic_t		refcnt;
 	spinlock_t		lock;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 4a3832f81c3..fe3c8c38d5e 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -46,9 +46,9 @@ static DEFINE_SPINLOCK(xfrm_state_lock);
  * Main use is finding SA after policy selected tunnel or transport mode.
  * Also, it can be used by ah/esp icmp error handler to find offending SA.
  */
-static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE];
-static struct list_head xfrm_state_bysrc[XFRM_DST_HSIZE];
-static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE];
+static struct hlist_head xfrm_state_bydst[XFRM_DST_HSIZE];
+static struct hlist_head xfrm_state_bysrc[XFRM_DST_HSIZE];
+static struct hlist_head xfrm_state_byspi[XFRM_DST_HSIZE];
 
 static __inline__
 unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
@@ -141,7 +141,7 @@ static DEFINE_RWLOCK(xfrm_state_afinfo_lock);
 static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO];
 
 static struct work_struct xfrm_state_gc_work;
-static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list);
+static HLIST_HEAD(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
 static int xfrm_state_gc_flush_bundles;
@@ -178,8 +178,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 static void xfrm_state_gc_task(void *data)
 {
 	struct xfrm_state *x;
-	struct list_head *entry, *tmp;
-	struct list_head gc_list = LIST_HEAD_INIT(gc_list);
+	struct hlist_node *entry, *tmp;
+	struct hlist_head gc_list;
 
 	if (xfrm_state_gc_flush_bundles) {
 		xfrm_state_gc_flush_bundles = 0;
@@ -187,13 +187,13 @@ static void xfrm_state_gc_task(void *data)
 	}
 
 	spin_lock_bh(&xfrm_state_gc_lock);
-	list_splice_init(&xfrm_state_gc_list, &gc_list);
+	gc_list.first = xfrm_state_gc_list.first;
+	INIT_HLIST_HEAD(&xfrm_state_gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 
-	list_for_each_safe(entry, tmp, &gc_list) {
-		x = list_entry(entry, struct xfrm_state, bydst);
+	hlist_for_each_entry_safe(x, entry, tmp, &gc_list, bydst)
 		xfrm_state_gc_destroy(x);
-	}
+
 	wake_up(&km_waitq);
 }
 
@@ -287,9 +287,9 @@ struct xfrm_state *xfrm_state_alloc(void)
 	if (x) {
 		atomic_set(&x->refcnt, 1);
 		atomic_set(&x->tunnel_users, 0);
-		INIT_LIST_HEAD(&x->bydst);
-		INIT_LIST_HEAD(&x->bysrc);
-		INIT_LIST_HEAD(&x->byspi);
+		INIT_HLIST_NODE(&x->bydst);
+		INIT_HLIST_NODE(&x->bysrc);
+		INIT_HLIST_NODE(&x->byspi);
 		init_timer(&x->timer);
 		x->timer.function = xfrm_timer_handler;
 		x->timer.data	  = (unsigned long)x;
@@ -314,7 +314,7 @@ void __xfrm_state_destroy(struct xfrm_state *x)
 	BUG_TRAP(x->km.state == XFRM_STATE_DEAD);
 
 	spin_lock_bh(&xfrm_state_gc_lock);
-	list_add(&x->bydst, &xfrm_state_gc_list);
+	hlist_add_head(&x->bydst, &xfrm_state_gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 	schedule_work(&xfrm_state_gc_work);
 }
@@ -327,12 +327,12 @@ int __xfrm_state_delete(struct xfrm_state *x)
 	if (x->km.state != XFRM_STATE_DEAD) {
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
-		list_del(&x->bydst);
+		hlist_del(&x->bydst);
 		__xfrm_state_put(x);
-		list_del(&x->bysrc);
+		hlist_del(&x->bysrc);
 		__xfrm_state_put(x);
 		if (x->id.spi) {
-			list_del(&x->byspi);
+			hlist_del(&x->byspi);
 			__xfrm_state_put(x);
 		}
 		spin_unlock(&xfrm_state_lock);
@@ -378,12 +378,13 @@ EXPORT_SYMBOL(xfrm_state_delete);
 void xfrm_state_flush(u8 proto)
 {
 	int i;
-	struct xfrm_state *x;
 
 	spin_lock_bh(&xfrm_state_lock);
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
+		struct hlist_node *entry;
+		struct xfrm_state *x;
 restart:
-		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
+		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
 			if (!xfrm_state_kern(x) &&
 			    xfrm_id_proto_match(x->id.proto, proto)) {
 				xfrm_state_hold(x);
@@ -420,8 +421,9 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8
 {
 	unsigned int h = xfrm_spi_hash(daddr, spi, proto, family);
 	struct xfrm_state *x;
+	struct hlist_node *entry;
 
-	list_for_each_entry(x, xfrm_state_byspi+h, byspi) {
+	hlist_for_each_entry(x, entry, xfrm_state_byspi+h, byspi) {
 		if (x->props.family != family ||
 		    x->id.spi       != spi ||
 		    x->id.proto     != proto)
@@ -451,8 +453,9 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm
 {
 	unsigned int h = xfrm_src_hash(saddr, family);
 	struct xfrm_state *x;
+	struct hlist_node *entry;
 
-	list_for_each_entry(x, xfrm_state_bysrc+h, bysrc) {
+	hlist_for_each_entry(x, entry, xfrm_state_bysrc+h, bysrc) {
 		if (x->props.family != family ||
 		    x->id.proto     != proto)
 			continue;
@@ -499,14 +502,15 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 		struct xfrm_policy *pol, int *err,
 		unsigned short family)
 {
-	unsigned h = xfrm_dst_hash(daddr, family);
+	unsigned int h = xfrm_dst_hash(daddr, family);
+	struct hlist_node *entry;
 	struct xfrm_state *x, *x0;
 	int acquire_in_progress = 0;
 	int error = 0;
 	struct xfrm_state *best = NULL;
 	
 	spin_lock_bh(&xfrm_state_lock);
-	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
+	hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
 		if (x->props.family == family &&
 		    x->props.reqid == tmpl->reqid &&
 		    !(x->props.flags & XFRM_STATE_WILDRECV) &&
@@ -575,13 +579,14 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
 
 		if (km_query(x, tmpl, pol) == 0) {
 			x->km.state = XFRM_STATE_ACQ;
-			list_add_tail(&x->bydst, xfrm_state_bydst+h);
+			hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 			xfrm_state_hold(x);
-			list_add_tail(&x->bysrc, xfrm_state_bysrc+h);
+			h = xfrm_src_hash(saddr, family);
+			hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
 			xfrm_state_hold(x);
 			if (x->id.spi) {
 				h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family);
-				list_add(&x->byspi, xfrm_state_byspi+h);
+				hlist_add_head(&x->byspi, xfrm_state_byspi+h);
 				xfrm_state_hold(x);
 			}
 			x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
@@ -608,19 +613,19 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 {
 	unsigned h = xfrm_dst_hash(&x->id.daddr, x->props.family);
 
-	list_add(&x->bydst, xfrm_state_bydst+h);
+	hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 	xfrm_state_hold(x);
 
 	h = xfrm_src_hash(&x->props.saddr, x->props.family);
 
-	list_add(&x->bysrc, xfrm_state_bysrc+h);
+	hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
 	xfrm_state_hold(x);
 
 	if (xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY)) {
 		h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
 				  x->props.family);
 
-		list_add(&x->byspi, xfrm_state_byspi+h);
+		hlist_add_head(&x->byspi, xfrm_state_byspi+h);
 		xfrm_state_hold(x);
 	}
 
@@ -648,9 +653,10 @@ EXPORT_SYMBOL(xfrm_state_insert);
 static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 reqid, u8 proto, xfrm_address_t *daddr, xfrm_address_t *saddr, int create)
 {
 	unsigned int h = xfrm_dst_hash(daddr, family);
+	struct hlist_node *entry;
 	struct xfrm_state *x;
 
-	list_for_each_entry(x, xfrm_state_bydst+h, bydst) {
+	hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
 		if (x->props.reqid  != reqid ||
 		    x->props.mode   != mode ||
 		    x->props.family != family ||
@@ -717,10 +723,10 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
 		x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
 		add_timer(&x->timer);
 		xfrm_state_hold(x);
-		list_add_tail(&x->bydst, xfrm_state_bydst+h);
+		hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 		h = xfrm_src_hash(saddr, family);
 		xfrm_state_hold(x);
-		list_add_tail(&x->bysrc, xfrm_state_bysrc+h);
+		hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
 		wake_up(&km_waitq);
 	}
 
@@ -977,11 +983,14 @@ EXPORT_SYMBOL(xfrm_state_sort);
 static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq)
 {
 	int i;
-	struct xfrm_state *x;
 
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
-		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
-			if (x->km.seq == seq && x->km.state == XFRM_STATE_ACQ) {
+		struct hlist_node *entry;
+		struct xfrm_state *x;
+
+		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
+			if (x->km.seq == seq &&
+			    x->km.state == XFRM_STATE_ACQ) {
 				xfrm_state_hold(x);
 				return x;
 			}
@@ -1047,7 +1056,7 @@ xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
 	if (x->id.spi) {
 		spin_lock_bh(&xfrm_state_lock);
 		h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family);
-		list_add(&x->byspi, xfrm_state_byspi+h);
+		hlist_add_head(&x->byspi, xfrm_state_byspi+h);
 		xfrm_state_hold(x);
 		spin_unlock_bh(&xfrm_state_lock);
 		wake_up(&km_waitq);
@@ -1060,12 +1069,13 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 {
 	int i;
 	struct xfrm_state *x;
+	struct hlist_node *entry;
 	int count = 0;
 	int err = 0;
 
 	spin_lock_bh(&xfrm_state_lock);
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
-		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
+		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
 			if (xfrm_id_proto_match(x->id.proto, proto))
 				count++;
 		}
@@ -1076,7 +1086,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 	}
 
 	for (i = 0; i < XFRM_DST_HSIZE; i++) {
-		list_for_each_entry(x, xfrm_state_bydst+i, bydst) {
+		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
 			if (!xfrm_id_proto_match(x->id.proto, proto))
 				continue;
 			err = func(x, --count, data);
@@ -1524,9 +1534,9 @@ void __init xfrm_state_init(void)
 	int i;
 
 	for (i=0; i<XFRM_DST_HSIZE; i++) {
-		INIT_LIST_HEAD(&xfrm_state_bydst[i]);
-		INIT_LIST_HEAD(&xfrm_state_bysrc[i]);
-		INIT_LIST_HEAD(&xfrm_state_byspi[i]);
+		INIT_HLIST_HEAD(&xfrm_state_bydst[i]);
+		INIT_HLIST_HEAD(&xfrm_state_bysrc[i]);
+		INIT_HLIST_HEAD(&xfrm_state_byspi[i]);
 	}
 	INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
 }
-- 
cgit v1.2.3-70-g09d2


From f034b5d4efdfe0fb9e2a1ce1d95fa7914f24de49 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 03:08:07 -0700
Subject: [XFRM]: Dynamic xfrm_state hash table sizing.

The grow algorithm is simple, we grow if:

1) we see a hash chain collision at insert, and
2) we haven't hit the hash size limit (currently 1*1024*1024 slots), and
3) the number of xfrm_state objects is > the current hash mask

All of this needs some tweaking.

Remove __initdata from "hashdist" so we can use it safely at run time.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bootmem.h |   2 +-
 mm/page_alloc.c         |   2 +-
 net/xfrm/xfrm_state.c   | 247 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 197 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 1021f508d82..e319c649e4f 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -114,7 +114,7 @@ extern void *__init alloc_large_system_hash(const char *tablename,
 #else
 #define HASHDIST_DEFAULT 0
 #endif
-extern int __initdata hashdist;		/* Distribute hashes across NUMA nodes? */
+extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 
 
 #endif /* _LINUX_BOOTMEM_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f5375bb..3b5358a0561 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index fe3c8c38d5e..445263c54c9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -18,6 +18,9 @@
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/vmalloc.h>
+#include <linux/cache.h>
 #include <asm/uaccess.h>
 
 struct sock *xfrm_nl;
@@ -38,102 +41,230 @@ EXPORT_SYMBOL(sysctl_xfrm_aevent_rseqth);
 
 static DEFINE_SPINLOCK(xfrm_state_lock);
 
-#define XFRM_DST_HSIZE		1024
-
 /* Hash table to find appropriate SA towards given target (endpoint
  * of tunnel or destination of transport mode) allowed by selector.
  *
  * Main use is finding SA after policy selected tunnel or transport mode.
  * Also, it can be used by ah/esp icmp error handler to find offending SA.
  */
-static struct hlist_head xfrm_state_bydst[XFRM_DST_HSIZE];
-static struct hlist_head xfrm_state_bysrc[XFRM_DST_HSIZE];
-static struct hlist_head xfrm_state_byspi[XFRM_DST_HSIZE];
-
-static __inline__
-unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
+static struct hlist_head *xfrm_state_bydst __read_mostly;
+static struct hlist_head *xfrm_state_bysrc __read_mostly;
+static struct hlist_head *xfrm_state_byspi __read_mostly;
+static unsigned int xfrm_state_hmask __read_mostly;
+static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+static unsigned int xfrm_state_num;
+
+static inline unsigned int __xfrm4_dst_hash(xfrm_address_t *addr, unsigned int hmask)
 {
-	unsigned h;
+	unsigned int h;
 	h = ntohl(addr->a4);
-	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+	h = (h ^ (h>>16)) & hmask;
 	return h;
 }
 
-static __inline__
-unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm6_dst_hash(xfrm_address_t *addr, unsigned int hmask)
 {
-	unsigned h;
+	unsigned int h;
 	h = ntohl(addr->a6[2]^addr->a6[3]);
-	h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+	h = (h ^ (h>>16)) & hmask;
 	return h;
 }
 
-static __inline__
-unsigned __xfrm4_src_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm4_src_hash(xfrm_address_t *addr, unsigned int hmask)
 {
-	return __xfrm4_dst_hash(addr);
+	return __xfrm4_dst_hash(addr, hmask);
 }
 
-static __inline__
-unsigned __xfrm6_src_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm6_src_hash(xfrm_address_t *addr, unsigned int hmask)
 {
-	return __xfrm6_dst_hash(addr);
+	return __xfrm6_dst_hash(addr, hmask);
 }
 
-static __inline__
-unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
+static inline unsigned __xfrm_src_hash(xfrm_address_t *addr, unsigned short family,  unsigned int hmask)
 {
 	switch (family) {
 	case AF_INET:
-		return __xfrm4_src_hash(addr);
+		return __xfrm4_src_hash(addr, hmask);
 	case AF_INET6:
-		return __xfrm6_src_hash(addr);
+		return __xfrm6_src_hash(addr, hmask);
 	}
 	return 0;
 }
 
-static __inline__
-unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
+static inline unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
+{
+	return __xfrm_src_hash(addr, family, xfrm_state_hmask);
+}
+
+static inline unsigned int __xfrm_dst_hash(xfrm_address_t *addr, unsigned short family, unsigned int hmask)
 {
 	switch (family) {
 	case AF_INET:
-		return __xfrm4_dst_hash(addr);
+		return __xfrm4_dst_hash(addr, hmask);
 	case AF_INET6:
-		return __xfrm6_dst_hash(addr);
+		return __xfrm6_dst_hash(addr, hmask);
 	}
 	return 0;
 }
 
-static __inline__
-unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+static inline unsigned int xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
+{
+	return __xfrm_dst_hash(addr, family, xfrm_state_hmask);
+}
+
+static inline unsigned int __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto,
+					unsigned int hmask)
 {
-	unsigned h;
+	unsigned int h;
 	h = ntohl(addr->a4^spi^proto);
-	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+	h = (h ^ (h>>10) ^ (h>>20)) & hmask;
 	return h;
 }
 
-static __inline__
-unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+static inline unsigned int __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto,
+					    unsigned int hmask)
 {
-	unsigned h;
+	unsigned int h;
 	h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
-	h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+	h = (h ^ (h>>10) ^ (h>>20)) & hmask;
 	return h;
 }
 
-static __inline__
-unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
+static inline
+unsigned __xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family,
+			 unsigned int hmask)
 {
 	switch (family) {
 	case AF_INET:
-		return __xfrm4_spi_hash(addr, spi, proto);
+		return __xfrm4_spi_hash(addr, spi, proto, hmask);
 	case AF_INET6:
-		return __xfrm6_spi_hash(addr, spi, proto);
+		return __xfrm6_spi_hash(addr, spi, proto, hmask);
 	}
 	return 0;	/*XXX*/
 }
 
+static inline unsigned int
+xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
+{
+	return __xfrm_spi_hash(addr, spi, proto, family, xfrm_state_hmask);
+}
+
+static struct hlist_head *xfrm_state_hash_alloc(unsigned int sz)
+{
+	struct hlist_head *n;
+
+	if (sz <= PAGE_SIZE)
+		n = kmalloc(sz, GFP_KERNEL);
+	else if (hashdist)
+		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+	else
+		n = (struct hlist_head *)
+			__get_free_pages(GFP_KERNEL, get_order(sz));
+
+	if (n)
+		memset(n, 0, sz);
+
+	return n;
+}
+
+static void xfrm_state_hash_free(struct hlist_head *n, unsigned int sz)
+{
+	if (sz <= PAGE_SIZE)
+		kfree(n);
+	else if (hashdist)
+		vfree(n);
+	else
+		free_pages((unsigned long)n, get_order(sz));
+}
+
+static void xfrm_hash_transfer(struct hlist_head *list,
+			       struct hlist_head *ndsttable,
+			       struct hlist_head *nsrctable,
+			       struct hlist_head *nspitable,
+			       unsigned int nhashmask)
+{
+	struct hlist_node *entry, *tmp;
+	struct xfrm_state *x;
+
+	hlist_for_each_entry_safe(x, entry, tmp, list, bydst) {
+		unsigned int h;
+
+		h = __xfrm_dst_hash(&x->id.daddr, x->props.family, nhashmask);
+		hlist_add_head(&x->bydst, ndsttable+h);
+
+		h = __xfrm_src_hash(&x->props.saddr, x->props.family,
+				    nhashmask);
+		hlist_add_head(&x->bysrc, nsrctable+h);
+
+		h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
+				    x->props.family, nhashmask);
+		hlist_add_head(&x->byspi, nspitable+h);
+	}
+}
+
+static unsigned long xfrm_hash_new_size(void)
+{
+	return ((xfrm_state_hmask + 1) << 1) *
+		sizeof(struct hlist_head);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void xfrm_hash_resize(void *__unused)
+{
+	struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi;
+	unsigned long nsize, osize;
+	unsigned int nhashmask, ohashmask;
+	int i;
+
+	mutex_lock(&hash_resize_mutex);
+
+	nsize = xfrm_hash_new_size();
+	ndst = xfrm_state_hash_alloc(nsize);
+	if (!ndst)
+		goto out_unlock;
+	nsrc = xfrm_state_hash_alloc(nsize);
+	if (!nsrc) {
+		xfrm_state_hash_free(ndst, nsize);
+		goto out_unlock;
+	}
+	nspi = xfrm_state_hash_alloc(nsize);
+	if (!nspi) {
+		xfrm_state_hash_free(ndst, nsize);
+		xfrm_state_hash_free(nsrc, nsize);
+		goto out_unlock;
+	}
+
+	spin_lock_bh(&xfrm_state_lock);
+
+	nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+	for (i = xfrm_state_hmask; i >= 0; i--)
+		xfrm_hash_transfer(xfrm_state_bydst+i, ndst, nsrc, nspi,
+				   nhashmask);
+
+	odst = xfrm_state_bydst;
+	osrc = xfrm_state_bysrc;
+	ospi = xfrm_state_byspi;
+	ohashmask = xfrm_state_hmask;
+
+	xfrm_state_bydst = ndst;
+	xfrm_state_bysrc = nsrc;
+	xfrm_state_byspi = nspi;
+	xfrm_state_hmask = nhashmask;
+
+	spin_unlock_bh(&xfrm_state_lock);
+
+	osize = (ohashmask + 1) * sizeof(struct hlist_head);
+	xfrm_state_hash_free(odst, osize);
+	xfrm_state_hash_free(osrc, osize);
+	xfrm_state_hash_free(ospi, osize);
+
+out_unlock:
+	mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize, NULL);
+
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
 EXPORT_SYMBOL(km_waitq);
 
@@ -335,6 +466,7 @@ int __xfrm_state_delete(struct xfrm_state *x)
 			hlist_del(&x->byspi);
 			__xfrm_state_put(x);
 		}
+		xfrm_state_num--;
 		spin_unlock(&xfrm_state_lock);
 		if (del_timer(&x->timer))
 			__xfrm_state_put(x);
@@ -380,7 +512,7 @@ void xfrm_state_flush(u8 proto)
 	int i;
 
 	spin_lock_bh(&xfrm_state_lock);
-	for (i = 0; i < XFRM_DST_HSIZE; i++) {
+	for (i = 0; i < xfrm_state_hmask; i++) {
 		struct hlist_node *entry;
 		struct xfrm_state *x;
 restart:
@@ -611,7 +743,7 @@ out:
 
 static void __xfrm_state_insert(struct xfrm_state *x)
 {
-	unsigned h = xfrm_dst_hash(&x->id.daddr, x->props.family);
+	unsigned int h = xfrm_dst_hash(&x->id.daddr, x->props.family);
 
 	hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 	xfrm_state_hold(x);
@@ -637,6 +769,13 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 		xfrm_state_hold(x);
 
 	wake_up(&km_waitq);
+
+	xfrm_state_num++;
+
+	if (x->bydst.next != NULL &&
+	    (xfrm_state_hmask + 1) < xfrm_state_hashmax &&
+	    xfrm_state_num > xfrm_state_hmask)
+		schedule_work(&xfrm_hash_work);
 }
 
 void xfrm_state_insert(struct xfrm_state *x)
@@ -984,7 +1123,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq)
 {
 	int i;
 
-	for (i = 0; i < XFRM_DST_HSIZE; i++) {
+	for (i = 0; i <= xfrm_state_hmask; i++) {
 		struct hlist_node *entry;
 		struct xfrm_state *x;
 
@@ -1026,7 +1165,7 @@ EXPORT_SYMBOL(xfrm_get_acqseq);
 void
 xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
 {
-	u32 h;
+	unsigned int h;
 	struct xfrm_state *x0;
 
 	if (x->id.spi)
@@ -1074,7 +1213,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 	int err = 0;
 
 	spin_lock_bh(&xfrm_state_lock);
-	for (i = 0; i < XFRM_DST_HSIZE; i++) {
+	for (i = 0; i <= xfrm_state_hmask; i++) {
 		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
 			if (xfrm_id_proto_match(x->id.proto, proto))
 				count++;
@@ -1085,7 +1224,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
 		goto out;
 	}
 
-	for (i = 0; i < XFRM_DST_HSIZE; i++) {
+	for (i = 0; i <= xfrm_state_hmask; i++) {
 		hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
 			if (!xfrm_id_proto_match(x->id.proto, proto))
 				continue;
@@ -1531,13 +1670,17 @@ EXPORT_SYMBOL(xfrm_init_state);
  
 void __init xfrm_state_init(void)
 {
-	int i;
+	unsigned int sz;
+
+	sz = sizeof(struct hlist_head) * 8;
+
+	xfrm_state_bydst = xfrm_state_hash_alloc(sz);
+	xfrm_state_bysrc = xfrm_state_hash_alloc(sz);
+	xfrm_state_byspi = xfrm_state_hash_alloc(sz);
+	if (!xfrm_state_bydst || !xfrm_state_bysrc || !xfrm_state_byspi)
+		panic("XFRM: Cannot allocate bydst/bysrc/byspi hashes.");
+	xfrm_state_hmask = ((sz / sizeof(struct hlist_head)) - 1);
 
-	for (i=0; i<XFRM_DST_HSIZE; i++) {
-		INIT_HLIST_HEAD(&xfrm_state_bydst[i]);
-		INIT_HLIST_HEAD(&xfrm_state_bysrc[i]);
-		INIT_HLIST_HEAD(&xfrm_state_byspi[i]);
-	}
 	INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 9d4a706d852411154d0c91b9ffb3bec68b94b25c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 03:18:09 -0700
Subject: [XFRM]: Add generation count to xfrm_state and xfrm_dst.

Each xfrm_state inserted gets a new generation counter
value.  When a bundle is created, the xfrm_dst objects
get the current generation counter of the xfrm_state
they will attach to at dst->xfrm.

xfrm_bundle_ok() will return false if it sees an
xfrm_dst with a generation count different from the
generation count of the xfrm_state that dst points to.

This provides a facility by which to passively and
cheaply invalidate cached IPSEC routes during SA
database changes.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      | 3 +++
 net/ipv4/xfrm4_policy.c | 1 +
 net/ipv6/xfrm6_policy.c | 1 +
 net/xfrm/xfrm_policy.c  | 2 ++
 net/xfrm/xfrm_state.c   | 3 +++
 5 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3405e5d9d51..fd4a300b5ba 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -104,6 +104,8 @@ struct xfrm_state
 	struct xfrm_id		id;
 	struct xfrm_selector	sel;
 
+	u32			genid;
+
 	/* Key manger bits */
 	struct {
 		u8		state;
@@ -590,6 +592,7 @@ struct xfrm_dst
 		struct rt6_info		rt6;
 	} u;
 	struct dst_entry *route;
+	u32 genid;
 	u32 route_mtu_cached;
 	u32 child_mtu_cached;
 	u32 route_cookie;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 42d8ded0f96..479598566f1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -93,6 +93,7 @@ __xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 		xdst = (struct xfrm_dst *)dst1;
 		xdst->route = &rt->u.dst;
+		xdst->genid = xfrm[i]->genid;
 
 		dst1->next = dst_prev;
 		dst_prev = dst1;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 98c2fe449b3..9391c4c94fe 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -149,6 +149,7 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 		xdst = (struct xfrm_dst *)dst1;
 		xdst->route = &rt->u.dst;
+		xdst->genid = xfrm[i]->genid;
 		if (rt->rt6i_node)
 			xdst->route_cookie = rt->rt6i_node->fn_sernum;
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 1732159ffd0..7fc6944ee36 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1536,6 +1536,8 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family, int str
 			return 0;
 		if (dst->xfrm->km.state != XFRM_STATE_VALID)
 			return 0;
+		if (xdst->genid != dst->xfrm->genid)
+			return 0;
 
 		if (strict && fl && dst->xfrm->props.mode != XFRM_MODE_TUNNEL &&
 		    !xfrm_state_addr_flow_check(dst->xfrm, fl, family))
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 445263c54c9..535d43c1472 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -53,6 +53,7 @@ static struct hlist_head *xfrm_state_byspi __read_mostly;
 static unsigned int xfrm_state_hmask __read_mostly;
 static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static unsigned int xfrm_state_num;
+static unsigned int xfrm_state_genid;
 
 static inline unsigned int __xfrm4_dst_hash(xfrm_address_t *addr, unsigned int hmask)
 {
@@ -745,6 +746,8 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 {
 	unsigned int h = xfrm_dst_hash(&x->id.daddr, x->props.family);
 
+	x->genid = ++xfrm_state_genid;
+
 	hlist_add_head(&x->bydst, xfrm_state_bydst+h);
 	xfrm_state_hold(x);
 
-- 
cgit v1.2.3-70-g09d2


From c7f5ea3a4d1ae6b3b426e113358fdc57494bc754 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 03:29:04 -0700
Subject: [XFRM]: Do not flush all bundles on SA insert.

Instead, simply set all potentially aliasing existing xfrm_state
objects to have the current generation counter value.

This will make routes get relooked up the next time an existing
route mentioning these aliased xfrm_state objects gets used,
via xfrm_dst_check().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  1 -
 net/xfrm/xfrm_policy.c | 10 ----------
 net/xfrm/xfrm_state.c  | 25 ++++++++++++++++++++-----
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index fd4a300b5ba..a620a43c9ee 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -996,7 +996,6 @@ struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
 extern void xfrm_policy_flush(u8 type);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
 extern int xfrm_flush_bundles(void);
-extern void xfrm_flush_all_bundles(void);
 extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family, int strict);
 extern void xfrm_init_pmtu(struct dst_entry *dst);
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7fc6944ee36..cfa5c692f2e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1478,16 +1478,6 @@ int xfrm_flush_bundles(void)
 	return 0;
 }
 
-static int always_true(struct dst_entry *dst)
-{
-	return 1;
-}
-
-void xfrm_flush_all_bundles(void)
-{
-	xfrm_prune_bundles(always_true);
-}
-
 void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 98200397e09..77ef796c9d0 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -761,13 +761,30 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 		schedule_work(&xfrm_hash_work);
 }
 
+/* xfrm_state_lock is held */
+static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
+{
+	unsigned short family = xnew->props.family;
+	u32 reqid = xnew->props.reqid;
+	struct xfrm_state *x;
+	struct hlist_node *entry;
+	unsigned int h;
+
+	h = xfrm_dst_hash(&xnew->id.daddr, reqid, family);
+	hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
+		if (x->props.family	== family &&
+		    x->props.reqid	== reqid &&
+		    !xfrm_addr_cmp(&x->id.daddr, &xnew->id.daddr, family))
+			x->genid = xfrm_state_genid;
+	}
+}
+
 void xfrm_state_insert(struct xfrm_state *x)
 {
 	spin_lock_bh(&xfrm_state_lock);
+	__xfrm_state_bump_genids(x);
 	__xfrm_state_insert(x);
 	spin_unlock_bh(&xfrm_state_lock);
-
-	xfrm_flush_all_bundles();
 }
 EXPORT_SYMBOL(xfrm_state_insert);
 
@@ -889,15 +906,13 @@ int xfrm_state_add(struct xfrm_state *x)
 				     x->id.proto,
 				     &x->id.daddr, &x->props.saddr, 0);
 
+	__xfrm_state_bump_genids(x);
 	__xfrm_state_insert(x);
 	err = 0;
 
 out:
 	spin_unlock_bh(&xfrm_state_lock);
 
-	if (!err)
-		xfrm_flush_all_bundles();
-
 	if (x1) {
 		xfrm_state_delete(x1);
 		xfrm_state_put(x1);
-- 
cgit v1.2.3-70-g09d2


From 1c0953997567b22e32fdf85d3b4bc0f2461fd161 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 03:30:28 -0700
Subject: [XFRM]: Purge dst references to deleted SAs passively.

Just let GC and other normal mechanisms take care of getting
rid of DST cache references to deleted xfrm_state objects
instead of walking all the policy bundles.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  1 -
 net/xfrm/xfrm_policy.c |  2 +-
 net/xfrm/xfrm_state.c  | 17 -----------------
 3 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a620a43c9ee..c7870b6eae0 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -995,7 +995,6 @@ struct xfrm_state * xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
 				  int create, unsigned short family);
 extern void xfrm_policy_flush(u8 type);
 extern int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
-extern int xfrm_flush_bundles(void);
 extern int xfrm_bundle_ok(struct xfrm_dst *xdst, struct flowi *fl, int family, int strict);
 extern void xfrm_init_pmtu(struct dst_entry *dst);
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index cfa5c692f2e..1bcaae4adf3 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1472,7 +1472,7 @@ static void __xfrm_garbage_collect(void)
 	xfrm_prune_bundles(unused_bundle);
 }
 
-int xfrm_flush_bundles(void)
+static int xfrm_flush_bundles(void)
 {
 	xfrm_prune_bundles(stale_bundle);
 	return 0;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 77ef796c9d0..9ff00b7d6ad 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -256,8 +256,6 @@ static struct work_struct xfrm_state_gc_work;
 static HLIST_HEAD(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
-static int xfrm_state_gc_flush_bundles;
-
 int __xfrm_state_delete(struct xfrm_state *x);
 
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
@@ -293,11 +291,6 @@ static void xfrm_state_gc_task(void *data)
 	struct hlist_node *entry, *tmp;
 	struct hlist_head gc_list;
 
-	if (xfrm_state_gc_flush_bundles) {
-		xfrm_state_gc_flush_bundles = 0;
-		xfrm_flush_bundles();
-	}
-
 	spin_lock_bh(&xfrm_state_gc_lock);
 	gc_list.first = xfrm_state_gc_list.first;
 	INIT_HLIST_HEAD(&xfrm_state_gc_list);
@@ -454,16 +447,6 @@ int __xfrm_state_delete(struct xfrm_state *x)
 		if (del_timer(&x->rtimer))
 			__xfrm_state_put(x);
 
-		/* The number two in this test is the reference
-		 * mentioned in the comment below plus the reference
-		 * our caller holds.  A larger value means that
-		 * there are DSTs attached to this xfrm_state.
-		 */
-		if (atomic_read(&x->refcnt) > 2) {
-			xfrm_state_gc_flush_bundles = 1;
-			schedule_work(&xfrm_state_gc_work);
-		}
-
 		/* All xfrm_state objects are created by xfrm_state_alloc.
 		 * The xfrm_state_alloc call gives a reference, and that
 		 * is what we are dropping here.
-- 
cgit v1.2.3-70-g09d2


From 2518c7c2b3d7f0a6b302b4efe17c911f8dd4049f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 24 Aug 2006 04:45:07 -0700
Subject: [XFRM]: Hash policies when non-prefixed.

This idea is from Alexey Kuznetsov.

It is common for policies to be non-prefixed.  And for
that case we can optimize lookups, insert, etc. quite
a bit.

For each direction, we have a dynamically sized policy
hash table for non-prefixed policies.  We also have a
hash table on policy->index.

For prefixed policies, we have a list per-direction which
we will consult on lookups when a non-prefix hashtable
lookup fails.

This still isn't as efficient as I would like it.  There
are four immediate problems:

1) Lots of excessive refcounting, which can be fixed just
   like xfrm_state was
2) We do 2 hash probes on insert, one to look for dups and
   one to allocate a unique policy->index.  Althought I wonder
   how much this matters since xfrm_state inserts do up to
   3 hash probes and that seems to perform fine.
3) xfrm_policy_insert() is very complex because of the priority
   ordering and entry replacement logic.
4) Lots of counter bumping, in addition to policy refcounts,
   in the form of xfrm_policy_count[].  This is merely used
   to let code path(s) know that some IPSEC rules exist.  So
   this count is indexed per-direction, maybe that is overkill.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h     |  23 +-
 net/xfrm/xfrm_policy.c | 681 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 546 insertions(+), 158 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c7870b6eae0..0acabf2a0a8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -331,7 +331,8 @@ struct xfrm_tmpl
 struct xfrm_policy
 {
 	struct xfrm_policy	*next;
-	struct list_head	list;
+	struct hlist_node	bydst;
+	struct hlist_node	byidx;
 
 	/* This lock only affects elements except for entry. */
 	rwlock_t		lock;
@@ -385,21 +386,7 @@ struct xfrm_mgr
 extern int xfrm_register_km(struct xfrm_mgr *km);
 extern int xfrm_unregister_km(struct xfrm_mgr *km);
 
-
-extern struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
-#ifdef CONFIG_XFRM_SUB_POLICY
-extern struct xfrm_policy *xfrm_policy_list_sub[XFRM_POLICY_MAX*2];
-
-static inline int xfrm_policy_lists_empty(int dir)
-{
-	return (!xfrm_policy_list[dir] && !xfrm_policy_list_sub[dir]);
-}
-#else
-static inline int xfrm_policy_lists_empty(int dir)
-{
-	return (!xfrm_policy_list[dir]);
-}
-#endif
+extern unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
 
 static inline void xfrm_pol_hold(struct xfrm_policy *policy)
 {
@@ -678,7 +665,7 @@ static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *sk
 	if (sk && sk->sk_policy[XFRM_POLICY_IN])
 		return __xfrm_policy_check(sk, dir, skb, family);
 
-	return	(xfrm_policy_lists_empty(dir) && !skb->sp) ||
+	return	(!xfrm_policy_count[dir] && !skb->sp) ||
 		(skb->dst->flags & DST_NOPOLICY) ||
 		__xfrm_policy_check(sk, dir, skb, family);
 }
@@ -698,7 +685,7 @@ extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
 
 static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
-	return	xfrm_policy_lists_empty(XFRM_POLICY_OUT) ||
+	return	!xfrm_policy_count[XFRM_POLICY_OUT] ||
 		(skb->dst->flags & DST_NOXFRM) ||
 		__xfrm_route_forward(skb, family);
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 1bcaae4adf3..087a5443b05 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -22,6 +22,9 @@
 #include <linux/netdevice.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/vmalloc.h>
+#include <linux/cache.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
 
@@ -30,26 +33,8 @@ EXPORT_SYMBOL(xfrm_cfg_mutex);
 
 static DEFINE_RWLOCK(xfrm_policy_lock);
 
-struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2];
-EXPORT_SYMBOL(xfrm_policy_list);
-#ifdef CONFIG_XFRM_SUB_POLICY
-struct xfrm_policy *xfrm_policy_list_sub[XFRM_POLICY_MAX*2];
-EXPORT_SYMBOL(xfrm_policy_list_sub);
-
-#define XFRM_POLICY_LISTS(type) \
-	((type == XFRM_POLICY_TYPE_SUB) ? xfrm_policy_list_sub : \
-	 xfrm_policy_list)
-#define XFRM_POLICY_LISTHEAD(type, dir) \
-	((type == XFRM_POLICY_TYPE_SUB) ? xfrm_policy_list_sub[dir] : \
-	 xfrm_policy_list[dir])
-#define XFRM_POLICY_LISTHEADP(type, dir) \
-	((type == XFRM_POLICY_TYPE_SUB) ? &xfrm_policy_list_sub[dir] : \
-	 &xfrm_policy_list[dir])
-#else
-#define XFRM_POLICY_LISTS(type)              xfrm_policy_list
-#define XFRM_POLICY_LISTHEAD(type, dif)      xfrm_policy_list[dir]
-#define XFRM_POLICY_LISTHEADP(type, dif)     &xfrm_policy_list[dir]
-#endif
+unsigned int xfrm_policy_count[XFRM_POLICY_MAX*2];
+EXPORT_SYMBOL(xfrm_policy_count);
 
 static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
@@ -57,8 +42,7 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO];
 static kmem_cache_t *xfrm_dst_cache __read_mostly;
 
 static struct work_struct xfrm_policy_gc_work;
-static struct list_head xfrm_policy_gc_list =
-	LIST_HEAD_INIT(xfrm_policy_gc_list);
+static HLIST_HEAD(xfrm_policy_gc_list);
 static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
 
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
@@ -328,8 +312,10 @@ struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
 	policy = kzalloc(sizeof(struct xfrm_policy), gfp);
 
 	if (policy) {
-		atomic_set(&policy->refcnt, 1);
+		INIT_HLIST_NODE(&policy->bydst);
+		INIT_HLIST_NODE(&policy->byidx);
 		rwlock_init(&policy->lock);
+		atomic_set(&policy->refcnt, 1);
 		init_timer(&policy->timer);
 		policy->timer.data = (unsigned long)policy;
 		policy->timer.function = xfrm_policy_timer;
@@ -375,17 +361,16 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 static void xfrm_policy_gc_task(void *data)
 {
 	struct xfrm_policy *policy;
-	struct list_head *entry, *tmp;
-	struct list_head gc_list = LIST_HEAD_INIT(gc_list);
+	struct hlist_node *entry, *tmp;
+	struct hlist_head gc_list;
 
 	spin_lock_bh(&xfrm_policy_gc_lock);
-	list_splice_init(&xfrm_policy_gc_list, &gc_list);
+	gc_list.first = xfrm_policy_gc_list.first;
+	INIT_HLIST_HEAD(&xfrm_policy_gc_list);
 	spin_unlock_bh(&xfrm_policy_gc_lock);
 
-	list_for_each_safe(entry, tmp, &gc_list) {
-		policy = list_entry(entry, struct xfrm_policy, list);
+	hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst)
 		xfrm_policy_gc_kill(policy);
-	}
 }
 
 /* Rule must be locked. Release descentant resources, announce
@@ -407,70 +392,354 @@ static void xfrm_policy_kill(struct xfrm_policy *policy)
 	}
 
 	spin_lock(&xfrm_policy_gc_lock);
-	list_add(&policy->list, &xfrm_policy_gc_list);
+	hlist_add_head(&policy->bydst, &xfrm_policy_gc_list);
 	spin_unlock(&xfrm_policy_gc_lock);
 
 	schedule_work(&xfrm_policy_gc_work);
 }
 
+struct xfrm_policy_hash {
+	struct hlist_head	*table;
+	unsigned int		hmask;
+};
+
+static struct hlist_head xfrm_policy_inexact[XFRM_POLICY_MAX*2];
+static struct xfrm_policy_hash xfrm_policy_bydst[XFRM_POLICY_MAX*2] __read_mostly;
+static struct hlist_head *xfrm_policy_byidx __read_mostly;
+static unsigned int xfrm_idx_hmask __read_mostly;
+static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024;
+
+static inline unsigned int __idx_hash(u32 index, unsigned int hmask)
+{
+	return (index ^ (index >> 8)) & hmask;
+}
+
+static inline unsigned int idx_hash(u32 index)
+{
+	return __idx_hash(index, xfrm_idx_hmask);
+}
+
+static inline unsigned int __sel_hash(struct xfrm_selector *sel, unsigned short family, unsigned int hmask)
+{
+	xfrm_address_t *daddr = &sel->daddr;
+	xfrm_address_t *saddr = &sel->saddr;
+	unsigned int h = 0;
+
+	switch (family) {
+	case AF_INET:
+		if (sel->prefixlen_d != 32 ||
+		    sel->prefixlen_s != 32)
+			return hmask + 1;
+
+		h = ntohl(daddr->a4 ^ saddr->a4);
+		break;
+
+	case AF_INET6:
+		if (sel->prefixlen_d != 128 ||
+		    sel->prefixlen_s != 128)
+			return hmask + 1;
+
+		h = ntohl(daddr->a6[2] ^ daddr->a6[3] ^
+			  saddr->a6[2] ^ saddr->a6[3]);
+		break;
+	};
+	h ^= (h >> 16);
+	return h & hmask;
+}
+
+static inline unsigned int __addr_hash(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, unsigned int hmask)
+{
+	unsigned int h = 0;
+
+	switch (family) {
+	case AF_INET:
+		h = ntohl(daddr->a4 ^ saddr->a4);
+		break;
+
+	case AF_INET6:
+		h = ntohl(daddr->a6[2] ^ daddr->a6[3] ^
+			  saddr->a6[2] ^ saddr->a6[3]);
+		break;
+	};
+	h ^= (h >> 16);
+	return h & hmask;
+}
+
+static struct hlist_head *policy_hash_bysel(struct xfrm_selector *sel, unsigned short family, int dir)
+{
+	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
+	unsigned int hash = __sel_hash(sel, family, hmask);
+
+	return (hash == hmask + 1 ?
+		&xfrm_policy_inexact[dir] :
+		xfrm_policy_bydst[dir].table + hash);
+}
+
+static struct hlist_head *policy_hash_direct(xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, int dir)
+{
+	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
+	unsigned int hash = __addr_hash(daddr, saddr, family, hmask);
+
+	return xfrm_policy_bydst[dir].table + hash;
+}
+
+static struct hlist_head *xfrm_policy_hash_alloc(unsigned int sz)
+{
+	struct hlist_head *n;
+
+	if (sz <= PAGE_SIZE)
+		n = kmalloc(sz, GFP_KERNEL);
+	else if (hashdist)
+		n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+	else
+		n = (struct hlist_head *)
+			__get_free_pages(GFP_KERNEL, get_order(sz));
+
+	if (n)
+		memset(n, 0, sz);
+
+	return n;
+}
+
+static void xfrm_policy_hash_free(struct hlist_head *n, unsigned int sz)
+{
+	if (sz <= PAGE_SIZE)
+		kfree(n);
+	else if (hashdist)
+		vfree(n);
+	else
+		free_pages((unsigned long)n, get_order(sz));
+}
+
+static void xfrm_dst_hash_transfer(struct hlist_head *list,
+				   struct hlist_head *ndsttable,
+				   unsigned int nhashmask)
+{
+	struct hlist_node *entry, *tmp;
+	struct xfrm_policy *pol;
+
+	hlist_for_each_entry_safe(pol, entry, tmp, list, bydst) {
+		unsigned int h;
+
+		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
+				pol->family, nhashmask);
+		hlist_add_head(&pol->bydst, ndsttable+h);
+	}
+}
+
+static void xfrm_idx_hash_transfer(struct hlist_head *list,
+				   struct hlist_head *nidxtable,
+				   unsigned int nhashmask)
+{
+	struct hlist_node *entry, *tmp;
+	struct xfrm_policy *pol;
+
+	hlist_for_each_entry_safe(pol, entry, tmp, list, byidx) {
+		unsigned int h;
+
+		h = __idx_hash(pol->index, nhashmask);
+		hlist_add_head(&pol->byidx, nidxtable+h);
+	}
+}
+
+static unsigned long xfrm_new_hash_mask(unsigned int old_hmask)
+{
+	return ((old_hmask + 1) << 1) - 1;
+}
+
+static void xfrm_bydst_resize(int dir)
+{
+	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
+	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+	struct hlist_head *odst = xfrm_policy_bydst[dir].table;
+	struct hlist_head *ndst = xfrm_policy_hash_alloc(nsize);
+	int i;
+
+	if (!ndst)
+		return;
+
+	write_lock_bh(&xfrm_policy_lock);
+
+	for (i = hmask; i >= 0; i--)
+		xfrm_dst_hash_transfer(odst + i, ndst, nhashmask);
+
+	xfrm_policy_bydst[dir].table = ndst;
+	xfrm_policy_bydst[dir].hmask = nhashmask;
+
+	write_unlock_bh(&xfrm_policy_lock);
+
+	xfrm_policy_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+static void xfrm_byidx_resize(int total)
+{
+	unsigned int hmask = xfrm_idx_hmask;
+	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
+	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
+	struct hlist_head *oidx = xfrm_policy_byidx;
+	struct hlist_head *nidx = xfrm_policy_hash_alloc(nsize);
+	int i;
+
+	if (!nidx)
+		return;
+
+	write_lock_bh(&xfrm_policy_lock);
+
+	for (i = hmask; i >= 0; i--)
+		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
+
+	xfrm_policy_byidx = nidx;
+	xfrm_idx_hmask = nhashmask;
+
+	write_unlock_bh(&xfrm_policy_lock);
+
+	xfrm_policy_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
+}
+
+static inline int xfrm_bydst_should_resize(int dir, int *total)
+{
+	unsigned int cnt = xfrm_policy_count[dir];
+	unsigned int hmask = xfrm_policy_bydst[dir].hmask;
+
+	if (total)
+		*total += cnt;
+
+	if ((hmask + 1) < xfrm_policy_hashmax &&
+	    cnt > hmask)
+		return 1;
+
+	return 0;
+}
+
+static inline int xfrm_byidx_should_resize(int total)
+{
+	unsigned int hmask = xfrm_idx_hmask;
+
+	if ((hmask + 1) < xfrm_policy_hashmax &&
+	    total > hmask)
+		return 1;
+
+	return 0;
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void xfrm_hash_resize(void *__unused)
+{
+	int dir, total;
+
+	mutex_lock(&hash_resize_mutex);
+
+	total = 0;
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		if (xfrm_bydst_should_resize(dir, &total))
+			xfrm_bydst_resize(dir);
+	}
+	if (xfrm_byidx_should_resize(total))
+		xfrm_byidx_resize(total);
+
+	mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize, NULL);
+
 /* Generate new index... KAME seems to generate them ordered by cost
  * of an absolute inpredictability of ordering of rules. This will not pass. */
 static u32 xfrm_gen_index(u8 type, int dir)
 {
-	u32 idx;
-	struct xfrm_policy *p;
 	static u32 idx_generator;
 
 	for (;;) {
+		struct hlist_node *entry;
+		struct hlist_head *list;
+		struct xfrm_policy *p;
+		u32 idx;
+		int found;
+
 		idx = (idx_generator | dir);
 		idx_generator += 8;
 		if (idx == 0)
 			idx = 8;
-		for (p = XFRM_POLICY_LISTHEAD(type, dir); p; p = p->next) {
-			if (p->index == idx)
+		list = xfrm_policy_byidx + idx_hash(idx);
+		found = 0;
+		hlist_for_each_entry(p, entry, list, byidx) {
+			if (p->index == idx) {
+				found = 1;
 				break;
+			}
 		}
-		if (!p)
+		if (!found)
 			return idx;
 	}
 }
 
+static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2)
+{
+	u32 *p1 = (u32 *) s1;
+	u32 *p2 = (u32 *) s2;
+	int len = sizeof(struct xfrm_selector) / sizeof(u32);
+	int i;
+
+	for (i = 0; i < len; i++) {
+		if (p1[i] != p2[i])
+			return 1;
+	}
+
+	return 0;
+}
+
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 {
-	struct xfrm_policy *pol, **p;
-	struct xfrm_policy *delpol = NULL;
-	struct xfrm_policy **newpos = NULL;
+	struct xfrm_policy *pol;
+	struct xfrm_policy *delpol;
+	struct hlist_head *chain;
+	struct hlist_node *entry, *newpos, *last;
 	struct dst_entry *gc_list;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = XFRM_POLICY_LISTHEADP(policy->type, dir); (pol=*p)!=NULL;) {
-		if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 &&
+	chain = policy_hash_bysel(&policy->selector, policy->family, dir);
+	delpol = NULL;
+	newpos = NULL;
+	last = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		if (!delpol &&
+		    pol->type == policy->type &&
+		    !selector_cmp(&pol->selector, &policy->selector) &&
 		    xfrm_sec_ctx_match(pol->security, policy->security)) {
 			if (excl) {
 				write_unlock_bh(&xfrm_policy_lock);
 				return -EEXIST;
 			}
-			*p = pol->next;
 			delpol = pol;
 			if (policy->priority > pol->priority)
 				continue;
 		} else if (policy->priority >= pol->priority) {
-			p = &pol->next;
+			last = &pol->bydst;
 			continue;
 		}
 		if (!newpos)
-			newpos = p;
+			newpos = &pol->bydst;
 		if (delpol)
 			break;
-		p = &pol->next;
+		last = &pol->bydst;
 	}
+	if (!newpos)
+		newpos = last;
 	if (newpos)
-		p = newpos;
+		hlist_add_after(newpos, &policy->bydst);
+	else
+		hlist_add_head(&policy->bydst, chain);
 	xfrm_pol_hold(policy);
-	policy->next = *p;
-	*p = policy;
+	xfrm_policy_count[dir]++;
 	atomic_inc(&flow_cache_genid);
+	if (delpol) {
+		hlist_del(&delpol->bydst);
+		hlist_del(&delpol->byidx);
+		xfrm_policy_count[dir]--;
+	}
 	policy->index = delpol ? delpol->index : xfrm_gen_index(policy->type, dir);
+	hlist_add_head(&policy->byidx, xfrm_policy_byidx+idx_hash(policy->index));
 	policy->curlft.add_time = (unsigned long)xtime.tv_sec;
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
@@ -479,10 +748,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 
 	if (delpol)
 		xfrm_policy_kill(delpol);
+	else if (xfrm_bydst_should_resize(dir, NULL))
+		schedule_work(&xfrm_hash_work);
 
 	read_lock_bh(&xfrm_policy_lock);
 	gc_list = NULL;
-	for (policy = policy->next; policy; policy = policy->next) {
+	entry = &policy->bydst;
+	hlist_for_each_entry_continue(policy, entry, bydst) {
 		struct dst_entry *dst;
 
 		write_lock(&policy->lock);
@@ -515,67 +787,112 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(u8 type, int dir,
 					  struct xfrm_selector *sel,
 					  struct xfrm_sec_ctx *ctx, int delete)
 {
-	struct xfrm_policy *pol, **p;
+	struct xfrm_policy *pol, *ret;
+	struct hlist_head *chain;
+	struct hlist_node *entry;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = XFRM_POLICY_LISTHEADP(type, dir); (pol=*p)!=NULL; p = &pol->next) {
-		if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) &&
-		    (xfrm_sec_ctx_match(ctx, pol->security))) {
+	chain = policy_hash_bysel(sel, sel->family, dir);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		if (pol->type == type &&
+		    !selector_cmp(sel, &pol->selector) &&
+		    xfrm_sec_ctx_match(ctx, pol->security)) {
 			xfrm_pol_hold(pol);
-			if (delete)
-				*p = pol->next;
+			if (delete) {
+				hlist_del(&pol->bydst);
+				hlist_del(&pol->byidx);
+				xfrm_policy_count[dir]--;
+			}
+			ret = pol;
 			break;
 		}
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (pol && delete) {
+	if (ret && delete) {
 		atomic_inc(&flow_cache_genid);
-		xfrm_policy_kill(pol);
+		xfrm_policy_kill(ret);
 	}
-	return pol;
+	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
 
 struct xfrm_policy *xfrm_policy_byid(u8 type, int dir, u32 id, int delete)
 {
-	struct xfrm_policy *pol, **p;
+	struct xfrm_policy *pol, *ret;
+	struct hlist_head *chain;
+	struct hlist_node *entry;
 
 	write_lock_bh(&xfrm_policy_lock);
-	for (p = XFRM_POLICY_LISTHEADP(type, dir); (pol=*p)!=NULL; p = &pol->next) {
-		if (pol->index == id) {
+	chain = xfrm_policy_byidx + idx_hash(id);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, byidx) {
+		if (pol->type == type && pol->index == id) {
 			xfrm_pol_hold(pol);
-			if (delete)
-				*p = pol->next;
+			if (delete) {
+				hlist_del(&pol->bydst);
+				hlist_del(&pol->byidx);
+				xfrm_policy_count[dir]--;
+			}
+			ret = pol;
 			break;
 		}
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (pol && delete) {
+	if (ret && delete) {
 		atomic_inc(&flow_cache_genid);
-		xfrm_policy_kill(pol);
+		xfrm_policy_kill(ret);
 	}
-	return pol;
+	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
 
 void xfrm_policy_flush(u8 type)
 {
-	struct xfrm_policy *xp;
-	struct xfrm_policy **p_list = XFRM_POLICY_LISTS(type);
 	int dir;
 
 	write_lock_bh(&xfrm_policy_lock);
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		while ((xp = p_list[dir]) != NULL) {
-			p_list[dir] = xp->next;
+		struct xfrm_policy *pol;
+		struct hlist_node *entry;
+		int i;
+
+	again1:
+		hlist_for_each_entry(pol, entry,
+				     &xfrm_policy_inexact[dir], bydst) {
+			if (pol->type != type)
+				continue;
+			hlist_del(&pol->bydst);
+			hlist_del(&pol->byidx);
 			write_unlock_bh(&xfrm_policy_lock);
 
-			xfrm_policy_kill(xp);
+			xfrm_policy_kill(pol);
 
 			write_lock_bh(&xfrm_policy_lock);
+			goto again1;
+		}
+
+		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
+	again2:
+			hlist_for_each_entry(pol, entry,
+					     xfrm_policy_bydst[dir].table + i,
+					     bydst) {
+				if (pol->type != type)
+					continue;
+				hlist_del(&pol->bydst);
+				hlist_del(&pol->byidx);
+				write_unlock_bh(&xfrm_policy_lock);
+
+				xfrm_policy_kill(pol);
+
+				write_lock_bh(&xfrm_policy_lock);
+				goto again2;
+			}
 		}
+
+		xfrm_policy_count[dir] = 0;
 	}
 	atomic_inc(&flow_cache_genid);
 	write_unlock_bh(&xfrm_policy_lock);
@@ -585,15 +902,27 @@ EXPORT_SYMBOL(xfrm_policy_flush);
 int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*),
 		     void *data)
 {
-	struct xfrm_policy *xp;
-	int dir;
-	int count = 0;
-	int error = 0;
+	struct xfrm_policy *pol;
+	struct hlist_node *entry;
+	int dir, count, error;
 
 	read_lock_bh(&xfrm_policy_lock);
+	count = 0;
 	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
-		for (xp = XFRM_POLICY_LISTHEAD(type, dir); xp; xp = xp->next)
-			count++;
+		struct hlist_head *table = xfrm_policy_bydst[dir].table;
+		int i;
+
+		hlist_for_each_entry(pol, entry,
+				     &xfrm_policy_inexact[dir], bydst) {
+			if (pol->type == type)
+				count++;
+		}
+		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
+			hlist_for_each_entry(pol, entry, table + i, bydst) {
+				if (pol->type == type)
+					count++;
+			}
+		}
 	}
 
 	if (count == 0) {
@@ -602,13 +931,28 @@ int xfrm_policy_walk(u8 type, int (*func)(struct xfrm_policy *, int, int, void*)
 	}
 
 	for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) {
-		for (xp = XFRM_POLICY_LISTHEAD(type, dir); xp; xp = xp->next) {
-			error = func(xp, dir%XFRM_POLICY_MAX, --count, data);
+		struct hlist_head *table = xfrm_policy_bydst[dir].table;
+		int i;
+
+		hlist_for_each_entry(pol, entry,
+				     &xfrm_policy_inexact[dir], bydst) {
+			if (pol->type != type)
+				continue;
+			error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
 			if (error)
 				goto out;
 		}
+		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
+			hlist_for_each_entry(pol, entry, table + i, bydst) {
+				if (pol->type != type)
+					continue;
+				error = func(pol, dir % XFRM_POLICY_MAX, --count, data);
+				if (error)
+					goto out;
+			}
+		}
 	}
-
+	error = 0;
 out:
 	read_unlock_bh(&xfrm_policy_lock);
 	return error;
@@ -617,31 +961,61 @@ EXPORT_SYMBOL(xfrm_policy_walk);
 
 /* Find policy to apply to this flow. */
 
-static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl,
-						     u16 family, u8 dir)
+static int xfrm_policy_match(struct xfrm_policy *pol, struct flowi *fl,
+			     u8 type, u16 family, int dir)
 {
-	struct xfrm_policy *pol;
+	struct xfrm_selector *sel = &pol->selector;
+	int match;
 
-	read_lock_bh(&xfrm_policy_lock);
-	for (pol = XFRM_POLICY_LISTHEAD(type, dir); pol; pol = pol->next) {
-		struct xfrm_selector *sel = &pol->selector;
-		int match;
+	if (pol->family != family ||
+	    pol->type != type)
+		return 0;
 
-		if (pol->family != family)
-			continue;
+	match = xfrm_selector_match(sel, fl, family);
+	if (match) {
+		if (!security_xfrm_policy_lookup(pol, fl->secid, dir))
+			return 1;
+	}
+
+	return 0;
+}
 
-		match = xfrm_selector_match(sel, fl, family);
+static struct xfrm_policy *xfrm_policy_lookup_bytype(u8 type, struct flowi *fl,
+						     u16 family, u8 dir)
+{
+	struct xfrm_policy *pol, *ret;
+	xfrm_address_t *daddr, *saddr;
+	struct hlist_node *entry;
+	struct hlist_head *chain;
 
-		if (match) {
- 			if (!security_xfrm_policy_lookup(pol, fl->secid, dir)) {
+	daddr = xfrm_flowi_daddr(fl, family);
+	saddr = xfrm_flowi_saddr(fl, family);
+	if (unlikely(!daddr || !saddr))
+		return NULL;
+
+	read_lock_bh(&xfrm_policy_lock);
+	chain = policy_hash_direct(daddr, saddr, family, dir);
+	ret = NULL;
+	hlist_for_each_entry(pol, entry, chain, bydst) {
+		if (xfrm_policy_match(pol, fl, type, family, dir)) {
+			xfrm_pol_hold(pol);
+			ret = pol;
+			break;
+		}
+	}
+	if (!ret) {
+		chain = &xfrm_policy_inexact[dir];
+		hlist_for_each_entry(pol, entry, chain, bydst) {
+			if (xfrm_policy_match(pol, fl, type, family, dir)) {
 				xfrm_pol_hold(pol);
+				ret = pol;
 				break;
 			}
 		}
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
-	return pol;
+	return ret;
 }
 
 static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
@@ -657,7 +1031,7 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir,
 	pol = xfrm_policy_lookup_bytype(XFRM_POLICY_TYPE_MAIN, fl, family, dir);
 
 #ifdef CONFIG_XFRM_SUB_POLICY
- end:
+end:
 #endif
 	if ((*objp = (void *) pol) != NULL)
 		*obj_refp = &pol->refcnt;
@@ -704,26 +1078,29 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struc
 
 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
 {
-	struct xfrm_policy **p_list = XFRM_POLICY_LISTS(pol->type);
+	struct hlist_head *chain = policy_hash_bysel(&pol->selector,
+						     pol->family, dir);
 
-	pol->next = p_list[dir];
-	p_list[dir] = pol;
+	hlist_add_head(&pol->bydst, chain);
+	hlist_add_head(&pol->byidx, xfrm_policy_byidx+idx_hash(pol->index));
+	xfrm_policy_count[dir]++;
 	xfrm_pol_hold(pol);
+
+	if (xfrm_bydst_should_resize(dir, NULL))
+		schedule_work(&xfrm_hash_work);
 }
 
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir)
 {
-	struct xfrm_policy **polp;
+	if (hlist_unhashed(&pol->bydst))
+		return NULL;
 
-	for (polp = XFRM_POLICY_LISTHEADP(pol->type, dir);
-	     *polp != NULL; polp = &(*polp)->next) {
-		if (*polp == pol) {
-			*polp = pol->next;
-			return pol;
-		}
-	}
-	return NULL;
+	hlist_del(&pol->bydst);
+	hlist_del(&pol->byidx);
+	xfrm_policy_count[dir]--;
+
+	return pol;
 }
 
 int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
@@ -968,7 +1345,8 @@ restart:
 
 	if (!policy) {
 		/* To accelerate a bit...  */
-		if ((dst_orig->flags & DST_NOXFRM) || xfrm_policy_lists_empty(XFRM_POLICY_OUT))
+		if ((dst_orig->flags & DST_NOXFRM) ||
+		    !xfrm_policy_count[XFRM_POLICY_OUT])
 			return 0;
 
 		policy = flow_cache_lookup(fl, dst_orig->ops->family,
@@ -1413,50 +1791,50 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
 	return dst;
 }
 
+static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
+{
+	struct dst_entry *dst, **dstp;
+
+	write_lock(&pol->lock);
+	dstp = &pol->bundles;
+	while ((dst=*dstp) != NULL) {
+		if (func(dst)) {
+			*dstp = dst->next;
+			dst->next = *gc_list_p;
+			*gc_list_p = dst;
+		} else {
+			dstp = &dst->next;
+		}
+	}
+	write_unlock(&pol->lock);
+}
+
 static void xfrm_prune_bundles(int (*func)(struct dst_entry *))
 {
-	int i;
-	struct xfrm_policy *pol;
-	struct dst_entry *dst, **dstp, *gc_list = NULL;
+	struct dst_entry *gc_list = NULL;
+	int dir;
 
 	read_lock_bh(&xfrm_policy_lock);
-	for (i=0; i<2*XFRM_POLICY_MAX; i++) {
-#ifdef CONFIG_XFRM_SUB_POLICY
-		for (pol = xfrm_policy_list_sub[i]; pol; pol = pol->next) {
-			write_lock(&pol->lock);
-			dstp = &pol->bundles;
-			while ((dst=*dstp) != NULL) {
-				if (func(dst)) {
-					*dstp = dst->next;
-					dst->next = gc_list;
-					gc_list = dst;
-				} else {
-					dstp = &dst->next;
-				}
-			}
-			write_unlock(&pol->lock);
-		}
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		struct xfrm_policy *pol;
+		struct hlist_node *entry;
+		struct hlist_head *table;
+		int i;
 
-#endif
-		for (pol = xfrm_policy_list[i]; pol; pol = pol->next) {
-			write_lock(&pol->lock);
-			dstp = &pol->bundles;
-			while ((dst=*dstp) != NULL) {
-				if (func(dst)) {
-					*dstp = dst->next;
-					dst->next = gc_list;
-					gc_list = dst;
-				} else {
-					dstp = &dst->next;
-				}
-			}
-			write_unlock(&pol->lock);
+		hlist_for_each_entry(pol, entry,
+				     &xfrm_policy_inexact[dir], bydst)
+			prune_one_bundle(pol, func, &gc_list);
+
+		table = xfrm_policy_bydst[dir].table;
+		for (i = xfrm_policy_bydst[dir].hmask; i >= 0; i--) {
+			hlist_for_each_entry(pol, entry, table + i, bydst)
+				prune_one_bundle(pol, func, &gc_list);
 		}
 	}
 	read_unlock_bh(&xfrm_policy_lock);
 
 	while (gc_list) {
-		dst = gc_list;
+		struct dst_entry *dst = gc_list;
 		gc_list = dst->next;
 		dst_free(dst);
 	}
@@ -1680,6 +2058,9 @@ static struct notifier_block xfrm_dev_notifier = {
 
 static void __init xfrm_policy_init(void)
 {
+	unsigned int hmask, sz;
+	int dir;
+
 	xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache",
 					   sizeof(struct xfrm_dst),
 					   0, SLAB_HWCACHE_ALIGN,
@@ -1687,6 +2068,26 @@ static void __init xfrm_policy_init(void)
 	if (!xfrm_dst_cache)
 		panic("XFRM: failed to allocate xfrm_dst_cache\n");
 
+	hmask = 8 - 1;
+	sz = (hmask+1) * sizeof(struct hlist_head);
+
+	xfrm_policy_byidx = xfrm_policy_hash_alloc(sz);
+	xfrm_idx_hmask = hmask;
+	if (!xfrm_policy_byidx)
+		panic("XFRM: failed to allocate byidx hash\n");
+
+	for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+		struct xfrm_policy_hash *htab;
+
+		INIT_HLIST_HEAD(&xfrm_policy_inexact[dir]);
+
+		htab = &xfrm_policy_bydst[dir];
+		htab->table = xfrm_policy_hash_alloc(sz);
+		htab->hmask = hmask;
+		if (!htab->table)
+			panic("XFRM: failed to allocate bydst hash\n");
+	}
+
 	INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL);
 	register_netdevice_notifier(&xfrm_dev_notifier);
 }
-- 
cgit v1.2.3-70-g09d2


From e4bec827feda76d5e7417a2696a75424834d564f Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 22 Sep 2006 15:17:35 -0700
Subject: [IPSEC] esp: Defer output IV initialization to first use.

First of all, if the xfrm_state only gets used for input
packets this entropy is a complete waste.

Secondly, it is often the case that a configuration loads
many rules (perhaps even dynamically) and they don't all
necessarily ever get used.

This get_random_bytes() call was showing up in the profiles
for xfrm_state inserts which is how I noticed this.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/esp.h | 5 +++--
 net/ipv4/esp4.c   | 9 +++++++--
 net/ipv6/esp6.c   | 9 +++++++--
 3 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/esp.h b/include/net/esp.h
index 064366d66ee..713d039f4af 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -15,13 +15,14 @@ struct esp_data
 	struct {
 		u8			*key;		/* Key */
 		int			key_len;	/* Key length */
-		u8			*ivec;		/* ivec buffer */
+		int			padlen;		/* 0..255 */
 		/* ivlen is offset from enc_data, where encrypted data start.
 		 * It is logically different of crypto_tfm_alg_ivsize(tfm).
 		 * We assume that it is either zero (no ivec), or
 		 * >= crypto_tfm_alg_ivsize(tfm). */
 		int			ivlen;
-		int			padlen;		/* 0..255 */
+		int			ivinitted;
+		u8			*ivec;		/* ivec buffer */
 		struct crypto_blkcipher	*tfm;		/* crypto handle */
 	} conf;
 
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e87377e1d6b..13b29360d10 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -95,8 +95,13 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	esph->seq_no = htonl(++x->replay.oseq);
 	xfrm_aevent_doreplay(x);
 
-	if (esp->conf.ivlen)
+	if (esp->conf.ivlen) {
+		if (unlikely(!esp->conf.ivinitted)) {
+			get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+			esp->conf.ivinitted = 1;
+		}
 		crypto_blkcipher_set_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+	}
 
 	do {
 		struct scatterlist *sg = &esp->sgbuf[0];
@@ -378,7 +383,7 @@ static int esp_init_state(struct xfrm_state *x)
 		esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
 		if (unlikely(esp->conf.ivec == NULL))
 			goto error;
-		get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+		esp->conf.ivinitted = 0;
 	}
 	if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
 		goto error;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ae50b951115..e78680a9985 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -99,8 +99,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	esph->seq_no = htonl(++x->replay.oseq);
 	xfrm_aevent_doreplay(x);
 
-	if (esp->conf.ivlen)
+	if (esp->conf.ivlen) {
+		if (unlikely(!esp->conf.ivinitted)) {
+			get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+			esp->conf.ivinitted = 1;
+		}
 		crypto_blkcipher_set_iv(tfm, esp->conf.ivec, esp->conf.ivlen);
+	}
 
 	do {
 		struct scatterlist *sg = &esp->sgbuf[0];
@@ -353,7 +358,7 @@ static int esp6_init_state(struct xfrm_state *x)
 		esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
 		if (unlikely(esp->conf.ivec == NULL))
 			goto error;
-		get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+		esp->conf.ivinitted = 0;
 	}
 	if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len))
 		goto error;
-- 
cgit v1.2.3-70-g09d2


From 75bff8f023e02b045a8f68f36fa7da98dca124b8 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 21 Aug 2006 19:22:01 +0900
Subject: [IPV6] ROUTE: Routing by FWMARK.

Based on patch by Jean Lorchat <lorchat@sfc.wide.ad.jp>.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/linux/fib_rules.h |  2 +-
 include/net/flow.h        |  2 ++
 net/ipv6/Kconfig          |  7 +++++++
 net/ipv6/fib6_rules.c     | 23 +++++++++++++++++++++++
 net/ipv6/route.c          |  1 +
 5 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 19a82b6c1c1..2987549d604 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -34,7 +34,7 @@ enum
 	FRA_UNUSED3,
 	FRA_UNUSED4,
 	FRA_UNUSED5,
-	FRA_FWMARK,	/* netfilter mark (IPv4) */
+	FRA_FWMARK,	/* netfilter mark (IPv4/IPv6) */
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_UNUSED7,
diff --git a/include/net/flow.h b/include/net/flow.h
index e0522914316..3ca210ec137 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -26,6 +26,7 @@ struct flowi {
 		struct {
 			struct in6_addr		daddr;
 			struct in6_addr		saddr;
+			__u32			fwmark;
 			__u32			flowlabel;
 		} ip6_u;
 
@@ -42,6 +43,7 @@ struct flowi {
 #define fld_scope	nl_u.dn_u.scope
 #define fl6_dst		nl_u.ip6_u.daddr
 #define fl6_src		nl_u.ip6_u.saddr
+#define fl6_fwmark	nl_u.ip6_u.fwmark
 #define fl6_flowlabel	nl_u.ip6_u.flowlabel
 #define fl4_dst		nl_u.ip4_u.daddr
 #define fl4_src		nl_u.ip4_u.saddr
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 21e0cc808f4..a2d211da2ab 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -173,3 +173,10 @@ config IPV6_MULTIPLE_TABLES
 	---help---
 	  Support multiple routing tables.
 
+config IPV6_ROUTE_FWMARK
+	bool "IPv6: use netfilter MARK value as routing key"
+	depends on IPV6_MULTIPLE_TABLES && NETFILTER
+	---help---
+	  If you say Y here, you will be able to specify different routes for
+	  packets with different mark values (see iptables(8), MARK target).
+
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 91f6233d8ef..aebd9e2b85a 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -26,6 +26,9 @@ struct fib6_rule
 	struct fib_rule		common;
 	struct rt6key		src;
 	struct rt6key		dst;
+#ifdef CONFIG_IPV6_ROUTE_FWMARK
+	u8			fwmark;
+#endif
 	u8			tclass;
 };
 
@@ -124,6 +127,11 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 	if (r->tclass && r->tclass != ((ntohl(fl->fl6_flowlabel) >> 20) & 0xff))
 		return 0;
 
+#ifdef CONFIG_IPV6_ROUTE_FWMARK
+	if (r->fwmark && (r->fwmark != fl->fl6_fwmark))
+		return 0;
+#endif
+
 	return 1;
 }
 
@@ -164,6 +172,11 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 		nla_memcpy(&rule6->dst.addr, tb[FRA_DST],
 			   sizeof(struct in6_addr));
 
+#ifdef CONFIG_IPV6_ROUTE_FWMARK
+	if (tb[FRA_FWMARK])
+		rule6->fwmark = nla_get_u32(tb[FRA_FWMARK]);
+#endif
+
 	rule6->src.plen = frh->src_len;
 	rule6->dst.plen = frh->dst_len;
 	rule6->tclass = frh->tos;
@@ -195,6 +208,11 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 	    nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr)))
 		return 0;
 
+#ifdef CONFIG_IPV6_ROUTE_FWMARK
+	if (tb[FRA_FWMARK] && (rule6->fwmark != nla_get_u32(tb[FRA_FWMARK])))
+		return 0;
+#endif
+
 	return 1;
 }
 
@@ -216,6 +234,11 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 		NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr),
 			&rule6->src.addr);
 
+#ifdef CONFIG_IPV6_ROUTE_FWMARK
+	if (rule6->fwmark)
+		NLA_PUT_U32(skb, FRA_FWMARK, rule6->fwmark);
+#endif
+
 	return 0;
 
 nla_put_failure:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 20691285aee..649350bd929 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -703,6 +703,7 @@ void ip6_route_input(struct sk_buff *skb)
 			.ip6_u = {
 				.daddr = iph->daddr,
 				.saddr = iph->saddr,
+				.fwmark = skb->nfmark,
 				.flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
 			},
 		},
-- 
cgit v1.2.3-70-g09d2


From 1aaec67f9335a17856dfacdd3e5cc6f4c18faeec Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Sun, 25 Jun 2006 23:54:55 +0900
Subject: [NET]: Add common helper functions to convert IPv6/IPv4 address
 string to network address structure.

These helpers can be used in netfilter, cifs etc.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/linux/inet.h |   2 +
 net/core/utils.c     | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 217 insertions(+)

(limited to 'include')

diff --git a/include/linux/inet.h b/include/linux/inet.h
index 6c5587af118..b7c6da7d6d3 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -46,5 +46,7 @@
 #include <linux/types.h>
 
 extern __be32 in_aton(const char *str);
+extern int in4_pton(const char *src, int srclen, u8 *dst, char delim, const char **end);
+extern int in6_pton(const char *src, int srclen, u8 *dst, char delim, const char **end);
 #endif
 #endif	/* _LINUX_INET_H */
diff --git a/net/core/utils.c b/net/core/utils.c
index e31c90e0559..5a06e8a72c1 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -4,6 +4,7 @@
  *	Authors:
  *	net_random Alan Cox
  *	net_ratelimit Andy Kleen
+ *	in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
  *
  *	Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  *
@@ -191,3 +192,217 @@ __be32 in_aton(const char *str)
 }
 
 EXPORT_SYMBOL(in_aton);
+
+#define IN6PTON_XDIGIT		0x00010000
+#define IN6PTON_DIGIT		0x00020000
+#define IN6PTON_COLON_MASK	0x00700000
+#define IN6PTON_COLON_1		0x00100000	/* single : requested */
+#define IN6PTON_COLON_2		0x00200000	/* second : requested */
+#define IN6PTON_COLON_1_2	0x00400000	/* :: requested */
+#define IN6PTON_DOT		0x00800000	/* . */
+#define IN6PTON_DELIM		0x10000000
+#define IN6PTON_NULL		0x20000000	/* first/tail */
+#define IN6PTON_UNKNOWN		0x40000000
+
+static inline int digit2bin(char c, char delim)
+{
+	if (c == delim || c == '\0')
+		return IN6PTON_DELIM;
+	if (c == '.')
+		return IN6PTON_DOT;
+	if (c >= '0' && c <= '9')
+		return (IN6PTON_DIGIT | (c - '0'));
+	return IN6PTON_UNKNOWN;
+}
+
+static inline int xdigit2bin(char c, char delim)
+{
+	if (c == delim || c == '\0')
+		return IN6PTON_DELIM;
+	if (c == ':')
+		return IN6PTON_COLON_MASK;
+	if (c == '.')
+		return IN6PTON_DOT;
+	if (c >= '0' && c <= '9')
+		return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0'));
+	if (c >= 'a' && c <= 'f')
+		return (IN6PTON_XDIGIT | (c - 'a' + 10));
+	if (c >= 'A' && c <= 'F')
+		return (IN6PTON_XDIGIT | (c - 'A' + 10));
+	return IN6PTON_UNKNOWN;
+}
+
+int in4_pton(const char *src, int srclen,
+	     u8 *dst,
+	     char delim, const char **end)
+{
+	const char *s;
+	u8 *d;
+	u8 dbuf[4];
+	int ret = 0;
+	int i;
+	int w = 0;
+
+	if (srclen < 0)
+		srclen = strlen(src);
+	s = src;
+	d = dbuf;
+	i = 0;
+	while(1) {
+		int c;
+		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+		if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM))) {
+			goto out;
+		}
+		if (c & (IN6PTON_DOT | IN6PTON_DELIM)) {
+			if (w == 0)
+				goto out;
+			*d++ = w & 0xff;
+			w = 0;
+			i++;
+			if (c & IN6PTON_DELIM) {
+				if (i != 4)
+					goto out;
+				break;
+			}
+			goto cont;
+		}
+		w = (w * 10) + c;
+		if ((w & 0xffff) > 255) {
+			goto out;
+		}
+cont:
+		if (i >= 4)
+			goto out;
+		s++;
+		srclen--;
+	}
+	ret = 1;
+	memcpy(dst, dbuf, sizeof(dbuf));
+out:
+	if (end)
+		*end = s;
+	return ret;
+}
+
+EXPORT_SYMBOL(in4_pton);
+
+int in6_pton(const char *src, int srclen,
+	     u8 *dst,
+	     char delim, const char **end)
+{
+	const char *s, *tok = NULL;
+	u8 *d, *dc = NULL;
+	u8 dbuf[16];
+	int ret = 0;
+	int i;
+	int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
+	int w = 0;
+
+	memset(dbuf, 0, sizeof(dbuf));
+
+	s = src;
+	d = dbuf;
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	printf("srclen=%d\n", srclen);
+
+	while (1) {
+		int c;
+
+		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
+		if (!(c & state))
+			goto out;
+		if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
+			/* process one 16-bit word */
+			if (!(state & IN6PTON_NULL)) {
+				*d++ = (w >> 8) & 0xff;
+				*d++ = w & 0xff;
+			}
+			w = 0;
+			if (c & IN6PTON_DELIM) {
+				/* We've processed last word */
+				break;
+			}
+			/*
+			 * COLON_1 => XDIGIT
+			 * COLON_2 => XDIGIT|DELIM
+			 * COLON_1_2 => COLON_2
+			 */
+			switch (state & IN6PTON_COLON_MASK) {
+			case IN6PTON_COLON_2:
+				dc = d;
+				state = IN6PTON_XDIGIT | IN6PTON_DELIM;
+				if (dc - dbuf >= sizeof(dbuf))
+					state |= IN6PTON_NULL;
+				break;
+			case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
+				state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
+				break;
+			case IN6PTON_COLON_1:
+				state = IN6PTON_XDIGIT;
+				break;
+			case IN6PTON_COLON_1_2:
+				state = IN6PTON_COLON_2;
+				break;
+			default:
+				state = 0;
+			}
+			tok = s + 1;
+			goto cont;
+		}
+
+		if (c & IN6PTON_DOT) {
+			ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
+			if (ret > 0) {
+				d += 4;
+				break;
+			}
+			goto out;
+		}
+
+		w = (w << 4) | (0xff & c);
+		state = IN6PTON_COLON_1 | IN6PTON_DELIM;
+		if (!(w & 0xf000)) {
+			state |= IN6PTON_XDIGIT;
+		}
+		if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
+			state |= IN6PTON_COLON_1_2;
+			state &= ~IN6PTON_DELIM;
+		}
+		if (d + 2 >= dbuf + sizeof(dbuf)) {
+			state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
+		}
+cont:
+		if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
+		    d + 4 == dbuf + sizeof(dbuf)) {
+			state |= IN6PTON_DOT;
+		}
+		if (d >= dbuf + sizeof(dbuf)) {
+			state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
+		}
+		s++;
+		srclen--;
+	}
+
+	i = 15; d--;
+
+	if (dc) {
+		while(d >= dc)
+			dst[i--] = *d--;
+		while(i >= dc - dbuf)
+			dst[i--] = 0;
+		while(i >= 0)
+			dst[i--] = *d--;
+	} else
+		memcpy(dst, dbuf, sizeof(dbuf));
+
+	ret = 1;
+out:
+	if (end)
+		*end = s;
+	return ret;
+}
+
+EXPORT_SYMBOL(in6_pton);
-- 
cgit v1.2.3-70-g09d2


From bbfb39cbf63829d1db607aa90cbdca557a3a131d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 25 Aug 2006 16:10:14 -0700
Subject: [IPV4]: Add support for fwmark masks in routing rules

Add a FRA_FWMASK attributes for fwmark masks. For compatibility a mask of
0xFFFFFFFF is used when a mark value != 0 is sent without a mask.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fib_rules.h |  3 ++-
 net/ipv4/fib_rules.c      | 21 +++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/fib_rules.h b/include/linux/fib_rules.h
index 2987549d604..4418c8d9d47 100644
--- a/include/linux/fib_rules.h
+++ b/include/linux/fib_rules.h
@@ -34,12 +34,13 @@ enum
 	FRA_UNUSED3,
 	FRA_UNUSED4,
 	FRA_UNUSED5,
-	FRA_FWMARK,	/* netfilter mark (IPv4/IPv6) */
+	FRA_FWMARK,	/* netfilter mark */
 	FRA_FLOW,	/* flow/class id */
 	FRA_UNUSED6,
 	FRA_UNUSED7,
 	FRA_UNUSED8,
 	FRA_TABLE,	/* Extended table id */
+	FRA_FWMASK,	/* mask for netfilter mark */
 	__FRA_MAX
 };
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ce185ac6f26..280f424ca9c 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -46,6 +46,7 @@ struct fib4_rule
 	u32			dstmask;
 #ifdef CONFIG_IP_ROUTE_FWMARK
 	u32			fwmark;
+	u32			fwmask;
 #endif
 #ifdef CONFIG_NET_CLS_ROUTE
 	u32			tclassid;
@@ -160,7 +161,7 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 		return 0;
 
 #ifdef CONFIG_IP_ROUTE_FWMARK
-	if (r->fwmark && (r->fwmark != fl->fl4_fwmark))
+	if ((r->fwmark ^ fl->fl4_fwmark) & r->fwmask)
 		return 0;
 #endif
 
@@ -183,6 +184,7 @@ static struct nla_policy fib4_rule_policy[FRA_MAX+1] __read_mostly = {
 	[FRA_SRC]	= { .type = NLA_U32 },
 	[FRA_DST]	= { .type = NLA_U32 },
 	[FRA_FWMARK]	= { .type = NLA_U32 },
+	[FRA_FWMASK]	= { .type = NLA_U32 },
 	[FRA_FLOW]	= { .type = NLA_U32 },
 	[FRA_TABLE]	= { .type = NLA_U32 },
 };
@@ -219,8 +221,17 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 		rule4->dst = nla_get_u32(tb[FRA_DST]);
 
 #ifdef CONFIG_IP_ROUTE_FWMARK
-	if (tb[FRA_FWMARK])
+	if (tb[FRA_FWMARK]) {
 		rule4->fwmark = nla_get_u32(tb[FRA_FWMARK]);
+		if (rule4->fwmark)
+			/* compatibility: if the mark value is non-zero all bits
+			 * are compared unless a mask is explicitly specified.
+			 */
+			rule4->fwmask = 0xFFFFFFFF;
+	}
+
+	if (tb[FRA_FWMASK])
+		rule4->fwmask = nla_get_u32(tb[FRA_FWMASK]);
 #endif
 
 #ifdef CONFIG_NET_CLS_ROUTE
@@ -256,6 +267,9 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 #ifdef CONFIG_IP_ROUTE_FWMARK
 	if (tb[FRA_FWMARK] && (rule4->fwmark != nla_get_u32(tb[FRA_FWMARK])))
 		return 0;
+
+	if (tb[FRA_FWMASK] && (rule4->fwmask != nla_get_u32(tb[FRA_FWMASK])))
+		return 0;
 #endif
 
 #ifdef CONFIG_NET_CLS_ROUTE
@@ -285,6 +299,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 #ifdef CONFIG_IP_ROUTE_FWMARK
 	if (rule4->fwmark)
 		NLA_PUT_U32(skb, FRA_FWMARK, rule4->fwmark);
+
+	if (rule4->fwmask || rule4->fwmark)
+		NLA_PUT_U32(skb, FRA_FWMASK, rule4->fwmask);
 #endif
 
 	if (rule4->dst_len)
-- 
cgit v1.2.3-70-g09d2


From b4e9b520ca5d07a37ea59648e7f50f478e7487a3 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 25 Aug 2006 16:11:42 -0700
Subject: [NET_SCHED]: Add mask support to fwmark classifier

Support masking the nfmark value before the search. The mask value is
global for all filters contained in one instance. It can only be set
when a new instance is created, all filters must specify the same mask.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h |  1 +
 net/sched/cls_fw.c      | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index bd2c5a2bbbf..c3f01b3085a 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -305,6 +305,7 @@ enum
 	TCA_FW_POLICE,
 	TCA_FW_INDEV, /*  used by CONFIG_NET_CLS_IND */
 	TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */
+	TCA_FW_MASK,
 	__TCA_FW_MAX
 };
 
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index e6973d9b686..e54acc6bccc 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -50,6 +50,7 @@
 struct fw_head
 {
 	struct fw_filter *ht[HTSIZE];
+	u32 mask;
 };
 
 struct fw_filter
@@ -101,7 +102,7 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
 	struct fw_filter *f;
 	int r;
 #ifdef CONFIG_NETFILTER
-	u32 id = skb->nfmark;
+	u32 id = skb->nfmark & head->mask;
 #else
 	u32 id = 0;
 #endif
@@ -209,7 +210,9 @@ static int
 fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
 	struct rtattr **tb, struct rtattr **tca, unsigned long base)
 {
+	struct fw_head *head = (struct fw_head *)tp->root;
 	struct tcf_exts e;
+	u32 mask;
 	int err;
 
 	err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map);
@@ -232,6 +235,15 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
 	}
 #endif /* CONFIG_NET_CLS_IND */
 
+	if (tb[TCA_FW_MASK-1]) {
+		if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != sizeof(u32))
+			goto errout;
+		mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]);
+		if (mask != head->mask)
+			goto errout;
+	} else if (head->mask != 0xFFFFFFFF)
+		goto errout;
+
 	tcf_exts_change(tp, &f->exts, &e);
 
 	return 0;
@@ -267,9 +279,17 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
 		return -EINVAL;
 
 	if (head == NULL) {
+		u32 mask = 0xFFFFFFFF;
+		if (tb[TCA_FW_MASK-1]) {
+			if (RTA_PAYLOAD(tb[TCA_FW_MASK-1]) != sizeof(u32))
+				return -EINVAL;
+			mask = *(u32*)RTA_DATA(tb[TCA_FW_MASK-1]);
+		}
+
 		head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
 		if (head == NULL)
 			return -ENOBUFS;
+		head->mask = mask;
 
 		tcf_tree_lock(tp);
 		tp->root = head;
@@ -330,6 +350,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 static int fw_dump(struct tcf_proto *tp, unsigned long fh,
 		   struct sk_buff *skb, struct tcmsg *t)
 {
+	struct fw_head *head = (struct fw_head *)tp->root;
 	struct fw_filter *f = (struct fw_filter*)fh;
 	unsigned char	 *b = skb->tail;
 	struct rtattr *rta;
@@ -351,6 +372,8 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
 	if (strlen(f->indev))
 		RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev);
 #endif /* CONFIG_NET_CLS_IND */
+	if (head->mask != 0xFFFFFFFF)
+		RTA_PUT(skb, TCA_FW_MASK, 4, &head->mask);
 
 	if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
 		goto rtattr_failure;
-- 
cgit v1.2.3-70-g09d2


From 97e5848dd39e7e76bd6077735ebb5473763ab9c5 Mon Sep 17 00:00:00 2001
From: Ian McDonald <ian.mcdonald@jandi.co.nz>
Date: Sat, 26 Aug 2006 19:16:45 -0700
Subject: [DCCP]: Introduce tx buffering

This adds transmit buffering to DCCP.

I have tested with CCID2/3 and with loss and rate limiting.

Signed off by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h |  2 ++
 net/dccp/dccp.h      |  2 +-
 net/dccp/output.c    | 90 +++++++++++++++++++++++++++++++++++++---------------
 net/dccp/proto.c     | 16 +++-------
 4 files changed, 73 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 676333b9fad..2d7671c92c0 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -438,6 +438,7 @@ struct dccp_ackvec;
  * @dccps_role - Role of this sock, one of %dccp_role
  * @dccps_ndp_count - number of Non Data Packets since last data packet
  * @dccps_hc_rx_ackvec - rx half connection ack vector
+ * @dccps_xmit_timer - timer for when CCID is not ready to send
  */
 struct dccp_sock {
 	/* inet_connection_sock has to be the first member of dccp_sock */
@@ -470,6 +471,7 @@ struct dccp_sock {
 	enum dccp_role			dccps_role:2;
 	__u8				dccps_hc_rx_insert_options:1;
 	__u8				dccps_hc_tx_insert_options:1;
+	struct timer_list		dccps_xmit_timer;
 };
  
 static inline struct dccp_sock *dccp_sk(const struct sock *sk)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index a5c5475724c..0a21be437ed 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -130,7 +130,7 @@ extern void dccp_send_delayed_ack(struct sock *sk);
 extern void dccp_send_sync(struct sock *sk, const u64 seq,
 			   const enum dccp_pkt_type pkt_type);
 
-extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
+extern void dccp_write_xmit(struct sock *sk, int block);
 extern void dccp_write_space(struct sock *sk);
 
 extern void dccp_init_xmit_timers(struct sock *sk);
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 58669beee13..7102e3aed4c 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -198,7 +198,7 @@ static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
 	while (1) {
 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 
-		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		if (sk->sk_err)
 			goto do_error;
 		if (!*timeo)
 			goto do_nonblock;
@@ -234,37 +234,72 @@ do_interrupted:
 	goto out;
 }
 
-int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
+static void dccp_write_xmit_timer(unsigned long data) {
+	struct sock *sk = (struct sock *)data;
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1);
+	else
+		dccp_write_xmit(sk, 0);
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+void dccp_write_xmit(struct sock *sk, int block)
 {
-	const struct dccp_sock *dp = dccp_sk(sk);
-	int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
+	struct dccp_sock *dp = dccp_sk(sk);
+	struct sk_buff *skb;
+	long timeo = 30000; 	/* If a packet is taking longer than 2 secs
+				   we have other issues */
+
+	while ((skb = skb_peek(&sk->sk_write_queue))) {
+		int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
 					 skb->len);
 
-	if (err > 0)
-		err = dccp_wait_for_ccid(sk, skb, timeo);
+		if (err > 0) {
+			if (!block) {
+				sk_reset_timer(sk, &dp->dccps_xmit_timer,
+						msecs_to_jiffies(err)+jiffies);
+				break;
+			} else
+				err = dccp_wait_for_ccid(sk, skb, &timeo);
+			if (err) {
+				printk(KERN_CRIT "%s:err at dccp_wait_for_ccid"
+						 " %d\n", __FUNCTION__, err);
+				dump_stack();
+			}
+		}
 
-	if (err == 0) {
-		struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-		const int len = skb->len;
+		skb_dequeue(&sk->sk_write_queue);
+		if (err == 0) {
+			struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+			const int len = skb->len;
 
-		if (sk->sk_state == DCCP_PARTOPEN) {
-			/* See 8.1.5.  Handshake Completion */
-			inet_csk_schedule_ack(sk);
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+			if (sk->sk_state == DCCP_PARTOPEN) {
+				/* See 8.1.5.  Handshake Completion */
+				inet_csk_schedule_ack(sk);
+				inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
 						  inet_csk(sk)->icsk_rto,
 						  DCCP_RTO_MAX);
-			dcb->dccpd_type = DCCP_PKT_DATAACK;
-		} else if (dccp_ack_pending(sk))
-			dcb->dccpd_type = DCCP_PKT_DATAACK;
-		else
-			dcb->dccpd_type = DCCP_PKT_DATA;
-
-		err = dccp_transmit_skb(sk, skb);
-		ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
-	} else
-		kfree_skb(skb);
-
-	return err;
+				dcb->dccpd_type = DCCP_PKT_DATAACK;
+			} else if (dccp_ack_pending(sk))
+				dcb->dccpd_type = DCCP_PKT_DATAACK;
+			else
+				dcb->dccpd_type = DCCP_PKT_DATA;
+
+			err = dccp_transmit_skb(sk, skb);
+			ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
+			if (err) {
+				printk(KERN_CRIT "%s:err from "
+					         "ccid_hc_tx_packet_sent %d\n",
+					         __FUNCTION__, err);
+				dump_stack();
+			}
+		} else
+			kfree(skb);
+	}
 }
 
 int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
@@ -426,6 +461,9 @@ static inline void dccp_connect_init(struct sock *sk)
 	dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
 
 	icsk->icsk_retransmits = 0;
+	init_timer(&dp->dccps_xmit_timer);
+	dp->dccps_xmit_timer.data = (unsigned long)sk;
+	dp->dccps_xmit_timer.function = dccp_write_xmit_timer;
 }
 
 int dccp_connect(struct sock *sk)
@@ -560,8 +598,10 @@ void dccp_send_close(struct sock *sk, const int active)
 					DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
 
 	if (active) {
+		dccp_write_xmit(sk, 1);
 		dccp_skb_entail(sk, skb);
 		dccp_transmit_skb(sk, skb_clone(skb, prio));
+		/* FIXME do we need a retransmit timer here? */
 	} else
 		dccp_transmit_skb(sk, skb);
 }
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 6f14bb5a28d..962df0ea31a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -662,17 +662,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (rc != 0)
 		goto out_discard;
 
-	rc = dccp_write_xmit(sk, skb, &timeo);
-	/*
-	 * XXX we don't use sk_write_queue, so just discard the packet.
-	 *     Current plan however is to _use_ sk_write_queue with
-	 *     an algorith similar to tcp_sendmsg, where the main difference
-	 *     is that in DCCP we have to respect packet boundaries, so
-	 *     no coalescing of skbs.
-	 *
-	 *     This bug was _quickly_ found & fixed by just looking at an OSTRA
-	 *     generated callgraph 8) -acme
-	 */
+	skb_queue_tail(&sk->sk_write_queue, skb);
+	dccp_write_xmit(sk,0);
 out_release:
 	release_sock(sk);
 	return rc ? : len;
@@ -846,6 +837,7 @@ static int dccp_close_state(struct sock *sk)
 
 void dccp_close(struct sock *sk, long timeout)
 {
+	struct dccp_sock *dp = dccp_sk(sk);
 	struct sk_buff *skb;
 	int state;
 
@@ -862,6 +854,8 @@ void dccp_close(struct sock *sk, long timeout)
 		goto adjudge_to_death;
 	}
 
+	sk_stop_timer(sk, &dp->dccps_xmit_timer);
+
 	/*
 	 * We need to flush the recv. buffs.  We do this only on the
 	 * descriptor close, not protocol-sourced closes, because the
-- 
cgit v1.2.3-70-g09d2


From ff5dfe736dd9f6c74b206aa77c0465dfd503bdb9 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Sat, 26 Aug 2006 19:17:53 -0700
Subject: [NETLINK]: remove third bogus argument from NLA_PUT_FLAG

This patch removes the 'value' argument from NLA_PUT_FLAG which is
unused anyway. The documentation comment was already correct so it
doesn't need an update :)

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 47044da167c..bcb27e3a312 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -828,7 +828,7 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
 #define NLA_PUT_STRING(skb, attrtype, value) \
 	NLA_PUT(skb, attrtype, strlen(value) + 1, value)
 
-#define NLA_PUT_FLAG(skb, attrtype, value) \
+#define NLA_PUT_FLAG(skb, attrtype) \
 	NLA_PUT(skb, attrtype, 0, NULL)
 
 #define NLA_PUT_MSECS(skb, attrtype, jiffies) \
-- 
cgit v1.2.3-70-g09d2


From 25030a7f9eeab2dcefff036469e0e2b4f956198f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sat, 26 Aug 2006 20:06:05 -0700
Subject: [UDP]: Unify UDPv4 and UDPv6 ->get_port()

This patch creates one common function which is called by
udp_v4_get_port() and udp_v6_get_port(). As a result,
  * duplicated code is removed
  * udp_port_rover and local port lookup can now be removed from udp.h
  * further savings follow since the same function will be used by UDP-Litev4
    and UDP-Litev6

In contrast to the patch sent in response to Yoshifujis comments
(fixed by this variant), the code below also removes the
EXPORT_SYMBOL(udp_port_rover), since udp_port_rover can now remain
local to net/ipv4/udp.c.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 18 ++---------
 net/ipv4/udp.c    | 96 ++++++++++++++++++++++++++++++++++---------------------
 net/ipv6/udp.c    | 76 ++-----------------------------------------
 3 files changed, 64 insertions(+), 126 deletions(-)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index 766fba1369c..c490a0f662a 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -30,25 +30,9 @@
 
 #define UDP_HTABLE_SIZE		128
 
-/* udp.c: This needs to be shared by v4 and v6 because the lookup
- *        and hashing code needs to work with different AF's yet
- *        the port space is shared.
- */
 extern struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 extern rwlock_t udp_hash_lock;
 
-extern int udp_port_rover;
-
-static inline int udp_lport_inuse(u16 num)
-{
-	struct sock *sk;
-	struct hlist_node *node;
-
-	sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)])
-		if (inet_sk(sk)->num == num)
-			return 1;
-	return 0;
-}
 
 /* Note: this must match 'valbool' in sock_setsockopt */
 #define UDP_CSUM_NOXMIT		1
@@ -63,6 +47,8 @@ extern struct proto udp_prot;
 
 struct sk_buff;
 
+extern int	udp_get_port(struct sock *sk, unsigned short snum,
+			     int (*saddr_cmp)(struct sock *, struct sock *));
 extern void	udp_err(struct sk_buff *, u32);
 
 extern int	udp_sendmsg(struct kiocb *iocb, struct sock *sk,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 514c1e9ae81..7552b50bcd8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -118,14 +118,34 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly;
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);
 
-/* Shared by v4/v6 udp. */
+/* Shared by v4/v6 udp_get_port */
 int udp_port_rover;
 
-static int udp_v4_get_port(struct sock *sk, unsigned short snum)
+static inline int udp_lport_inuse(u16 num)
 {
+	struct sock *sk;
 	struct hlist_node *node;
+
+	sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)])
+		if (inet_sk(sk)->num == num)
+			return 1;
+	return 0;
+}
+
+/**
+ *  udp_get_port  -  common port lookup for IPv4 and IPv6
+ *
+ *  @sk:          socket struct in question
+ *  @snum:        port number to look up
+ *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
+ */
+int udp_get_port(struct sock *sk, unsigned short snum,
+		 int (*saddr_cmp)(struct sock *sk1, struct sock *sk2))
+{
+	struct hlist_node *node;
+	struct hlist_head *head;
 	struct sock *sk2;
-	struct inet_sock *inet = inet_sk(sk);
+	int    error = 1;
 
 	write_lock_bh(&udp_hash_lock);
 	if (snum == 0) {
@@ -137,11 +157,10 @@ static int udp_v4_get_port(struct sock *sk, unsigned short snum)
 		best_size_so_far = 32767;
 		best = result = udp_port_rover;
 		for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
-			struct hlist_head *list;
 			int size;
 
-			list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
-			if (hlist_empty(list)) {
+			head = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
+			if (hlist_empty(head)) {
 				if (result > sysctl_local_port_range[1])
 					result = sysctl_local_port_range[0] +
 						((result - sysctl_local_port_range[0]) &
@@ -149,12 +168,11 @@ static int udp_v4_get_port(struct sock *sk, unsigned short snum)
 				goto gotit;
 			}
 			size = 0;
-			sk_for_each(sk2, node, list)
-				if (++size >= best_size_so_far)
-					goto next;
-			best_size_so_far = size;
-			best = result;
-		next:;
+			sk_for_each(sk2, node, head)
+				if (++size < best_size_so_far) {
+					best_size_so_far = size;
+					best = result;
+				}
 		}
 		result = best;
 		for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
@@ -170,38 +188,44 @@ static int udp_v4_get_port(struct sock *sk, unsigned short snum)
 gotit:
 		udp_port_rover = snum = result;
 	} else {
-		sk_for_each(sk2, node,
-			    &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
-			struct inet_sock *inet2 = inet_sk(sk2);
-
-			if (inet2->num == snum &&
-			    sk2 != sk &&
-			    !ipv6_only_sock(sk2) &&
-			    (!sk2->sk_bound_dev_if ||
-			     !sk->sk_bound_dev_if ||
-			     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-			    (!inet2->rcv_saddr ||
-			     !inet->rcv_saddr ||
-			     inet2->rcv_saddr == inet->rcv_saddr) &&
-			    (!sk2->sk_reuse || !sk->sk_reuse))
+		head = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
+
+		sk_for_each(sk2, node, head)
+			if (inet_sk(sk2)->num == snum                        &&
+			    sk2 != sk                                        &&
+			    (!sk2->sk_reuse        || !sk->sk_reuse)         &&
+			    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
+			     || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+			    (*saddr_cmp)(sk, sk2)                              )
 				goto fail;
-		}
 	}
-	inet->num = snum;
+	inet_sk(sk)->num = snum;
 	if (sk_unhashed(sk)) {
-		struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
-
-		sk_add_node(sk, h);
+		head = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
+		sk_add_node(sk, head);
 		sock_prot_inc_use(sk->sk_prot);
 	}
-	write_unlock_bh(&udp_hash_lock);
-	return 0;
-
+	error = 0;
 fail:
 	write_unlock_bh(&udp_hash_lock);
-	return 1;
+	return error;
+}
+
+static inline int  ipv4_rcv_saddr_equal(struct sock *sk1, struct sock *sk2)
+{
+	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
+
+	return 	( !ipv6_only_sock(sk2)  &&
+		  (!inet1->rcv_saddr || !inet2->rcv_saddr ||
+		   inet1->rcv_saddr == inet2->rcv_saddr      ));
+}
+
+static inline int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+	return udp_get_port(sk, snum, ipv4_rcv_saddr_equal);
 }
 
+
 static void udp_v4_hash(struct sock *sk)
 {
 	BUG();
@@ -1596,7 +1620,7 @@ EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
 EXPORT_SYMBOL(udp_ioctl);
-EXPORT_SYMBOL(udp_port_rover);
+EXPORT_SYMBOL(udp_get_port);
 EXPORT_SYMBOL(udp_prot);
 EXPORT_SYMBOL(udp_sendmsg);
 EXPORT_SYMBOL(udp_poll);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b9cc55ccb00..9662561701d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -61,81 +61,9 @@
 
 DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6) __read_mostly;
 
-/* Grrr, addr_type already calculated by caller, but I don't want
- * to add some silly "cookie" argument to this method just for that.
- */
-static int udp_v6_get_port(struct sock *sk, unsigned short snum)
+static inline int udp_v6_get_port(struct sock *sk, unsigned short snum)
 {
-	struct sock *sk2;
-	struct hlist_node *node;
-
-	write_lock_bh(&udp_hash_lock);
-	if (snum == 0) {
-		int best_size_so_far, best, result, i;
-
-		if (udp_port_rover > sysctl_local_port_range[1] ||
-		    udp_port_rover < sysctl_local_port_range[0])
-			udp_port_rover = sysctl_local_port_range[0];
-		best_size_so_far = 32767;
-		best = result = udp_port_rover;
-		for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
-			int size;
-			struct hlist_head *list;
-
-			list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
-			if (hlist_empty(list)) {
-				if (result > sysctl_local_port_range[1])
-					result = sysctl_local_port_range[0] +
-						((result - sysctl_local_port_range[0]) &
-						 (UDP_HTABLE_SIZE - 1));
-				goto gotit;
-			}
-			size = 0;
-			sk_for_each(sk2, node, list)
-				if (++size >= best_size_so_far)
-					goto next;
-			best_size_so_far = size;
-			best = result;
-		next:;
-		}
-		result = best;
-		for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
-			if (result > sysctl_local_port_range[1])
-				result = sysctl_local_port_range[0]
-					+ ((result - sysctl_local_port_range[0]) &
-					   (UDP_HTABLE_SIZE - 1));
-			if (!udp_lport_inuse(result))
-				break;
-		}
-		if (i >= (1 << 16) / UDP_HTABLE_SIZE)
-			goto fail;
-gotit:
-		udp_port_rover = snum = result;
-	} else {
-		sk_for_each(sk2, node,
-			    &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
-			if (inet_sk(sk2)->num == snum &&
-			    sk2 != sk &&
-			    (!sk2->sk_bound_dev_if ||
-			     !sk->sk_bound_dev_if ||
-			     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-			    (!sk2->sk_reuse || !sk->sk_reuse) &&
-			    ipv6_rcv_saddr_equal(sk, sk2))
-				goto fail;
-		}
-	}
-
-	inet_sk(sk)->num = snum;
-	if (sk_unhashed(sk)) {
-		sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]);
-		sock_prot_inc_use(sk->sk_prot);
-	}
-	write_unlock_bh(&udp_hash_lock);
-	return 0;
-
-fail:
-	write_unlock_bh(&udp_hash_lock);
-	return 1;
+	return udp_get_port(sk, snum, ipv6_rcv_saddr_equal);
 }
 
 static void udp_v6_hash(struct sock *sk)
-- 
cgit v1.2.3-70-g09d2


From e3b4eadbea77ecb3c3a74d1bc81b392f454c7f2e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sat, 26 Aug 2006 20:10:15 -0700
Subject: [UDP]: saddr_cmp function should take const socket pointers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This also kills a warning while building ipv6:

net/ipv6/udp.c: In function ‘udp_v6_get_port’:
net/ipv6/udp.c:66: warning: passing argument 3 of ‘udp_get_port’ from incompatible pointer type

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 2 +-
 net/ipv4/udp.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index c490a0f662a..db0c05f6754 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -48,7 +48,7 @@ extern struct proto udp_prot;
 struct sk_buff;
 
 extern int	udp_get_port(struct sock *sk, unsigned short snum,
-			     int (*saddr_cmp)(struct sock *, struct sock *));
+			     int (*saddr_cmp)(const struct sock *, const struct sock *));
 extern void	udp_err(struct sk_buff *, u32);
 
 extern int	udp_sendmsg(struct kiocb *iocb, struct sock *sk,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index aa1823050b0..77e265d7bb8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -139,7 +139,7 @@ static inline int udp_lport_inuse(u16 num)
  *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
  */
 int udp_get_port(struct sock *sk, unsigned short snum,
-		 int (*saddr_cmp)(struct sock *sk1, struct sock *sk2))
+		 int (*saddr_cmp)(const struct sock *sk1, const struct sock *sk2))
 {
 	struct hlist_node *node;
 	struct hlist_head *head;
@@ -210,7 +210,7 @@ fail:
 	return error;
 }
 
-static inline int  ipv4_rcv_saddr_equal(struct sock *sk1, struct sock *sk2)
+static inline int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 {
 	struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
-- 
cgit v1.2.3-70-g09d2


From a5531a5d852008be40811496029012f4ad3093d1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 26 Aug 2006 20:11:47 -0700
Subject: [NETLINK]: Improve string attribute validation

Introduces a new attribute type NLA_NUL_STRING to support NUL
terminated strings. Attributes of this kind require to carry
a terminating NUL within the maximum specified in the policy.

The `old' NLA_STRING which is not required to be NUL terminated
is extended to provide means to specify a maximum length of the
string.

Aims at easing the pain with using nla_strlcpy() on temporary
buffers.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 15 +++++++++++----
 net/netlink/attr.c    | 49 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index bcb27e3a312..11dc2e7f679 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -167,6 +167,7 @@ enum {
 	NLA_FLAG,
 	NLA_MSECS,
 	NLA_NESTED,
+	NLA_NUL_STRING,
 	__NLA_TYPE_MAX,
 };
 
@@ -175,21 +176,27 @@ enum {
 /**
  * struct nla_policy - attribute validation policy
  * @type: Type of attribute or NLA_UNSPEC
- * @minlen: Minimal length of payload required to be available
+ * @len: Type specific length of payload
  *
  * Policies are defined as arrays of this struct, the array must be
  * accessible by attribute type up to the highest identifier to be expected.
  *
+ * Meaning of `len' field:
+ *    NLA_STRING           Maximum length of string
+ *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
+ *    NLA_FLAG             Unused
+ *    All other            Exact length of attribute payload
+ *
  * Example:
  * static struct nla_policy my_policy[ATTR_MAX+1] __read_mostly = {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
- *	[ATTR_BAR] = { .type = NLA_STRING },
- *	[ATTR_BAZ] = { .minlen = sizeof(struct mystruct) },
+ *	[ATTR_BAR] = { .type = NLA_STRING, len = BARSIZ },
+ *	[ATTR_BAZ] = { .len = sizeof(struct mystruct) },
  * };
  */
 struct nla_policy {
 	u16		type;
-	u16		minlen;
+	u16		len;
 };
 
 /**
diff --git a/net/netlink/attr.c b/net/netlink/attr.c
index 136e529e578..004139557e0 100644
--- a/net/netlink/attr.c
+++ b/net/netlink/attr.c
@@ -20,7 +20,6 @@ static u16 nla_attr_minlen[NLA_TYPE_MAX+1] __read_mostly = {
 	[NLA_U16]	= sizeof(u16),
 	[NLA_U32]	= sizeof(u32),
 	[NLA_U64]	= sizeof(u64),
-	[NLA_STRING]	= 1,
 	[NLA_NESTED]	= NLA_HDRLEN,
 };
 
@@ -28,7 +27,7 @@ static int validate_nla(struct nlattr *nla, int maxtype,
 			struct nla_policy *policy)
 {
 	struct nla_policy *pt;
-	int minlen = 0;
+	int minlen = 0, attrlen = nla_len(nla);
 
 	if (nla->nla_type <= 0 || nla->nla_type > maxtype)
 		return 0;
@@ -37,16 +36,46 @@ static int validate_nla(struct nlattr *nla, int maxtype,
 
 	BUG_ON(pt->type > NLA_TYPE_MAX);
 
-	if (pt->minlen)
-		minlen = pt->minlen;
-	else if (pt->type != NLA_UNSPEC)
-		minlen = nla_attr_minlen[pt->type];
+	switch (pt->type) {
+	case NLA_FLAG:
+		if (attrlen > 0)
+			return -ERANGE;
+		break;
 
-	if (pt->type == NLA_FLAG && nla_len(nla) > 0)
-		return -ERANGE;
+	case NLA_NUL_STRING:
+		if (pt->len)
+			minlen = min_t(int, attrlen, pt->len + 1);
+		else
+			minlen = attrlen;
 
-	if (nla_len(nla) < minlen)
-		return -ERANGE;
+		if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL)
+			return -EINVAL;
+		/* fall through */
+
+	case NLA_STRING:
+		if (attrlen < 1)
+			return -ERANGE;
+
+		if (pt->len) {
+			char *buf = nla_data(nla);
+
+			if (buf[attrlen - 1] == '\0')
+				attrlen--;
+
+			if (attrlen > pt->len)
+				return -ERANGE;
+		}
+		break;
+
+	default:
+		if (pt->len)
+			minlen = pt->len;
+		else if (pt->type != NLA_UNSPEC)
+			minlen = nla_attr_minlen[pt->type];
+
+		if (attrlen < minlen)
+			return -ERANGE;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From def42ff4dd6f54ebcf78192579a8ff1f81d8e2e8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 28 Aug 2006 23:57:56 -0700
Subject: [IPV4]: Make struct in_addr::s_addr __be32

There will be relatively small increase in sparse endian warnings, but
this (and sin_port) patch is a first step to make networking code
endian clean.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/in.h b/include/linux/in.h
index 94f557fa463..9a9d5dd32e7 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -52,7 +52,7 @@ enum {
 
 /* Internet address. */
 struct in_addr {
-	__u32	s_addr;
+	__be32	s_addr;
 };
 
 #define IP_TOS		1
-- 
cgit v1.2.3-70-g09d2


From cd360007a0eb8cbf17c006cca42aa884d33f96be Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 28 Aug 2006 23:58:32 -0700
Subject: [IPV4]: Make struct sockaddr_in::sin_port __be16

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/in.h b/include/linux/in.h
index 9a9d5dd32e7..bcaca8399ae 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -177,7 +177,7 @@ struct in_pktinfo
 #define __SOCK_SIZE__	16		/* sizeof(struct sockaddr)	*/
 struct sockaddr_in {
   sa_family_t		sin_family;	/* Address family		*/
-  unsigned short int	sin_port;	/* Port number			*/
+  __be16		sin_port;	/* Port number			*/
   struct in_addr	sin_addr;	/* Internet address		*/
 
   /* Pad to size of `struct sockaddr'. */
-- 
cgit v1.2.3-70-g09d2


From 07317621d004e8e6967f2dac8562825267e56135 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 29 Aug 2006 17:48:17 -0700
Subject: [NETFILTER] bridge: code rearrangement for clarity

Cleanup and rearrangement for better style and clarity:
	Split the function nf_bridge_maybe_copy_header into two pieces
	Move copy portion out of line.
	Use Ethernet header size macros.
	Use header file to handle CONFIG_NETFILTER_BRIDGE differences

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_bridge.h | 26 +++++++-------------------
 net/bridge/br_forward.c          |  5 +----
 net/bridge/br_netfilter.c        | 27 +++++++++++++++++++++++++--
 3 files changed, 33 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index 427c67ff89e..274fe4b3315 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -47,26 +47,12 @@ enum nf_br_hook_priorities {
 
 
 /* Only used in br_forward.c */
-static inline
-int nf_bridge_maybe_copy_header(struct sk_buff *skb)
+extern int nf_bridge_copy_header(struct sk_buff *skb);
+static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb)
 {
-	int err;
-
-	if (skb->nf_bridge) {
-		if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
-			err = skb_cow(skb, 18);
-			if (err)
-				return err;
-			memcpy(skb->data - 18, skb->nf_bridge->data, 18);
-			skb_push(skb, 4);
-		} else {
-			err = skb_cow(skb, 16);
-			if (err)
-				return err;
-			memcpy(skb->data - 16, skb->nf_bridge->data, 16);
-		}
-	}
-	return 0;
+	if (skb->nf_bridge)
+		return nf_bridge_copy_header(skb);
+  	return 0;
 }
 
 /* This is called by the IP fragmenting code and it ensures there is
@@ -90,6 +76,8 @@ struct bridge_skb_cb {
 };
 
 extern int brnf_deferred_hooks;
+#else
+#define nf_bridge_maybe_copy_header(skb)	(0)
 #endif /* CONFIG_BRIDGE_NETFILTER */
 
 #endif /* __KERNEL__ */
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 864fbbc7b24..191b861e5e5 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -38,13 +38,10 @@ int br_dev_queue_push_xmit(struct sk_buff *skb)
 	if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
 		kfree_skb(skb);
 	else {
-#ifdef CONFIG_BRIDGE_NETFILTER
 		/* ip_refrag calls ip_fragment, doesn't copy the MAC header. */
 		if (nf_bridge_maybe_copy_header(skb))
 			kfree_skb(skb);
-		else
-#endif
-		{
+		else {
 			skb_push(skb, ETH_HLEN);
 
 			dev_queue_xmit(skb);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 05b3de88824..b498efcfe45 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -127,14 +127,37 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
 
 static inline void nf_bridge_save_header(struct sk_buff *skb)
 {
-        int header_size = 16;
+        int header_size = ETH_HLEN;
 
 	if (skb->protocol == htons(ETH_P_8021Q))
-		header_size = 18;
+		header_size += VLAN_HLEN;
 
 	memcpy(skb->nf_bridge->data, skb->data - header_size, header_size);
 }
 
+/*
+ * When forwarding bridge frames, we save a copy of the original
+ * header before processing.
+ */
+int nf_bridge_copy_header(struct sk_buff *skb)
+{
+	int err;
+        int header_size = ETH_HLEN;
+
+	if (skb->protocol == htons(ETH_P_8021Q))
+		header_size += VLAN_HLEN;
+
+	err = skb_cow(skb, header_size);
+	if (err)
+		return err;
+
+	memcpy(skb->data - header_size, skb->nf_bridge->data, header_size);
+
+	if (skb->protocol == htons(ETH_P_8021Q))
+		__skb_push(skb, VLAN_HLEN);
+	return 0;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
-- 
cgit v1.2.3-70-g09d2


From 9bcfcaf5e9cc887eb39236e43bdbe4b4b2572229 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 29 Aug 2006 17:48:57 -0700
Subject: [NETFILTER] bridge: simplify nf_bridge_pad

Do some simple optimization on the nf_bridge_pad() function
and don't use magic constants. Eliminate a double call and
the #ifdef'd code for CONFIG_BRIDGE_NETFILTER.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_bridge.h | 16 +++++-----------
 net/ipv4/ip_output.c             | 15 +++++++--------
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index 274fe4b3315..9a4dd11af86 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -5,9 +5,8 @@
  */
 
 #include <linux/netfilter.h>
-#if defined(__KERNEL__) && defined(CONFIG_BRIDGE_NETFILTER)
 #include <linux/if_ether.h>
-#endif
+#include <linux/if_vlan.h>
 
 /* Bridge Hooks */
 /* After promisc drops, checksum checks. */
@@ -57,16 +56,10 @@ static inline int nf_bridge_maybe_copy_header(struct sk_buff *skb)
 
 /* This is called by the IP fragmenting code and it ensures there is
  * enough room for the encapsulating header (if there is one). */
-static inline
-int nf_bridge_pad(struct sk_buff *skb)
+static inline int nf_bridge_pad(const struct sk_buff *skb)
 {
-	if (skb->protocol == __constant_htons(ETH_P_IP))
-		return 0;
-	if (skb->nf_bridge) {
-		if (skb->protocol == __constant_htons(ETH_P_8021Q))
-			return 4;
-	}
-	return 0;
+ 	return (skb->nf_bridge && skb->protocol == htons(ETH_P_8021Q))
+		? VLAN_HLEN : 0;
 }
 
 struct bridge_skb_cb {
@@ -78,6 +71,7 @@ struct bridge_skb_cb {
 extern int brnf_deferred_hooks;
 #else
 #define nf_bridge_maybe_copy_header(skb)	(0)
+#define nf_bridge_pad(skb)			(0)
 #endif /* CONFIG_BRIDGE_NETFILTER */
 
 #endif /* __KERNEL__ */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 81b2795a4c2..97aee76fb74 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -426,7 +426,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 	int ptr;
 	struct net_device *dev;
 	struct sk_buff *skb2;
-	unsigned int mtu, hlen, left, len, ll_rs;
+	unsigned int mtu, hlen, left, len, ll_rs, pad;
 	int offset;
 	__be16 not_last_frag;
 	struct rtable *rt = (struct rtable*)skb->dst;
@@ -556,14 +556,13 @@ slow_path:
 	left = skb->len - hlen;		/* Space per frame */
 	ptr = raw + hlen;		/* Where to start from */
 
-#ifdef CONFIG_BRIDGE_NETFILTER
 	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
-	 * we need to make room for the encapsulating header */
-	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
-	mtu -= nf_bridge_pad(skb);
-#else
-	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
-#endif
+	 * we need to make room for the encapsulating header
+	 */
+	pad = nf_bridge_pad(skb);
+	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
+	mtu -= pad;
+
 	/*
 	 *	Fragment the datagram.
 	 */
-- 
cgit v1.2.3-70-g09d2


From 1b7f775209bbee6b993587bae69acb9fc12ceb17 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Tue, 29 Aug 2006 17:54:17 -0700
Subject: [NetLabel]: remove unused function prototypes

Removed some older function prototypes for functions that no longer exist.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cipso_ipv4.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index c7175e72580..5aed72ab652 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -200,15 +200,9 @@ static inline int cipso_v4_cache_add(const struct sk_buff *skb,
 
 #ifdef CONFIG_NETLABEL
 void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway);
-int cipso_v4_socket_setopt(struct socket *sock,
-			   unsigned char *opt,
-			   u32 opt_len);
 int cipso_v4_socket_setattr(const struct socket *sock,
 			    const struct cipso_v4_doi *doi_def,
 			    const struct netlbl_lsm_secattr *secattr);
-int cipso_v4_socket_getopt(const struct socket *sock,
-			   unsigned char **opt,
-			   u32 *opt_len);
 int cipso_v4_socket_getattr(const struct socket *sock,
 			    struct netlbl_lsm_secattr *secattr);
 int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
-- 
cgit v1.2.3-70-g09d2


From 7a0e1d602288370801c353221c6a938eab925053 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Tue, 29 Aug 2006 17:56:04 -0700
Subject: [NetLabel]: add some missing #includes to various header files

Add some missing include files to the NetLabel related header files.

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cipso_ipv4.h                    | 2 ++
 include/net/netlabel.h                      | 1 +
 net/netlabel/netlabel_domainhash.h          | 4 ++++
 net/netlabel/netlabel_user.h                | 5 +++--
 security/selinux/include/selinux_netlabel.h | 9 +++++++++
 5 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index 5aed72ab652..59406e0dc5b 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -37,6 +37,8 @@
 #include <linux/types.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
 #include <net/netlabel.h>
 
 /* known doi values */
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 7cae730832c..fc2b72fc7e0 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -31,6 +31,7 @@
 #define _NETLABEL_H
 
 #include <linux/types.h>
+#include <linux/net.h>
 #include <linux/skbuff.h>
 #include <net/netlink.h>
 
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 9217863ce0d..99a2287de24 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -32,6 +32,10 @@
 #ifndef _NETLABEL_DOMAINHASH_H
 #define _NETLABEL_DOMAINHASH_H
 
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+
 /* Domain hash table size */
 /* XXX - currently this number is an uneducated guess */
 #define NETLBL_DOMHSH_BITSIZE       7
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index ccf237b3a12..385a6c7488c 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -31,11 +31,12 @@
 #ifndef _NETLABEL_USER_H
 #define _NETLABEL_USER_H
 
+#include <linux/types.h>
 #include <linux/skbuff.h>
 #include <linux/capability.h>
-#include <linux/genetlink.h>
-#include <net/netlabel.h>
+#include <net/netlink.h>
 #include <net/genetlink.h>
+#include <net/netlabel.h>
 
 /* NetLabel NETLINK helper functions */
 
diff --git a/security/selinux/include/selinux_netlabel.h b/security/selinux/include/selinux_netlabel.h
index d69ec650cdb..ecab4bddaaf 100644
--- a/security/selinux/include/selinux_netlabel.h
+++ b/security/selinux/include/selinux_netlabel.h
@@ -27,6 +27,15 @@
 #ifndef _SELINUX_NETLABEL_H_
 #define _SELINUX_NETLABEL_H_
 
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+#include "avc.h"
+#include "objsec.h"
+
 #ifdef CONFIG_NETLABEL
 void selinux_netlbl_cache_invalidate(void);
 int selinux_netlbl_socket_post_create(struct socket *sock,
-- 
cgit v1.2.3-70-g09d2


From fda9ef5d679b07c9d9097aaf6ef7f069d794a8f9 Mon Sep 17 00:00:00 2001
From: Dmitry Mishin <dim@openvz.org>
Date: Thu, 31 Aug 2006 15:28:39 -0700
Subject: [NET]: Fix sk->sk_filter field access

Function sk_filter() is called from tcp_v{4,6}_rcv() functions with arg
needlock = 0, while socket is not locked at that moment. In order to avoid
this and similar issues in the future, use rcu for sk->sk_filter field read
protection.

Signed-off-by: Dmitry Mishin <dim@openvz.org>
Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: Kirill Korotaev <dev@openvz.org>
---
 include/linux/filter.h | 13 +++++++------
 include/net/sock.h     | 34 +++++++++++++++++-----------------
 net/core/filter.c      |  8 ++++----
 net/core/sock.c        | 22 +++++++++-------------
 net/dccp/ipv6.c        |  2 +-
 net/decnet/dn_nsp_in.c |  2 +-
 net/ipv4/tcp_ipv4.c    |  2 +-
 net/ipv6/tcp_ipv6.c    |  4 ++--
 net/packet/af_packet.c | 43 ++++++++++++++++++-------------------------
 net/sctp/input.c       |  2 +-
 10 files changed, 61 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c6cb8f09508..91b2e3b9251 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -25,10 +25,10 @@
  
 struct sock_filter	/* Filter block */
 {
-        __u16	code;   /* Actual filter code */
-        __u8	jt;	/* Jump true */
-        __u8	jf;	/* Jump false */
-        __u32	k;      /* Generic multiuse field */
+	__u16	code;   /* Actual filter code */
+	__u8	jt;	/* Jump true */
+	__u8	jf;	/* Jump false */
+	__u32	k;      /* Generic multiuse field */
 };
 
 struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
@@ -41,8 +41,9 @@ struct sock_fprog	/* Required for SO_ATTACH_FILTER. */
 struct sk_filter
 {
 	atomic_t		refcnt;
-        unsigned int         	len;	/* Number of filter blocks */
-        struct sock_filter     	insns[0];
+	unsigned int         	len;	/* Number of filter blocks */
+	struct rcu_head		rcu;
+	struct sock_filter     	insns[0];
 };
 
 static inline unsigned int sk_filter_len(struct sk_filter *fp)
diff --git a/include/net/sock.h b/include/net/sock.h
index 337ebec84c7..edd4d73ce7f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -862,30 +862,24 @@ extern void sock_init_data(struct socket *sock, struct sock *sk);
  *
  */
 
-static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
+static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
 	int err;
+	struct sk_filter *filter;
 	
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
 	
-	if (sk->sk_filter) {
-		struct sk_filter *filter;
-		
-		if (needlock)
-			bh_lock_sock(sk);
-		
-		filter = sk->sk_filter;
-		if (filter) {
-			unsigned int pkt_len = sk_run_filter(skb, filter->insns,
-							     filter->len);
-			err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
-		}
-
-		if (needlock)
-			bh_unlock_sock(sk);
+	rcu_read_lock_bh();
+	filter = sk->sk_filter;
+	if (filter) {
+		unsigned int pkt_len = sk_run_filter(skb, filter->insns,
+				filter->len);
+		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 	}
+ 	rcu_read_unlock_bh();
+
 	return err;
 }
 
@@ -897,6 +891,12 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
  *	Remove a filter from a socket and release its resources.
  */
  
+static inline void sk_filter_rcu_free(struct rcu_head *rcu)
+{
+	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+	kfree(fp);
+}
+
 static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp)
 {
 	unsigned int size = sk_filter_len(fp);
@@ -904,7 +904,7 @@ static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp)
 	atomic_sub(size, &sk->sk_omem_alloc);
 
 	if (atomic_dec_and_test(&fp->refcnt))
-		kfree(fp);
+		call_rcu_bh(&fp->rcu, sk_filter_rcu_free);
 }
 
 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index 5b4486a60cf..6732782a5a4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -422,10 +422,10 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	if (!err) {
 		struct sk_filter *old_fp;
 
-		spin_lock_bh(&sk->sk_lock.slock);
-		old_fp = sk->sk_filter;
-		sk->sk_filter = fp;
-		spin_unlock_bh(&sk->sk_lock.slock);
+		rcu_read_lock_bh();
+		old_fp = rcu_dereference(sk->sk_filter);
+		rcu_assign_pointer(sk->sk_filter, fp);
+		rcu_read_unlock_bh();
 		fp = old_fp;
 	}
 
diff --git a/net/core/sock.c b/net/core/sock.c
index cfaf09039b0..b77e155cbe6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -247,11 +247,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto out;
 	}
 
-	/* It would be deadlock, if sock_queue_rcv_skb is used
-	   with socket lock! We assume that users of this
-	   function are lock free.
-	*/
-	err = sk_filter(sk, skb, 1);
+	err = sk_filter(sk, skb);
 	if (err)
 		goto out;
 
@@ -278,7 +274,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc = NET_RX_SUCCESS;
 
-	if (sk_filter(sk, skb, 0))
+	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
 	skb->dev = NULL;
@@ -606,15 +602,15 @@ set_rcvbuf:
 			break;
 
 		case SO_DETACH_FILTER:
-			spin_lock_bh(&sk->sk_lock.slock);
-			filter = sk->sk_filter;
+			rcu_read_lock_bh();
+			filter = rcu_dereference(sk->sk_filter);
                         if (filter) {
-				sk->sk_filter = NULL;
-				spin_unlock_bh(&sk->sk_lock.slock);
+				rcu_assign_pointer(sk->sk_filter, NULL);
 				sk_filter_release(sk, filter);
+				rcu_read_unlock_bh();
 				break;
 			}
-			spin_unlock_bh(&sk->sk_lock.slock);
+			rcu_read_unlock_bh();
 			ret = -ENONET;
 			break;
 
@@ -884,10 +880,10 @@ void sk_free(struct sock *sk)
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
 
-	filter = sk->sk_filter;
+	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
 		sk_filter_release(sk, filter);
-		sk->sk_filter = NULL;
+		rcu_assign_pointer(sk->sk_filter, NULL);
 	}
 
 	sock_disable_timestamp(sk);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index f9c5e12d703..7a47399cf31 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -970,7 +970,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol == htons(ETH_P_IP))
 		return dccp_v4_do_rcv(sk, skb);
 
-	if (sk_filter(sk, skb, 0))
+	if (sk_filter(sk, skb))
 		goto discard;
 
 	/*
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 86f7f3b28e7..72ecc6e62ec 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -586,7 +586,7 @@ static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig
         	goto out;
         }
 
-	err = sk_filter(sk, skb, 0);
+	err = sk_filter(sk, skb);
 	if (err)
 		goto out;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 23b46e36b14..39b17985608 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1104,7 +1104,7 @@ process:
 		goto discard_and_relse;
 	nf_reset(skb);
 
-	if (sk_filter(sk, skb, 0))
+	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
 	skb->dev = NULL;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2b18918f301..2546fc9f0a7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1075,7 +1075,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_do_rcv(sk, skb);
 
-	if (sk_filter(sk, skb, 0))
+	if (sk_filter(sk, skb))
 		goto discard;
 
 	/*
@@ -1232,7 +1232,7 @@ process:
 	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
 
-	if (sk_filter(sk, skb, 0))
+	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
 	skb->dev = NULL;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 300215bdbf4..f4ccb90e673 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -427,21 +427,24 @@ out_unlock:
 }
 #endif
 
-static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
+static inline int run_filter(struct sk_buff *skb, struct sock *sk,
+							unsigned *snaplen)
 {
 	struct sk_filter *filter;
+	int err = 0;
 
-	bh_lock_sock(sk);
-	filter = sk->sk_filter;
-	/*
-	 * Our caller already checked that filter != NULL but we need to
-	 * verify that under bh_lock_sock() to be safe
-	 */
-	if (likely(filter != NULL))
-		res = sk_run_filter(skb, filter->insns, filter->len);
-	bh_unlock_sock(sk);
+	rcu_read_lock_bh();
+	filter = rcu_dereference(sk->sk_filter);
+	if (filter != NULL) {
+		err = sk_run_filter(skb, filter->insns, filter->len);
+		if (!err)
+			err = -EPERM;
+		else if (*snaplen > err)
+			*snaplen = err;
+	}
+	rcu_read_unlock_bh();
 
-	return res;
+	return err;
 }
 
 /*
@@ -491,13 +494,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
 
 	snaplen = skb->len;
 
-	if (sk->sk_filter) {
-		unsigned res = run_filter(skb, sk, snaplen);
-		if (res == 0)
-			goto drop_n_restore;
-		if (snaplen > res)
-			snaplen = res;
-	}
+	if (run_filter(skb, sk, &snaplen) < 0)
+		goto drop_n_restore;
 
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 	    (unsigned)sk->sk_rcvbuf)
@@ -593,13 +591,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 
 	snaplen = skb->len;
 
-	if (sk->sk_filter) {
-		unsigned res = run_filter(skb, sk, snaplen);
-		if (res == 0)
-			goto drop_n_restore;
-		if (snaplen > res)
-			snaplen = res;
-	}
+	if (run_filter(skb, sk, &snaplen) < 0)
+		goto drop_n_restore;
 
 	if (sk->sk_type == SOCK_DGRAM) {
 		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 8a34d95602c..03f65de75d8 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -228,7 +228,7 @@ int sctp_rcv(struct sk_buff *skb)
 		goto discard_release;
 	nf_reset(skb);
 
-	if (sk_filter(sk, skb, 1))
+	if (sk_filter(sk, skb))
                 goto discard_release;
 
 	/* Create an SCTP packet structure. */
-- 
cgit v1.2.3-70-g09d2


From eb878e84575fbce21d2edb079eada78bfa27023d Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Thu, 31 Aug 2006 17:42:59 -0700
Subject: [IPSEC]: output mode to take an xfrm state as input param

Expose IPSEC modes output path to take an xfrm state as input param.
This makes it consistent with the input mode processing (which already
takes the xfrm state as a param).

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h              | 2 +-
 net/ipv4/xfrm4_mode_transport.c | 4 +---
 net/ipv4/xfrm4_mode_tunnel.c    | 3 +--
 net/ipv4/xfrm4_output.c         | 2 +-
 net/ipv6/xfrm6_mode_ro.c        | 3 +--
 net/ipv6/xfrm6_mode_transport.c | 3 +--
 net/ipv6/xfrm6_mode_tunnel.c    | 3 +--
 net/ipv6/xfrm6_output.c         | 2 +-
 8 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 0acabf2a0a8..4d6dc627df9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -285,7 +285,7 @@ extern void xfrm_put_type(struct xfrm_type *type);
 
 struct xfrm_mode {
 	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
-	int (*output)(struct sk_buff *skb);
+	int (*output)(struct xfrm_state *x,struct sk_buff *skb);
 
 	struct module *owner;
 	unsigned int encap;
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index a9e6b3dd19c..92676b7e403 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -21,9 +21,8 @@
  * On exit, skb->h will be set to the start of the payload to be processed
  * by x->type->output and skb->nh will be set to the top IP header.
  */
-static int xfrm4_transport_output(struct sk_buff *skb)
+static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_state *x;
 	struct iphdr *iph;
 	int ihl;
 
@@ -33,7 +32,6 @@ static int xfrm4_transport_output(struct sk_buff *skb)
 	ihl = iph->ihl * 4;
 	skb->h.raw += ihl;
 
-	x = skb->dst->xfrm;
 	skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl);
 	return 0;
 }
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 13cafbe56ce..e23c21d31a5 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -33,10 +33,9 @@ static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
  * On exit, skb->h will be set to the start of the payload to be processed
  * by x->type->output and skb->nh will be set to the top IP header.
  */
-static int xfrm4_tunnel_output(struct sk_buff *skb)
+static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
 	struct iphdr *iph, *top_iph;
 	int flags;
 
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 5fd115f0c54..04403fb01a5 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -66,7 +66,7 @@ static int xfrm4_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		err = x->mode->output(skb);
+		err = x->mode->output(x, skb);
 		if (err)
 			goto error;
 
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index c11c335312f..6031c16d46c 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -43,9 +43,8 @@
  * its absence, that of the top IP header.  The value of skb->data will always
  * point to the top IP header.
  */
-static int xfrm6_ro_output(struct sk_buff *skb)
+static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_state *x = skb->dst->xfrm;
 	struct ipv6hdr *iph;
 	u8 *prevhdr;
 	int hdr_len;
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index a5dce216024..3a4b39b12ba 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -25,9 +25,8 @@
  * its absence, that of the top IP header.  The value of skb->data will always
  * point to the top IP header.
  */
-static int xfrm6_transport_output(struct sk_buff *skb)
+static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	struct xfrm_state *x = skb->dst->xfrm;
 	struct ipv6hdr *iph;
 	u8 *prevhdr;
 	int hdr_len;
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 8af79be2edc..5e7d8a7d641 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -37,10 +37,9 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
  * its absence, that of the top IP header.  The value of skb->data will always
  * point to the top IP header.
  */
-static int xfrm6_tunnel_output(struct sk_buff *skb)
+static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb->dst;
-	struct xfrm_state *x = dst->xfrm;
 	struct ipv6hdr *iph, *top_iph;
 	int dsfield;
 
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index db58104e710..c260ea104c5 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -65,7 +65,7 @@ static int xfrm6_output_one(struct sk_buff *skb)
 		if (err)
 			goto error;
 
-		err = x->mode->output(skb);
+		err = x->mode->output(x, skb);
 		if (err)
 			goto error;
 
-- 
cgit v1.2.3-70-g09d2


From eb328111efde7bca782f340fe805756039ec6a0c Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Mon, 18 Sep 2006 00:01:59 -0700
Subject: [GENL]: Provide more information to userspace about registered genl
 families

Additionaly exports the following information when providing
the list of registered generic netlink families:
  - protocol version
  - header size
  - maximum number of attributes
  - list of available operations including
      - id
      - flags
      - avaiability of policy and doit/dumpit function

libnl HEAD provides a utility to read this new information:

	0x0010 nlctrl version 1
	    hdrsize 0 maxattr 6
	      op GETFAMILY (0x03) [POLICY,DOIT,DUMPIT]
	0x0011 NLBL_MGMT version 1
	    hdrsize 0 maxattr 0
	      op unknown (0x02) [DOIT]
	      op unknown (0x03) [DOIT]
	      ....

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genetlink.h | 18 ++++++++++++++++++
 include/net/genetlink.h   |  2 --
 net/netlink/genetlink.c   | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index 84f12a41dc0..9049dc65ae5 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -16,6 +16,8 @@ struct genlmsghdr {
 
 #define GENL_HDRLEN	NLMSG_ALIGN(sizeof(struct genlmsghdr))
 
+#define GENL_ADMIN_PERM		0x01
+
 /*
  * List of reserved static generic netlink identifiers:
  */
@@ -43,9 +45,25 @@ enum {
 	CTRL_ATTR_UNSPEC,
 	CTRL_ATTR_FAMILY_ID,
 	CTRL_ATTR_FAMILY_NAME,
+	CTRL_ATTR_VERSION,
+	CTRL_ATTR_HDRSIZE,
+	CTRL_ATTR_MAXATTR,
+	CTRL_ATTR_OPS,
 	__CTRL_ATTR_MAX,
 };
 
 #define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1)
 
+enum {
+	CTRL_ATTR_OP_UNSPEC,
+	CTRL_ATTR_OP_ID,
+	CTRL_ATTR_OP_FLAGS,
+	CTRL_ATTR_OP_POLICY,
+	CTRL_ATTR_OP_DOIT,
+	CTRL_ATTR_OP_DUMPIT,
+	__CTRL_ATTR_OP_MAX,
+};
+
+#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
+
 #endif	/* __LINUX_GENERIC_NETLINK_H */
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 97d6d3aba9d..4a38d85e4e2 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -27,8 +27,6 @@ struct genl_family
 	struct list_head	family_list;	/* private */
 };
 
-#define GENL_ADMIN_PERM		0x01
-
 /**
  * struct genl_info - receiving information
  * @snd_seq: sending sequence number
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 3ac942cdb67..49bc2db7982 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -387,7 +387,10 @@ static void genl_rcv(struct sock *sk, int len)
 static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
 			  u32 flags, struct sk_buff *skb, u8 cmd)
 {
+	struct nlattr *nla_ops;
+	struct genl_ops *ops;
 	void *hdr;
+	int idx = 1;
 
 	hdr = genlmsg_put(skb, pid, seq, GENL_ID_CTRL, 0, flags, cmd,
 			  family->version);
@@ -396,6 +399,37 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq,
 
 	NLA_PUT_STRING(skb, CTRL_ATTR_FAMILY_NAME, family->name);
 	NLA_PUT_U16(skb, CTRL_ATTR_FAMILY_ID, family->id);
+	NLA_PUT_U32(skb, CTRL_ATTR_VERSION, family->version);
+	NLA_PUT_U32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize);
+	NLA_PUT_U32(skb, CTRL_ATTR_MAXATTR, family->maxattr);
+
+	nla_ops = nla_nest_start(skb, CTRL_ATTR_OPS);
+	if (nla_ops == NULL)
+		goto nla_put_failure;
+
+	list_for_each_entry(ops, &family->ops_list, ops_list) {
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, idx++);
+		if (nest == NULL)
+			goto nla_put_failure;
+
+		NLA_PUT_U32(skb, CTRL_ATTR_OP_ID, ops->cmd);
+		NLA_PUT_U32(skb, CTRL_ATTR_OP_FLAGS, ops->flags);
+
+		if (ops->policy)
+			NLA_PUT_FLAG(skb, CTRL_ATTR_OP_POLICY);
+
+		if (ops->doit)
+			NLA_PUT_FLAG(skb, CTRL_ATTR_OP_DOIT);
+
+		if (ops->dumpit)
+			NLA_PUT_FLAG(skb, CTRL_ATTR_OP_DUMPIT);
+
+		nla_nest_end(skb, nest);
+	}
+
+	nla_nest_end(skb, nla_ops);
 
 	return genlmsg_end(skb, hdr);
 
@@ -411,6 +445,9 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 	int chains_to_skip = cb->args[0];
 	int fams_to_skip = cb->args[1];
 
+	if (chains_to_skip != 0)
+		genl_lock();
+
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
 		if (i < chains_to_skip)
 			continue;
@@ -428,6 +465,9 @@ static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb)
 	}
 
 errout:
+	if (chains_to_skip != 0)
+		genl_unlock();
+
 	cb->args[0] = i;
 	cb->args[1] = n;
 
-- 
cgit v1.2.3-70-g09d2


From 161643660129dd7d98f0b12418c0a2710ffa7db6 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 18 Sep 2006 00:40:38 -0700
Subject: [SCTP]: Cleanups

This patch contains the following cleanups:
- make the following needlessly global function static:
  - socket.c: sctp_apply_peer_addr_params()
- add proper prototypes for the several global functions in
  include/net/sctp/sctp.h

Note that this fixes wrong prototypes for the following functions:
- sctp_snmp_proc_exit()
- sctp_eps_proc_exit()
- sctp_assocs_proc_exit()

The latter was spotted by the GNU C compiler and reported
by David Woodhouse.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 13 +++++++++++++
 net/sctp/ipv6.c         |  1 -
 net/sctp/protocol.c     |  7 -------
 net/sctp/socket.c       | 14 +++++++-------
 4 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e274fd47999..ee68a312407 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -128,6 +128,8 @@ extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
 				     int flags);
 extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
 extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
+int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
+                        void *ptr);
 
 /*
  * sctp/socket.c
@@ -177,6 +179,17 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
 void sctp_backlog_migrate(struct sctp_association *assoc,
 			  struct sock *oldsk, struct sock *newsk);
 
+/*
+ * sctp/proc.c
+ */
+int sctp_snmp_proc_init(void);
+void sctp_snmp_proc_exit(void);
+int sctp_eps_proc_init(void);
+void sctp_eps_proc_exit(void);
+int sctp_assocs_proc_init(void);
+void sctp_assocs_proc_exit(void);
+
+
 /*
  *  Section:  Macros, externs, and inlines
  */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 99c0cefc04e..fd87e3ceb56 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -78,7 +78,6 @@
 
 #include <asm/uaccess.h>
 
-extern int sctp_inetaddr_event(struct notifier_block *, unsigned long, void *);
 static struct notifier_block sctp_inet6addr_notifier = {
 	.notifier_call = sctp_inetaddr_event,
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index d9dd4c47bc2..fac7674438a 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -82,13 +82,6 @@ static struct sctp_af *sctp_af_v6_specific;
 kmem_cache_t *sctp_chunk_cachep __read_mostly;
 kmem_cache_t *sctp_bucket_cachep __read_mostly;
 
-extern int sctp_snmp_proc_init(void);
-extern int sctp_snmp_proc_exit(void);
-extern int sctp_eps_proc_init(void);
-extern int sctp_eps_proc_exit(void);
-extern int sctp_assocs_proc_init(void);
-extern int sctp_assocs_proc_exit(void);
-
 /* Return the address of the control sock. */
 struct sock *sctp_get_ctl_sock(void)
 {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7c1dbb1d10d..79c3e072cf2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2081,13 +2081,13 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
  *                     results.
  */
-int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
-				struct sctp_transport   *trans,
-				struct sctp_association *asoc,
-				struct sctp_sock        *sp,
-				int                      hb_change,
-				int                      pmtud_change,
-				int                      sackdelay_change)
+static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
+				       struct sctp_transport   *trans,
+				       struct sctp_association *asoc,
+				       struct sctp_sock        *sp,
+				       int                      hb_change,
+				       int                      pmtud_change,
+				       int                      sackdelay_change)
 {
 	int error;
 
-- 
cgit v1.2.3-70-g09d2


From 1ef9696c909060ccdae3ade245ca88692b49285b Mon Sep 17 00:00:00 2001
From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Tue, 19 Sep 2006 12:52:50 -0700
Subject: [TCP]: Send ACKs each 2nd received segment.

It does not affect either mss-sized connections (obviously) or
connections controlled by Nagle (because there is only one small
segment in flight).

The idea is to record the fact that a small segment arrives on a
connection, where one small segment has already been received and
still not-ACKed. In this case ACK is forced after tcp_recvmsg() drains
receive buffer.

In other words, it is a "soft" each-2nd-segment ACK, which is enough
to preserve ACK clock even when ABC is enabled.

Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 3 ++-
 net/ipv4/tcp.c                     | 7 +++++--
 net/ipv4/tcp_input.c               | 2 ++
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 9bf73fe5094..de4e83b6da4 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -147,7 +147,8 @@ extern struct sock *inet_csk_clone(struct sock *sk,
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
 	ICSK_ACK_TIMER  = 2,
-	ICSK_ACK_PUSHED = 4
+	ICSK_ACK_PUSHED = 4,
+	ICSK_ACK_PUSHED2 = 8
 };
 
 extern void inet_csk_init_xmit_timers(struct sock *sk,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29e3d606db7..66e9a729f6d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -955,8 +955,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
 		     * receive buffer and there was a small segment
 		     * in queue.
 		     */
-		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
-		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+		    (copied > 0 &&
+		     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+		      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+		       !icsk->icsk_ack.pingpong)) &&
+		      !atomic_read(&sk->sk_rmem_alloc)))
 			time_to_ack = 1;
 	}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 511b738f118..b3def0df14f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -156,6 +156,8 @@ static void tcp_measure_rcv_mss(struct sock *sk,
 				return;
 			}
 		}
+		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
 		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From a1e59abf824969554b90facd44a4ab16e265afa4 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 19 Sep 2006 12:57:34 -0700
Subject: [XFRM]: Fix wildcard as tunnel source

Hashing SAs by source address breaks templates with wildcards as tunnel
source since the source address used for hashing/lookup is still 0/0.
Move source address lookup to xfrm_tmpl_resolve_one() so we can use the
real address in the lookup.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/xfrm.h      | 13 +++++++++++++
 net/ipv4/xfrm4_policy.c | 20 ++++++++++++++++++++
 net/ipv4/xfrm4_state.c  | 15 ---------------
 net/ipv6/xfrm6_policy.c | 21 +++++++++++++++++++++
 net/ipv6/xfrm6_state.c  | 16 ----------------
 net/xfrm/xfrm_policy.c  | 21 +++++++++++++++++++++
 6 files changed, 75 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4d6dc627df9..11e0b1d6bd4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -222,6 +222,7 @@ struct xfrm_policy_afinfo {
 	struct dst_ops		*dst_ops;
 	void			(*garbage_collect)(void);
 	int			(*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl);
+	int			(*get_saddr)(xfrm_address_t *saddr, xfrm_address_t *daddr);
 	struct dst_entry	*(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
 	int			(*bundle_create)(struct xfrm_policy *policy, 
 						 struct xfrm_state **xfrm, 
@@ -630,6 +631,18 @@ secpath_reset(struct sk_buff *skb)
 #endif
 }
 
+static inline int
+xfrm_addr_any(xfrm_address_t *addr, unsigned short family)
+{
+	switch (family) {
+	case AF_INET:
+		return addr->a4 == 0;
+	case AF_INET6:
+		return ipv6_addr_any((struct in6_addr *)&addr->a6);
+	}
+	return 0;
+}
+
 static inline int
 __xfrm4_state_addr_cmp(struct xfrm_tmpl *tmpl, struct xfrm_state *x)
 {
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 479598566f1..eabcd27b176 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -21,6 +21,25 @@ static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 	return __ip_route_output_key((struct rtable**)dst, fl);
 }
 
+static int xfrm4_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+	struct rtable *rt;
+	struct flowi fl_tunnel = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = daddr->a4,
+			},
+		},
+	};
+
+	if (!xfrm4_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) {
+		saddr->a4 = rt->rt_src;
+		dst_release(&rt->u.dst);
+		return 0;
+	}
+	return -EHOSTUNREACH;
+}
+
 static struct dst_entry *
 __xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 {
@@ -298,6 +317,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.family = 		AF_INET,
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
+	.get_saddr =		xfrm4_get_saddr,
 	.find_bundle = 		__xfrm4_find_bundle,
 	.bundle_create =	__xfrm4_bundle_create,
 	.decode_session =	_decode_session4,
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 6a2a4ab4277..fe2034494d0 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -42,21 +42,6 @@ __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	x->props.saddr = tmpl->saddr;
 	if (x->props.saddr.a4 == 0)
 		x->props.saddr.a4 = saddr->a4;
-	if (tmpl->mode == XFRM_MODE_TUNNEL && x->props.saddr.a4 == 0) {
-		struct rtable *rt;
-	        struct flowi fl_tunnel = {
-        	        .nl_u = {
-        			.ip4_u = {
-					.daddr = x->id.daddr.a4,
-				}
-			}
-		};
-		if (!xfrm_dst_lookup((struct xfrm_dst **)&rt,
-		                     &fl_tunnel, AF_INET)) {
-			x->props.saddr.a4 = rt->rt_src;
-			dst_release(&rt->u.dst);
-		}
-	}
 	x->props.mode = tmpl->mode;
 	x->props.reqid = tmpl->reqid;
 	x->props.family = AF_INET;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 9391c4c94fe..6a252e2134d 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -34,6 +34,26 @@ static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 	return err;
 }
 
+static int xfrm6_get_saddr(xfrm_address_t *saddr, xfrm_address_t *daddr)
+{
+	struct rt6_info *rt;
+	struct flowi fl_tunnel = {
+		.nl_u = {
+			.ip6_u = {
+				.daddr = *(struct in6_addr *)&daddr->a6,
+			},
+		},
+	};
+
+	if (!xfrm6_dst_lookup((struct xfrm_dst **)&rt, &fl_tunnel)) {
+		ipv6_get_saddr(&rt->u.dst, (struct in6_addr *)&daddr->a6,
+			       (struct in6_addr *)&saddr->a6);
+		dst_release(&rt->u.dst);
+		return 0;
+	}
+	return -EHOSTUNREACH;
+}
+
 static struct dst_entry *
 __xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
 {
@@ -362,6 +382,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.family =		AF_INET6,
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
+	.get_saddr = 		xfrm6_get_saddr,
 	.find_bundle =		__xfrm6_find_bundle,
 	.bundle_create =	__xfrm6_bundle_create,
 	.decode_session =	_decode_session6,
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index d88cd92c864..711bfafb247 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -42,22 +42,6 @@ __xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 	memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
 	if (ipv6_addr_any((struct in6_addr*)&x->props.saddr))
 		memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
-	if (tmpl->mode == XFRM_MODE_TUNNEL && ipv6_addr_any((struct in6_addr*)&x->props.saddr)) {
-		struct rt6_info *rt;
-		struct flowi fl_tunnel = {
-			.nl_u = {
-				.ip6_u = {
-					.daddr = *(struct in6_addr *)daddr,
-				}
-			}
-		};
-		if (!xfrm_dst_lookup((struct xfrm_dst **)&rt,
-		                     &fl_tunnel, AF_INET6)) {
-			ipv6_get_saddr(&rt->u.dst, (struct in6_addr *)daddr,
-			               (struct in6_addr *)&x->props.saddr);
-			dst_release(&rt->u.dst);
-		}
-	}
 	x->props.mode = tmpl->mode;
 	x->props.reqid = tmpl->reqid;
 	x->props.family = AF_INET6;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 537854fe47c..b6e2e79d726 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1107,6 +1107,20 @@ int __xfrm_sk_clone_policy(struct sock *sk)
 	return 0;
 }
 
+static int
+xfrm_get_saddr(xfrm_address_t *local, xfrm_address_t *remote,
+	       unsigned short family)
+{
+	int err;
+	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+	err = afinfo->get_saddr(local, remote);
+	xfrm_policy_put_afinfo(afinfo);
+	return err;
+}
+
 /* Resolve list of templates for the flow, given policy. */
 
 static int
@@ -1118,6 +1132,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
 	int i, error;
 	xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family);
 	xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family);
+	xfrm_address_t tmp;
 
 	for (nx=0, i = 0; i < policy->xfrm_nr; i++) {
 		struct xfrm_state *x;
@@ -1128,6 +1143,12 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, struct flowi *fl,
 		if (tmpl->mode == XFRM_MODE_TUNNEL) {
 			remote = &tmpl->id.daddr;
 			local = &tmpl->saddr;
+			if (xfrm_addr_any(local, family)) {
+				error = xfrm_get_saddr(&tmp, remote, family);
+				if (error)
+					goto fail;
+				local = &tmp;
+			}
 		}
 
 		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
-- 
cgit v1.2.3-70-g09d2


From 1bf38a36b6a0e810dafae048fdbb999e587f0f2f Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 11:57:09 -0700
Subject: [NETFILTER]: remove unused include file

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_logging.h | 33 ---------------------------------
 1 file changed, 33 deletions(-)
 delete mode 100644 include/linux/netfilter_logging.h

(limited to 'include')

diff --git a/include/linux/netfilter_logging.h b/include/linux/netfilter_logging.h
deleted file mode 100644
index 562bb6aad4e..00000000000
--- a/include/linux/netfilter_logging.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Internal logging interface, which relies on the real 
-   LOG target modules */
-#ifndef __LINUX_NETFILTER_LOGGING_H
-#define __LINUX_NETFILTER_LOGGING_H
-
-#ifdef __KERNEL__
-#include <asm/atomic.h>
-
-struct nf_logging_t {
-	void (*nf_log_packet)(struct sk_buff **pskb,
-			      unsigned int hooknum,
-			      const struct net_device *in,
-			      const struct net_device *out,
-			      const char *prefix);
-	void (*nf_log)(char *pfh, size_t len,
-		       const char *prefix);
-};
-
-extern void nf_log_register(int pf, const struct nf_logging_t *logging);
-extern void nf_log_unregister(int pf, const struct nf_logging_t *logging);
-
-extern void nf_log_packet(int pf,
-			  struct sk_buff **pskb,
-			  unsigned int hooknum,
-			  const struct net_device *in,
-			  const struct net_device *out,
-			  const char *fmt, ...);
-extern void nf_log(int pf,
-		   char *pfh, size_t len,
-		   const char *fmt, ...);
-#endif /*__KERNEL__*/
-
-#endif /*__LINUX_NETFILTER_LOGGING_H*/
-- 
cgit v1.2.3-70-g09d2


From df0933dcb027e156cb5253570ad694b81bd52b69 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 11:57:53 -0700
Subject: [NETFILTER]: kill listhelp.h

Kill listhelp.h and use the list.h functions instead.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h           |   4 -
 include/linux/netfilter_ipv4/listhelp.h      | 123 -----------------
 net/bridge/netfilter/ebtables.c              |  76 ++++++-----
 net/ipv4/netfilter/arp_tables.c              |   2 -
 net/ipv4/netfilter/ip_conntrack_core.c       | 189 ++++++++++++---------------
 net/ipv4/netfilter/ip_conntrack_proto_gre.c  |  24 ++--
 net/ipv4/netfilter/ip_conntrack_standalone.c |   1 -
 net/ipv4/netfilter/ip_nat_core.c             |   4 -
 net/ipv4/netfilter/ip_nat_helper.c           |   4 -
 net/ipv4/netfilter/ip_nat_rule.c             |   4 -
 net/ipv4/netfilter/ip_nat_standalone.c       |   4 -
 net/ipv6/netfilter/ip6_tables.c              |   3 -
 net/netfilter/nf_conntrack_core.c            | 185 ++++++++++++--------------
 net/netfilter/nf_conntrack_standalone.c      |   1 -
 net/netfilter/x_tables.c                     |  17 ++-
 15 files changed, 237 insertions(+), 404 deletions(-)
 delete mode 100644 include/linux/netfilter_ipv4/listhelp.h

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 03d1027fb0e..c832295dbf6 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -138,10 +138,6 @@ struct xt_counters_info
 
 #include <linux/netdevice.h>
 
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-#include <linux/netfilter_ipv4/listhelp.h>
-
 #ifdef CONFIG_COMPAT
 #define COMPAT_TO_USER		1
 #define COMPAT_FROM_USER	-1
diff --git a/include/linux/netfilter_ipv4/listhelp.h b/include/linux/netfilter_ipv4/listhelp.h
deleted file mode 100644
index 5d92cf044d9..00000000000
--- a/include/linux/netfilter_ipv4/listhelp.h
+++ /dev/null
@@ -1,123 +0,0 @@
-#ifndef _LISTHELP_H
-#define _LISTHELP_H
-#include <linux/list.h>
-
-/* Header to do more comprehensive job than linux/list.h; assume list
-   is first entry in structure. */
-
-/* Return pointer to first true entry, if any, or NULL.  A macro
-   required to allow inlining of cmpfn. */
-#define LIST_FIND(head, cmpfn, type, args...)		\
-({							\
-	const struct list_head *__i, *__j = NULL;	\
-							\
-	ASSERT_READ_LOCK(head);				\
-	list_for_each(__i, (head))			\
-		if (cmpfn((const type)__i , ## args)) {	\
-			__j = __i;			\
-			break;				\
-		}					\
-	(type)__j;					\
-})
-
-#define LIST_FIND_W(head, cmpfn, type, args...)		\
-({							\
-	const struct list_head *__i, *__j = NULL;	\
-							\
-	ASSERT_WRITE_LOCK(head);			\
-	list_for_each(__i, (head))			\
-		if (cmpfn((type)__i , ## args)) {	\
-			__j = __i;			\
-			break;				\
-		}					\
-	(type)__j;					\
-})
-
-/* Just like LIST_FIND but we search backwards */
-#define LIST_FIND_B(head, cmpfn, type, args...)		\
-({							\
-	const struct list_head *__i, *__j = NULL;	\
-							\
-	ASSERT_READ_LOCK(head);				\
-	list_for_each_prev(__i, (head))			\
-		if (cmpfn((const type)__i , ## args)) {	\
-			__j = __i;			\
-			break;				\
-		}					\
-	(type)__j;					\
-})
-
-static inline int
-__list_cmp_same(const void *p1, const void *p2) { return p1 == p2; }
-
-/* Is this entry in the list? */
-static inline int
-list_inlist(struct list_head *head, const void *entry)
-{
-	return LIST_FIND(head, __list_cmp_same, void *, entry) != NULL;
-}
-
-/* Delete from list. */
-#ifdef CONFIG_NETFILTER_DEBUG
-#define LIST_DELETE(head, oldentry)					\
-do {									\
-	ASSERT_WRITE_LOCK(head);					\
-	if (!list_inlist(head, oldentry))				\
-		printk("LIST_DELETE: %s:%u `%s'(%p) not in %s.\n",	\
-		       __FILE__, __LINE__, #oldentry, oldentry, #head);	\
-        else list_del((struct list_head *)oldentry);			\
-} while(0)
-#else
-#define LIST_DELETE(head, oldentry) list_del((struct list_head *)oldentry)
-#endif
-
-/* Append. */
-static inline void
-list_append(struct list_head *head, void *new)
-{
-	ASSERT_WRITE_LOCK(head);
-	list_add((new), (head)->prev);
-}
-
-/* Prepend. */
-static inline void
-list_prepend(struct list_head *head, void *new)
-{
-	ASSERT_WRITE_LOCK(head);
-	list_add(new, head);
-}
-
-/* Insert according to ordering function; insert before first true. */
-#define LIST_INSERT(head, new, cmpfn)				\
-do {								\
-	struct list_head *__i;					\
-	ASSERT_WRITE_LOCK(head);				\
-	list_for_each(__i, (head))				\
-		if ((new), (typeof (new))__i)			\
-			break;					\
-	list_add((struct list_head *)(new), __i->prev);		\
-} while(0)
-
-/* If the field after the list_head is a nul-terminated string, you
-   can use these functions. */
-static inline int __list_cmp_name(const void *i, const char *name)
-{
-	return strcmp(name, i+sizeof(struct list_head)) == 0;
-}
-
-/* Returns false if same name already in list, otherwise does insert. */
-static inline int
-list_named_insert(struct list_head *head, void *new)
-{
-	if (LIST_FIND(head, __list_cmp_name, void *,
-		      new + sizeof(struct list_head)))
-		return 0;
-	list_prepend(head, new);
-	return 1;
-}
-
-/* Find this named element in the list. */
-#define list_named_find(head, name)			\
-LIST_FIND(head, __list_cmp_name, void *, name)
-
-#endif /*_LISTHELP_H*/
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index d06a5075b5f..3df55b2bd91 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -24,6 +24,7 @@
 #include <linux/vmalloc.h>
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
@@ -31,12 +32,6 @@
 /* needed for logical [in,out]-dev filtering */
 #include "../br_private.h"
 
-/* list_named_find */
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-#include <linux/netfilter_ipv4/listhelp.h>
-#include <linux/mutex.h>
-
 #define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
                                          "report to author: "format, ## args)
 /* #define BUGPRINT(format, args...) */
@@ -278,18 +273,22 @@ static inline void *
 find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
    struct mutex *mutex)
 {
-	void *ret;
+	struct {
+		struct list_head list;
+		char name[EBT_FUNCTION_MAXNAMELEN];
+	} *e;
 
 	*error = mutex_lock_interruptible(mutex);
 	if (*error != 0)
 		return NULL;
 
-	ret = list_named_find(head, name);
-	if (!ret) {
-		*error = -ENOENT;
-		mutex_unlock(mutex);
+	list_for_each_entry(e, head, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
 	}
-	return ret;
+	*error = -ENOENT;
+	mutex_unlock(mutex);
+	return NULL;
 }
 
 #ifndef CONFIG_KMOD
@@ -1043,15 +1042,19 @@ free_newinfo:
 
 int ebt_register_target(struct ebt_target *target)
 {
+	struct ebt_target *t;
 	int ret;
 
 	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
-	if (!list_named_insert(&ebt_targets, target)) {
-		mutex_unlock(&ebt_mutex);
-		return -EEXIST;
+	list_for_each_entry(t, &ebt_targets, list) {
+		if (strcmp(t->name, target->name) == 0) {
+			mutex_unlock(&ebt_mutex);
+			return -EEXIST;
+		}
 	}
+	list_add(&target->list, &ebt_targets);
 	mutex_unlock(&ebt_mutex);
 
 	return 0;
@@ -1060,21 +1063,25 @@ int ebt_register_target(struct ebt_target *target)
 void ebt_unregister_target(struct ebt_target *target)
 {
 	mutex_lock(&ebt_mutex);
-	LIST_DELETE(&ebt_targets, target);
+	list_del(&target->list);
 	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_match(struct ebt_match *match)
 {
+	struct ebt_match *m;
 	int ret;
 
 	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
-	if (!list_named_insert(&ebt_matches, match)) {
-		mutex_unlock(&ebt_mutex);
-		return -EEXIST;
+	list_for_each_entry(m, &ebt_matches, list) {
+		if (strcmp(m->name, match->name) == 0) {
+			mutex_unlock(&ebt_mutex);
+			return -EEXIST;
+		}
 	}
+	list_add(&match->list, &ebt_matches);
 	mutex_unlock(&ebt_mutex);
 
 	return 0;
@@ -1083,21 +1090,25 @@ int ebt_register_match(struct ebt_match *match)
 void ebt_unregister_match(struct ebt_match *match)
 {
 	mutex_lock(&ebt_mutex);
-	LIST_DELETE(&ebt_matches, match);
+	list_del(&match->list);
 	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_watcher(struct ebt_watcher *watcher)
 {
+	struct ebt_watcher *w;
 	int ret;
 
 	ret = mutex_lock_interruptible(&ebt_mutex);
 	if (ret != 0)
 		return ret;
-	if (!list_named_insert(&ebt_watchers, watcher)) {
-		mutex_unlock(&ebt_mutex);
-		return -EEXIST;
+	list_for_each_entry(w, &ebt_watchers, list) {
+		if (strcmp(w->name, watcher->name) == 0) {
+			mutex_unlock(&ebt_mutex);
+			return -EEXIST;
+		}
 	}
+	list_add(&watcher->list, &ebt_watchers);
 	mutex_unlock(&ebt_mutex);
 
 	return 0;
@@ -1106,13 +1117,14 @@ int ebt_register_watcher(struct ebt_watcher *watcher)
 void ebt_unregister_watcher(struct ebt_watcher *watcher)
 {
 	mutex_lock(&ebt_mutex);
-	LIST_DELETE(&ebt_watchers, watcher);
+	list_del(&watcher->list);
 	mutex_unlock(&ebt_mutex);
 }
 
 int ebt_register_table(struct ebt_table *table)
 {
 	struct ebt_table_info *newinfo;
+	struct ebt_table *t;
 	int ret, i, countersize;
 
 	if (!table || !table->table ||!table->table->entries ||
@@ -1158,10 +1170,12 @@ int ebt_register_table(struct ebt_table *table)
 	if (ret != 0)
 		goto free_chainstack;
 
-	if (list_named_find(&ebt_tables, table->name)) {
-		ret = -EEXIST;
-		BUGPRINT("Table name already exists\n");
-		goto free_unlock;
+	list_for_each_entry(t, &ebt_tables, list) {
+		if (strcmp(t->name, table->name) == 0) {
+			ret = -EEXIST;
+			BUGPRINT("Table name already exists\n");
+			goto free_unlock;
+		}
 	}
 
 	/* Hold a reference count if the chains aren't empty */
@@ -1169,7 +1183,7 @@ int ebt_register_table(struct ebt_table *table)
 		ret = -ENOENT;
 		goto free_unlock;
 	}
-	list_prepend(&ebt_tables, table);
+	list_add(&table->list, &ebt_tables);
 	mutex_unlock(&ebt_mutex);
 	return 0;
 free_unlock:
@@ -1195,7 +1209,7 @@ void ebt_unregister_table(struct ebt_table *table)
 		return;
 	}
 	mutex_lock(&ebt_mutex);
-	LIST_DELETE(&ebt_tables, table);
+	list_del(&table->list);
 	mutex_unlock(&ebt_mutex);
 	vfree(table->private->entries);
 	if (table->private->chainstack) {
@@ -1465,7 +1479,7 @@ static int __init ebtables_init(void)
 	int ret;
 
 	mutex_lock(&ebt_mutex);
-	list_named_insert(&ebt_targets, &ebt_standard_target);
+	list_add(&ebt_standard_target.list, &ebt_targets);
 	mutex_unlock(&ebt_mutex);
 	if ((ret = nf_register_sockopt(&ebt_sockopts)) < 0)
 		return ret;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 4f10b06413a..aaeaa9ce0f2 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -56,8 +56,6 @@ do {								\
 #define ARP_NF_ASSERT(x)
 #endif
 
-#include <linux/netfilter_ipv4/listhelp.h>
-
 static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 				      char *hdr_addr, int len)
 {
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 5da25ad5030..2568d480e9a 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -47,7 +47,6 @@
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #define IP_CONNTRACK_VERSION	"2.4"
 
@@ -294,15 +293,10 @@ void ip_ct_remove_expectations(struct ip_conntrack *ct)
 static void
 clean_from_lists(struct ip_conntrack *ct)
 {
-	unsigned int ho, hr;
-	
 	DEBUGP("clean_from_lists(%p)\n", ct);
 	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
-
-	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
 
 	/* Destroy all pending expectations */
 	ip_ct_remove_expectations(ct);
@@ -367,16 +361,6 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	ip_conntrack_put(ct);
 }
 
-static inline int
-conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
-		    const struct ip_conntrack_tuple *tuple,
-		    const struct ip_conntrack *ignored_conntrack)
-{
-	ASSERT_READ_LOCK(&ip_conntrack_lock);
-	return tuplehash_to_ctrack(i) != ignored_conntrack
-		&& ip_ct_tuple_equal(tuple, &i->tuple);
-}
-
 struct ip_conntrack_tuple_hash *
 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
@@ -386,7 +370,8 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 
 	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
-		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+		if (tuplehash_to_ctrack(h) != ignored_conntrack &&
+		    ip_ct_tuple_equal(tuple, &h->tuple)) {
 			CONNTRACK_STAT_INC(found);
 			return h;
 		}
@@ -417,10 +402,10 @@ static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
 					unsigned int repl_hash) 
 {
 	ct->id = ++ip_conntrack_next_id;
-	list_prepend(&ip_conntrack_hash[hash],
-		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
-	list_prepend(&ip_conntrack_hash[repl_hash],
-		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+	list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
+		 &ip_conntrack_hash[hash]);
+	list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
+		 &ip_conntrack_hash[repl_hash]);
 }
 
 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
@@ -440,6 +425,7 @@ int
 __ip_conntrack_confirm(struct sk_buff **pskb)
 {
 	unsigned int hash, repl_hash;
+	struct ip_conntrack_tuple_hash *h;
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
 
@@ -470,43 +456,43 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 	/* See if there's one in the list already, including reverse:
            NAT could have grabbed it without realizing, since we're
            not in the hash.  If there is, we lost race. */
-	if (!LIST_FIND(&ip_conntrack_hash[hash],
-		       conntrack_tuple_cmp,
-		       struct ip_conntrack_tuple_hash *,
-		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
-	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
-			  conntrack_tuple_cmp,
-			  struct ip_conntrack_tuple_hash *,
-			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
-		/* Remove from unconfirmed list */
-		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_for_each_entry(h, &ip_conntrack_hash[hash], list)
+		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				      &h->tuple))
+			goto out;
+	list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
+		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+				      &h->tuple))
+			goto out;
 
-		__ip_conntrack_hash_insert(ct, hash, repl_hash);
-		/* Timer relative to confirmation time, not original
-		   setting time, otherwise we'd get timer wrap in
-		   weird delay cases. */
-		ct->timeout.expires += jiffies;
-		add_timer(&ct->timeout);
-		atomic_inc(&ct->ct_general.use);
-		set_bit(IPS_CONFIRMED_BIT, &ct->status);
-		CONNTRACK_STAT_INC(insert);
-		write_unlock_bh(&ip_conntrack_lock);
-		if (ct->helper)
-			ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+	/* Remove from unconfirmed list */
+	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+	__ip_conntrack_hash_insert(ct, hash, repl_hash);
+	/* Timer relative to confirmation time, not original
+	   setting time, otherwise we'd get timer wrap in
+	   weird delay cases. */
+	ct->timeout.expires += jiffies;
+	add_timer(&ct->timeout);
+	atomic_inc(&ct->ct_general.use);
+	set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	CONNTRACK_STAT_INC(insert);
+	write_unlock_bh(&ip_conntrack_lock);
+	if (ct->helper)
+		ip_conntrack_event_cache(IPCT_HELPER, *pskb);
 #ifdef CONFIG_IP_NF_NAT_NEEDED
-		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
-		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
-			ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+	if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+	    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+		ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
 #endif
-		ip_conntrack_event_cache(master_ct(ct) ?
-					 IPCT_RELATED : IPCT_NEW, *pskb);
+	ip_conntrack_event_cache(master_ct(ct) ?
+				 IPCT_RELATED : IPCT_NEW, *pskb);
 
-		return NF_ACCEPT;
-	}
+	return NF_ACCEPT;
 
+out:
 	CONNTRACK_STAT_INC(insert_failed);
 	write_unlock_bh(&ip_conntrack_lock);
-
 	return NF_DROP;
 }
 
@@ -527,23 +513,21 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
-{
-	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
-}
-
 static int early_drop(struct list_head *chain)
 {
 	/* Traverse backwards: gives us oldest, which is roughly LRU */
 	struct ip_conntrack_tuple_hash *h;
-	struct ip_conntrack *ct = NULL;
+	struct ip_conntrack *ct = NULL, *tmp;
 	int dropped = 0;
 
 	read_lock_bh(&ip_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h) {
-		ct = tuplehash_to_ctrack(h);
-		atomic_inc(&ct->ct_general.use);
+	list_for_each_entry_reverse(h, chain, list) {
+		tmp = tuplehash_to_ctrack(h);
+		if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
+			ct = tmp;
+			atomic_inc(&ct->ct_general.use);
+			break;
+		}
 	}
 	read_unlock_bh(&ip_conntrack_lock);
 
@@ -559,18 +543,16 @@ static int early_drop(struct list_head *chain)
 	return dropped;
 }
 
-static inline int helper_cmp(const struct ip_conntrack_helper *i,
-			     const struct ip_conntrack_tuple *rtuple)
-{
-	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
-}
-
 static struct ip_conntrack_helper *
 __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
 {
-	return LIST_FIND(&helpers, helper_cmp,
-			 struct ip_conntrack_helper *,
-			 tuple);
+	struct ip_conntrack_helper *h;
+
+	list_for_each_entry(h, &helpers, list) {
+		if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
+			return h;
+	}
+	return NULL;
 }
 
 struct ip_conntrack_helper *
@@ -1062,7 +1044,7 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
 	BUG_ON(me->timeout == 0);
 	write_lock_bh(&ip_conntrack_lock);
-	list_prepend(&helpers, me);
+	list_add(&me->list, &helpers);
 	write_unlock_bh(&ip_conntrack_lock);
 
 	return 0;
@@ -1081,24 +1063,24 @@ __ip_conntrack_helper_find_byname(const char *name)
 	return NULL;
 }
 
-static inline int unhelp(struct ip_conntrack_tuple_hash *i,
-			 const struct ip_conntrack_helper *me)
+static inline void unhelp(struct ip_conntrack_tuple_hash *i,
+			  const struct ip_conntrack_helper *me)
 {
 	if (tuplehash_to_ctrack(i)->helper == me) {
  		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
 		tuplehash_to_ctrack(i)->helper = NULL;
 	}
-	return 0;
 }
 
 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 {
 	unsigned int i;
+	struct ip_conntrack_tuple_hash *h;
 	struct ip_conntrack_expect *exp, *tmp;
 
 	/* Need write lock here, to delete helper. */
 	write_lock_bh(&ip_conntrack_lock);
-	LIST_DELETE(&helpers, me);
+	list_del(&me->list);
 
 	/* Get rid of expectations */
 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
@@ -1108,10 +1090,12 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 		}
 	}
 	/* Get rid of expecteds, set helpers to NULL. */
-	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
-	for (i = 0; i < ip_conntrack_htable_size; i++)
-		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
-			    struct ip_conntrack_tuple_hash *, me);
+	list_for_each_entry(h, &unconfirmed, list)
+		unhelp(h, me);
+	for (i = 0; i < ip_conntrack_htable_size; i++) {
+		list_for_each_entry(h, &ip_conntrack_hash[i], list)
+			unhelp(h, me);
+	}
 	write_unlock_bh(&ip_conntrack_lock);
 
 	/* Someone could be still looking at the helper in a bh. */
@@ -1237,46 +1221,43 @@ static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
 	nf_conntrack_get(nskb->nfct);
 }
 
-static inline int
-do_iter(const struct ip_conntrack_tuple_hash *i,
-	int (*iter)(struct ip_conntrack *i, void *data),
-	void *data)
-{
-	return iter(tuplehash_to_ctrack(i), data);
-}
-
 /* Bring out ya dead! */
-static struct ip_conntrack_tuple_hash *
+static struct ip_conntrack *
 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 		void *data, unsigned int *bucket)
 {
-	struct ip_conntrack_tuple_hash *h = NULL;
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct;
 
 	write_lock_bh(&ip_conntrack_lock);
 	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
-		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
-				struct ip_conntrack_tuple_hash *, iter, data);
-		if (h)
-			break;
+		list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
+			ct = tuplehash_to_ctrack(h);
+			if (iter(ct, data))
+				goto found;
+		}
+	}
+	list_for_each_entry(h, &unconfirmed, list) {
+		ct = tuplehash_to_ctrack(h);
+		if (iter(ct, data))
+			goto found;
 	}
-	if (!h)
-		h = LIST_FIND_W(&unconfirmed, do_iter,
-				struct ip_conntrack_tuple_hash *, iter, data);
-	if (h)
-		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
 	write_unlock_bh(&ip_conntrack_lock);
+	return NULL;
 
-	return h;
+found:
+	atomic_inc(&ct->ct_general.use);
+	write_unlock_bh(&ip_conntrack_lock);
+	return ct;
 }
 
 void
 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
 {
-	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct;
 	unsigned int bucket = 0;
 
-	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
-		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+	while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
 			death_by_timeout((unsigned long)ct);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 4ee016c427b..92c6d8b178c 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -37,7 +37,6 @@ static DEFINE_RWLOCK(ip_ct_gre_lock);
 #define ASSERT_READ_LOCK(x)
 #define ASSERT_WRITE_LOCK(x)
 
-#include <linux/netfilter_ipv4/listhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -82,10 +81,12 @@ static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
 	__be16 key = 0;
 
 	read_lock_bh(&ip_ct_gre_lock);
-	km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
-			struct ip_ct_gre_keymap *, t);
-	if (km)
-		key = km->tuple.src.u.gre.key;
+	list_for_each_entry(km, &gre_keymap_list, list) {
+		if (gre_key_cmpfn(km, t)) {
+			key = km->tuple.src.u.gre.key;
+			break;
+		}
+	}
 	read_unlock_bh(&ip_ct_gre_lock);
 	
 	DEBUGP("lookup src key 0x%x up key for ", key);
@@ -99,7 +100,7 @@ int
 ip_ct_gre_keymap_add(struct ip_conntrack *ct,
 		     struct ip_conntrack_tuple *t, int reply)
 {
-	struct ip_ct_gre_keymap **exist_km, *km, *old;
+	struct ip_ct_gre_keymap **exist_km, *km;
 
 	if (!ct->helper || strcmp(ct->helper->name, "pptp")) {
 		DEBUGP("refusing to add GRE keymap to non-pptp session\n");
@@ -113,13 +114,10 @@ ip_ct_gre_keymap_add(struct ip_conntrack *ct,
 
 	if (*exist_km) {
 		/* check whether it's a retransmission */
-		old = LIST_FIND(&gre_keymap_list, gre_key_cmpfn,
-				struct ip_ct_gre_keymap *, t);
-		if (old == *exist_km) {
-			DEBUGP("retransmission\n");
-			return 0;
+		list_for_each_entry(km, &gre_keymap_list, list) {
+			if (gre_key_cmpfn(km, t) && km == *exist_km)
+				return 0;
 		}
-
 		DEBUGP("trying to override keymap_%s for ct %p\n", 
 			reply? "reply":"orig", ct);
 		return -EEXIST;
@@ -136,7 +134,7 @@ ip_ct_gre_keymap_add(struct ip_conntrack *ct,
 	DUMP_TUPLE_GRE(&km->tuple);
 
 	write_lock_bh(&ip_ct_gre_lock);
-	list_append(&gre_keymap_list, km);
+	list_add_tail(&km->list, &gre_keymap_list);
 	write_unlock_bh(&ip_ct_gre_lock);
 
 	return 0;
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 3f5d495b853..02135756562 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -35,7 +35,6 @@
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 4c540d03d48..71f3e09cbc8 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,9 +22,6 @@
 #include <linux/udp.h>
 #include <linux/jhash.h>
 
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -33,7 +30,6 @@
 #include <linux/netfilter_ipv4/ip_nat_core.h>
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 021c3daae3e..7f6a75984f6 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -27,16 +27,12 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
 #include <linux/netfilter_ipv4/ip_nat_core.h>
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index e59f5a8ecb6..7b703839aa5 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,14 +19,10 @@
 #include <net/route.h>
 #include <linux/bitops.h>
 
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_core.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index f3b77835543..9c577db6204 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -30,9 +30,6 @@
 #include <net/checksum.h>
 #include <linux/spinlock.h>
 
-#define ASSERT_READ_LOCK(x)
-#define ASSERT_WRITE_LOCK(x)
-
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
 #include <linux/netfilter_ipv4/ip_nat_protocol.h>
@@ -40,7 +37,6 @@
 #include <linux/netfilter_ipv4/ip_nat_helper.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d1c315364ee..73d477ce216 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -70,9 +70,6 @@ do {								\
 #define IP_NF_ASSERT(x)
 #endif
 
-
-#include <linux/netfilter_ipv4/listhelp.h>
-
 #if 0
 /* All the better to debug you with... */
 #define static
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3b64dbee662..927137b8b3b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -57,7 +57,6 @@
 #include <net/netfilter/nf_conntrack_protocol.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_core.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
@@ -539,15 +538,10 @@ void nf_ct_remove_expectations(struct nf_conn *ct)
 static void
 clean_from_lists(struct nf_conn *ct)
 {
-	unsigned int ho, hr;
-	
 	DEBUGP("clean_from_lists(%p)\n", ct);
 	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
-
-	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-	LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
 
 	/* Destroy all pending expectations */
 	nf_ct_remove_expectations(ct);
@@ -617,16 +611,6 @@ static void death_by_timeout(unsigned long ul_conntrack)
 	nf_ct_put(ct);
 }
 
-static inline int
-conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
-		    const struct nf_conntrack_tuple *tuple,
-		    const struct nf_conn *ignored_conntrack)
-{
-	ASSERT_READ_LOCK(&nf_conntrack_lock);
-	return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
-		&& nf_ct_tuple_equal(tuple, &i->tuple);
-}
-
 struct nf_conntrack_tuple_hash *
 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
 		    const struct nf_conn *ignored_conntrack)
@@ -636,7 +620,8 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
 
 	ASSERT_READ_LOCK(&nf_conntrack_lock);
 	list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
-		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
+		    nf_ct_tuple_equal(tuple, &h->tuple)) {
 			NF_CT_STAT_INC(found);
 			return h;
 		}
@@ -667,10 +652,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 				       unsigned int repl_hash) 
 {
 	ct->id = ++nf_conntrack_next_id;
-	list_prepend(&nf_conntrack_hash[hash],
-		     &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
-	list_prepend(&nf_conntrack_hash[repl_hash],
-		     &ct->tuplehash[IP_CT_DIR_REPLY].list);
+	list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
+		 &nf_conntrack_hash[hash]);
+	list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
+		 &nf_conntrack_hash[repl_hash]);
 }
 
 void nf_conntrack_hash_insert(struct nf_conn *ct)
@@ -690,7 +675,9 @@ int
 __nf_conntrack_confirm(struct sk_buff **pskb)
 {
 	unsigned int hash, repl_hash;
+	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
+	struct nf_conn_help *help;
 	enum ip_conntrack_info ctinfo;
 
 	ct = nf_ct_get(*pskb, &ctinfo);
@@ -720,41 +707,41 @@ __nf_conntrack_confirm(struct sk_buff **pskb)
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
 	   not in the hash.  If there is, we lost race. */
-	if (!LIST_FIND(&nf_conntrack_hash[hash],
-		       conntrack_tuple_cmp,
-		       struct nf_conntrack_tuple_hash *,
-		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
-	    && !LIST_FIND(&nf_conntrack_hash[repl_hash],
-			  conntrack_tuple_cmp,
-			  struct nf_conntrack_tuple_hash *,
-			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
-		struct nf_conn_help *help;
-		/* Remove from unconfirmed list */
-		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_for_each_entry(h, &nf_conntrack_hash[hash], list)
+		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				      &h->tuple))
+			goto out;
+	list_for_each_entry(h, &nf_conntrack_hash[repl_hash], list)
+		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+				      &h->tuple))
+			goto out;
 
-		__nf_conntrack_hash_insert(ct, hash, repl_hash);
-		/* Timer relative to confirmation time, not original
-		   setting time, otherwise we'd get timer wrap in
-		   weird delay cases. */
-		ct->timeout.expires += jiffies;
-		add_timer(&ct->timeout);
-		atomic_inc(&ct->ct_general.use);
-		set_bit(IPS_CONFIRMED_BIT, &ct->status);
-		NF_CT_STAT_INC(insert);
-		write_unlock_bh(&nf_conntrack_lock);
-		help = nfct_help(ct);
-		if (help && help->helper)
-			nf_conntrack_event_cache(IPCT_HELPER, *pskb);
+	/* Remove from unconfirmed list */
+	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+	__nf_conntrack_hash_insert(ct, hash, repl_hash);
+	/* Timer relative to confirmation time, not original
+	   setting time, otherwise we'd get timer wrap in
+	   weird delay cases. */
+	ct->timeout.expires += jiffies;
+	add_timer(&ct->timeout);
+	atomic_inc(&ct->ct_general.use);
+	set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	NF_CT_STAT_INC(insert);
+	write_unlock_bh(&nf_conntrack_lock);
+	help = nfct_help(ct);
+	if (help && help->helper)
+		nf_conntrack_event_cache(IPCT_HELPER, *pskb);
 #ifdef CONFIG_NF_NAT_NEEDED
-		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
-		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
-			nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
+	if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+	    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+		nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
 #endif
-		nf_conntrack_event_cache(master_ct(ct) ?
-					 IPCT_RELATED : IPCT_NEW, *pskb);
-		return NF_ACCEPT;
-	}
+	nf_conntrack_event_cache(master_ct(ct) ?
+				 IPCT_RELATED : IPCT_NEW, *pskb);
+	return NF_ACCEPT;
 
+out:
 	NF_CT_STAT_INC(insert_failed);
 	write_unlock_bh(&nf_conntrack_lock);
 	return NF_DROP;
@@ -777,24 +764,21 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
-{
-	return !(test_bit(IPS_ASSURED_BIT,
-			  &nf_ct_tuplehash_to_ctrack(i)->status));
-}
-
 static int early_drop(struct list_head *chain)
 {
 	/* Traverse backwards: gives us oldest, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
-	struct nf_conn *ct = NULL;
+	struct nf_conn *ct = NULL, *tmp;
 	int dropped = 0;
 
 	read_lock_bh(&nf_conntrack_lock);
-	h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
-	if (h) {
-		ct = nf_ct_tuplehash_to_ctrack(h);
-		atomic_inc(&ct->ct_general.use);
+	list_for_each_entry_reverse(h, chain, list) {
+		tmp = nf_ct_tuplehash_to_ctrack(h);
+		if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
+			ct = tmp;
+			atomic_inc(&ct->ct_general.use);
+			break;
+		}
 	}
 	read_unlock_bh(&nf_conntrack_lock);
 
@@ -810,18 +794,16 @@ static int early_drop(struct list_head *chain)
 	return dropped;
 }
 
-static inline int helper_cmp(const struct nf_conntrack_helper *i,
-			     const struct nf_conntrack_tuple *rtuple)
-{
-	return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
-}
-
 static struct nf_conntrack_helper *
 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
 {
-	return LIST_FIND(&helpers, helper_cmp,
-			 struct nf_conntrack_helper *,
-			 tuple);
+	struct nf_conntrack_helper *h;
+
+	list_for_each_entry(h, &helpers, list) {
+		if (nf_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
+			return h;
+	}
+	return NULL;
 }
 
 struct nf_conntrack_helper *
@@ -1323,7 +1305,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 		return ret;
 	}
 	write_lock_bh(&nf_conntrack_lock);
-	list_prepend(&helpers, me);
+	list_add(&me->list, &helpers);
 	write_unlock_bh(&nf_conntrack_lock);
 
 	return 0;
@@ -1342,8 +1324,8 @@ __nf_conntrack_helper_find_byname(const char *name)
 	return NULL;
 }
 
-static inline int unhelp(struct nf_conntrack_tuple_hash *i,
-			 const struct nf_conntrack_helper *me)
+static inline void unhelp(struct nf_conntrack_tuple_hash *i,
+			  const struct nf_conntrack_helper *me)
 {
 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
 	struct nf_conn_help *help = nfct_help(ct);
@@ -1352,17 +1334,17 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
 		nf_conntrack_event(IPCT_HELPER, ct);
 		help->helper = NULL;
 	}
-	return 0;
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 {
 	unsigned int i;
+	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_expect *exp, *tmp;
 
 	/* Need write lock here, to delete helper. */
 	write_lock_bh(&nf_conntrack_lock);
-	LIST_DELETE(&helpers, me);
+	list_del(&me->list);
 
 	/* Get rid of expectations */
 	list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
@@ -1374,10 +1356,12 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
 	}
 
 	/* Get rid of expecteds, set helpers to NULL. */
-	LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
-	for (i = 0; i < nf_conntrack_htable_size; i++)
-		LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
-			    struct nf_conntrack_tuple_hash *, me);
+	list_for_each_entry(h, &unconfirmed, list)
+		unhelp(h, me);
+	for (i = 0; i < nf_conntrack_htable_size; i++) {
+		list_for_each_entry(h, &nf_conntrack_hash[i], list)
+			unhelp(h, me);
+	}
 	write_unlock_bh(&nf_conntrack_lock);
 
 	/* Someone could be still looking at the helper in a bh. */
@@ -1510,37 +1494,40 @@ do_iter(const struct nf_conntrack_tuple_hash *i,
 }
 
 /* Bring out ya dead! */
-static struct nf_conntrack_tuple_hash *
+static struct nf_conn *
 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
 		void *data, unsigned int *bucket)
 {
-	struct nf_conntrack_tuple_hash *h = NULL;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
 
 	write_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
-		h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
-				struct nf_conntrack_tuple_hash *, iter, data);
-		if (h)
-			break;
+		list_for_each_entry(h, &nf_conntrack_hash[*bucket], list) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (iter(ct, data))
+				goto found;
+		}
  	}
-	if (!h)
-		h = LIST_FIND_W(&unconfirmed, do_iter,
-				struct nf_conntrack_tuple_hash *, iter, data);
-	if (h)
-		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
+	list_for_each_entry(h, &unconfirmed, list) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (iter(ct, data))
+			goto found;
+	}
+	return NULL;
+found:
+	atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
 	write_unlock_bh(&nf_conntrack_lock);
-
-	return h;
+	return ct;
 }
 
 void
 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
 {
-	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
 	unsigned int bucket = 0;
 
-	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
-		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+	while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
 			death_by_timeout((unsigned long)ct);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9a1de0ca475..5954f677381 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -37,7 +37,6 @@
 #include <net/netfilter/nf_conntrack_protocol.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8037ba63d58..be7baf4f684 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -81,7 +81,7 @@ xt_unregister_target(struct xt_target *target)
 	int af = target->family;
 
 	mutex_lock(&xt[af].mutex);
-	LIST_DELETE(&xt[af].target, target);
+	list_del(&target->list);
 	mutex_unlock(&xt[af].mutex);
 }
 EXPORT_SYMBOL(xt_unregister_target);
@@ -138,7 +138,7 @@ xt_unregister_match(struct xt_match *match)
 	int af =  match->family;
 
 	mutex_lock(&xt[af].mutex);
-	LIST_DELETE(&xt[af].match, match);
+	list_del(&match->list);
 	mutex_unlock(&xt[af].mutex);
 }
 EXPORT_SYMBOL(xt_unregister_match);
@@ -575,15 +575,18 @@ int xt_register_table(struct xt_table *table,
 {
 	int ret;
 	struct xt_table_info *private;
+	struct xt_table *t;
 
 	ret = mutex_lock_interruptible(&xt[table->af].mutex);
 	if (ret != 0)
 		return ret;
 
 	/* Don't autoload: we'd eat our tail... */
-	if (list_named_find(&xt[table->af].tables, table->name)) {
-		ret = -EEXIST;
-		goto unlock;
+	list_for_each_entry(t, &xt[table->af].tables, list) {
+		if (strcmp(t->name, table->name) == 0) {
+			ret = -EEXIST;
+			goto unlock;
+		}
 	}
 
 	/* Simplifies replace_table code. */
@@ -598,7 +601,7 @@ int xt_register_table(struct xt_table *table,
 	/* save number of initial entries */
 	private->initial_entries = private->number;
 
-	list_prepend(&xt[table->af].tables, table);
+	list_add(&table->list, &xt[table->af].tables);
 
 	ret = 0;
  unlock:
@@ -613,7 +616,7 @@ void *xt_unregister_table(struct xt_table *table)
 
 	mutex_lock(&xt[table->af].mutex);
 	private = table->private;
-	LIST_DELETE(&xt[table->af].tables, table);
+	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 
 	return private;
-- 
cgit v1.2.3-70-g09d2


From 9123de2c043996050bacf77031cad845f5976f5d Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 11:59:42 -0700
Subject: [NETFILTER]: ip6table_mangle: reroute when nfmark changes in
 NF_IP6_LOCAL_OUT

Now that IPv6 supports policy routing we need to reroute in NF_IP6_LOCAL_OUT
when the mark value changes.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h       | 1 +
 include/net/ip6_route.h              | 2 --
 net/ipv6/netfilter/ip6table_mangle.c | 8 ++------
 3 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 52a7b9e7642..d97e268cdfe 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -73,6 +73,7 @@ enum nf_ip6_hook_priorities {
 };
 
 #ifdef CONFIG_NETFILTER
+extern int ip6_route_me_harder(struct sk_buff *skb);
 extern unsigned int nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 				    unsigned int dataoff, u_int8_t protocol);
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 29790957004..6ca6b71dfe0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -57,8 +57,6 @@ extern void			ip6_route_input(struct sk_buff *skb);
 extern struct dst_entry *	ip6_route_output(struct sock *sk,
 						 struct flowi *fl);
 
-extern int			ip6_route_me_harder(struct sk_buff *skb);
-
 extern void			ip6_route_init(void);
 extern void			ip6_route_cleanup(void);
 
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 32db04fd831..386ea260e76 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -180,12 +180,8 @@ ip6t_local_hook(unsigned int hook,
 		&& (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr))
 		    || memcmp(&(*pskb)->nh.ipv6h->daddr, &daddr, sizeof(daddr))
 		    || (*pskb)->nfmark != nfmark
-		    || (*pskb)->nh.ipv6h->hop_limit != hop_limit)) {
-
-		/* something which could affect routing has changed */
-
-		DEBUGP("ip6table_mangle: we'd need to re-route a packet\n");
-	}
+		    || (*pskb)->nh.ipv6h->hop_limit != hop_limit))
+		return ip6_route_me_harder(*pskb) == 0 ? ret : NF_DROP;
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From c1fe3ca5106d9568791433fa6c7f27e71ac69e1b Mon Sep 17 00:00:00 2001
From: George Hansper <georgeh@anstat.com.au>
Date: Wed, 20 Sep 2006 12:03:23 -0700
Subject: [NETFILTER]: TCP conntrack: improve dead connection detection

Don't count window updates as retransmissions.

Signed-off-by: George Hansper <georgeh@anstat.com.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nf_conntrack_tcp.h  | 1 +
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 4 +++-
 net/netfilter/nf_conntrack_proto_tcp.c      | 4 +++-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_tcp.h b/include/linux/netfilter/nf_conntrack_tcp.h
index b2feeffde38..6b01ba29772 100644
--- a/include/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/linux/netfilter/nf_conntrack_tcp.h
@@ -49,6 +49,7 @@ struct ip_ct_tcp
 	u_int32_t	last_seq;	/* Last sequence number seen in dir */
 	u_int32_t	last_ack;	/* Last sequence number seen in opposite dir */
 	u_int32_t	last_end;	/* Last seq + len */
+	u_int16_t	last_win;	/* Last window advertisement seen in dir */
 };
 
 #endif /* __KERNEL__ */
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 75a7237eb8c..03ae9a04cb3 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -731,13 +731,15 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 			if (state->last_dir == dir
 			    && state->last_seq == seq
 			    && state->last_ack == ack
-			    && state->last_end == end)
+			    && state->last_end == end
+			    && state->last_win == win)
 				state->retrans++;
 			else {
 				state->last_dir = dir;
 				state->last_seq = seq;
 				state->last_ack = ack;
 				state->last_end = end;
+				state->last_win = win;
 				state->retrans = 0;
 			}
 		}
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 9fc0ee61f92..238bbb5b72e 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -688,13 +688,15 @@ static int tcp_in_window(struct ip_ct_tcp *state,
 			if (state->last_dir == dir
 			    && state->last_seq == seq
 			    && state->last_ack == ack
-			    && state->last_end == end)
+			    && state->last_end == end
+			    && state->last_win == win)
 				state->retrans++;
 			else {
 				state->last_dir = dir;
 				state->last_seq = seq;
 				state->last_ack = ack;
 				state->last_end = end;
+				state->last_win = win;
 				state->retrans = 0;
 			}
 		}
-- 
cgit v1.2.3-70-g09d2


From 9fa492cdc160cd27ce1046cb36f47d3b2b1efa21 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:05:37 -0700
Subject: [NETFILTER]: x_tables: simplify compat API

Split the xt_compat_match/xt_compat_target into smaller type-safe functions
performing just one operation. Handle all alignment and size-related
conversions centrally in these function instead of requiring each module to
implement a full-blown conversion function. Replace ->compat callback by
->compat_from_user and ->compat_to_user callbacks, responsible for
converting just a single private structure.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h |  29 +++---
 net/ipv4/netfilter/ip_tables.c     | 115 +++++-----------------
 net/netfilter/x_tables.c           | 192 +++++++++++++++++++++----------------
 3 files changed, 151 insertions(+), 185 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index c832295dbf6..739a98eebe2 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -138,12 +138,6 @@ struct xt_counters_info
 
 #include <linux/netdevice.h>
 
-#ifdef CONFIG_COMPAT
-#define COMPAT_TO_USER		1
-#define COMPAT_FROM_USER	-1
-#define COMPAT_CALC_SIZE	0
-#endif
-
 struct xt_match
 {
 	struct list_head list;
@@ -176,7 +170,8 @@ struct xt_match
 	void (*destroy)(const struct xt_match *match, void *matchinfo);
 
 	/* Called when userspace align differs from kernel space one */
-	int (*compat)(void *match, void **dstptr, int *size, int convert);
+	void (*compat_from_user)(void *dst, void *src);
+	int (*compat_to_user)(void __user *dst, void *src);
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
@@ -186,6 +181,7 @@ struct xt_match
 
 	char *table;
 	unsigned int matchsize;
+	unsigned int compatsize;
 	unsigned int hooks;
 	unsigned short proto;
 
@@ -224,13 +220,15 @@ struct xt_target
 	void (*destroy)(const struct xt_target *target, void *targinfo);
 
 	/* Called when userspace align differs from kernel space one */
-	int (*compat)(void *target, void **dstptr, int *size, int convert);
+	void (*compat_from_user)(void *dst, void *src);
+	int (*compat_to_user)(void __user *dst, void *src);
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
 
 	char *table;
 	unsigned int targetsize;
+	unsigned int compatsize;
 	unsigned int hooks;
 	unsigned short proto;
 
@@ -387,9 +385,18 @@ struct compat_xt_counters_info
 
 extern void xt_compat_lock(int af);
 extern void xt_compat_unlock(int af);
-extern int xt_compat_match(void *match, void **dstptr, int *size, int convert);
-extern int xt_compat_target(void *target, void **dstptr, int *size,
-		int convert);
+
+extern int xt_compat_match_offset(struct xt_match *match);
+extern void xt_compat_match_from_user(struct xt_entry_match *m,
+				      void **dstptr, int *size);
+extern int xt_compat_match_to_user(struct xt_entry_match *m,
+				   void * __user *dstptr, int *size);
+
+extern int xt_compat_target_offset(struct xt_target *target);
+extern void xt_compat_target_from_user(struct xt_entry_target *t,
+				       void **dstptr, int *size);
+extern int xt_compat_target_to_user(struct xt_entry_target *t,
+				    void * __user *dstptr, int *size);
 
 #endif /* CONFIG_COMPAT */
 #endif /* __KERNEL__ */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 673581db986..800067d69a9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -942,73 +942,28 @@ static short compat_calc_jump(u_int16_t offset)
 	return delta;
 }
 
-struct compat_ipt_standard_target
+static void compat_standard_from_user(void *dst, void *src)
 {
-	struct compat_xt_entry_target target;
-	compat_int_t verdict;
-};
-
-struct compat_ipt_standard
-{
-	struct compat_ipt_entry entry;
-	struct compat_ipt_standard_target target;
-};
+	int v = *(compat_int_t *)src;
 
-#define IPT_ST_LEN		XT_ALIGN(sizeof(struct ipt_standard_target))
-#define IPT_ST_COMPAT_LEN	COMPAT_XT_ALIGN(sizeof(struct compat_ipt_standard_target))
-#define IPT_ST_OFFSET		(IPT_ST_LEN - IPT_ST_COMPAT_LEN)
+	if (v > 0)
+		v += compat_calc_jump(v);
+	memcpy(dst, &v, sizeof(v));
+}
 
-static int compat_ipt_standard_fn(void *target,
-		void **dstptr, int *size, int convert)
+static int compat_standard_to_user(void __user *dst, void *src)
 {
-	struct compat_ipt_standard_target compat_st, *pcompat_st;
-	struct ipt_standard_target st, *pst;
-	int ret;
+	compat_int_t cv = *(int *)src;
 
-	ret = 0;
-	switch (convert) {
-		case COMPAT_TO_USER:
-			pst = target;
-			memcpy(&compat_st.target, &pst->target,
-				sizeof(compat_st.target));
-			compat_st.verdict = pst->verdict;
-			if (compat_st.verdict > 0)
-				compat_st.verdict -=
-					compat_calc_jump(compat_st.verdict);
-			compat_st.target.u.user.target_size = IPT_ST_COMPAT_LEN;
-			if (copy_to_user(*dstptr, &compat_st, IPT_ST_COMPAT_LEN))
-				ret = -EFAULT;
-			*size -= IPT_ST_OFFSET;
-			*dstptr += IPT_ST_COMPAT_LEN;
-			break;
-		case COMPAT_FROM_USER:
-			pcompat_st = target;
-			memcpy(&st.target, &pcompat_st->target, IPT_ST_COMPAT_LEN);
-			st.verdict = pcompat_st->verdict;
-			if (st.verdict > 0)
-				st.verdict += compat_calc_jump(st.verdict);
-			st.target.u.user.target_size = IPT_ST_LEN;
-			memcpy(*dstptr, &st, IPT_ST_LEN);
-			*size += IPT_ST_OFFSET;
-			*dstptr += IPT_ST_LEN;
-			break;
-		case COMPAT_CALC_SIZE:
-			*size += IPT_ST_OFFSET;
-			break;
-		default:
-			ret = -ENOPROTOOPT;
-			break;
-	}
-	return ret;
+	if (cv > 0)
+		cv -= compat_calc_jump(cv);
+	return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
 }
 
 static inline int
 compat_calc_match(struct ipt_entry_match *m, int * size)
 {
-	if (m->u.kernel.match->compat)
-		m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE);
-	else
-		xt_compat_match(m, NULL, size, COMPAT_CALC_SIZE);
+	*size += xt_compat_match_offset(m->u.kernel.match);
 	return 0;
 }
 
@@ -1023,10 +978,7 @@ static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info,
 	entry_offset = (void *)e - base;
 	IPT_MATCH_ITERATE(e, compat_calc_match, &off);
 	t = ipt_get_target(e);
-	if (t->u.kernel.target->compat)
-		t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE);
-	else
-		xt_compat_target(t, NULL, &off, COMPAT_CALC_SIZE);
+	off += xt_compat_target_offset(t->u.kernel.target);
 	newinfo->size -= off;
 	ret = compat_add_offset(entry_offset, off);
 	if (ret)
@@ -1412,17 +1364,13 @@ struct compat_ipt_replace {
 };
 
 static inline int compat_copy_match_to_user(struct ipt_entry_match *m,
-		void __user **dstptr, compat_uint_t *size)
+		void * __user *dstptr, compat_uint_t *size)
 {
-	if (m->u.kernel.match->compat)
-		return m->u.kernel.match->compat(m, dstptr, size,
-				COMPAT_TO_USER);
-	else
-		return xt_compat_match(m, dstptr, size, COMPAT_TO_USER);
+	return xt_compat_match_to_user(m, dstptr, size);
 }
 
 static int compat_copy_entry_to_user(struct ipt_entry *e,
-		void __user **dstptr, compat_uint_t *size)
+		void * __user *dstptr, compat_uint_t *size)
 {
 	struct ipt_entry_target __user *t;
 	struct compat_ipt_entry __user *ce;
@@ -1442,11 +1390,7 @@ static int compat_copy_entry_to_user(struct ipt_entry *e,
 	if (ret)
 		goto out;
 	t = ipt_get_target(e);
-	if (t->u.kernel.target->compat)
-		ret = t->u.kernel.target->compat(t, dstptr, size,
-				COMPAT_TO_USER);
-	else
-		ret = xt_compat_target(t, dstptr, size, COMPAT_TO_USER);
+	ret = xt_compat_target_to_user(t, dstptr, size);
 	if (ret)
 		goto out;
 	ret = -EFAULT;
@@ -1478,11 +1422,7 @@ compat_check_calc_match(struct ipt_entry_match *m,
 		return match ? PTR_ERR(match) : -ENOENT;
 	}
 	m->u.kernel.match = match;
-
-	if (m->u.kernel.match->compat)
-		m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE);
-	else
-		xt_compat_match(m, NULL, size, COMPAT_CALC_SIZE);
+	*size += xt_compat_match_offset(match);
 
 	(*i)++;
 	return 0;
@@ -1543,10 +1483,7 @@ check_compat_entry_size_and_hooks(struct ipt_entry *e,
 	}
 	t->u.kernel.target = target;
 
-	if (t->u.kernel.target->compat)
-		t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE);
-	else
-		xt_compat_target(t, NULL, &off, COMPAT_CALC_SIZE);
+	off += xt_compat_target_offset(target);
 	*size += off;
 	ret = compat_add_offset(entry_offset, off);
 	if (ret)
@@ -1584,10 +1521,7 @@ static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
 
 	dm = (struct ipt_entry_match *)*dstptr;
 	match = m->u.kernel.match;
-	if (match->compat)
-		match->compat(m, dstptr, size, COMPAT_FROM_USER);
-	else
-		xt_compat_match(m, dstptr, size, COMPAT_FROM_USER);
+	xt_compat_match_from_user(m, dstptr, size);
 
 	ret = xt_check_match(match, AF_INET, dm->u.match_size - sizeof(*dm),
 			     name, hookmask, ip->proto,
@@ -1635,10 +1569,7 @@ static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
 	de->target_offset = e->target_offset - (origsize - *size);
 	t = ipt_get_target(e);
 	target = t->u.kernel.target;
-	if (target->compat)
-		target->compat(t, dstptr, size, COMPAT_FROM_USER);
-	else
-		xt_compat_target(t, dstptr, size, COMPAT_FROM_USER);
+	xt_compat_target_from_user(t, dstptr, size);
 
 	de->next_offset = e->next_offset - (origsize - *size);
 	for (h = 0; h < NF_IP_NUMHOOKS; h++) {
@@ -2205,7 +2136,9 @@ static struct ipt_target ipt_standard_target = {
 	.targetsize	= sizeof(int),
 	.family		= AF_INET,
 #ifdef CONFIG_COMPAT
-	.compat		= &compat_ipt_standard_fn,
+	.compatsize	= sizeof(compat_int_t),
+	.compat_from_user = compat_standard_from_user,
+	.compat_to_user	= compat_standard_to_user,
 #endif
 };
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index be7baf4f684..58522fc65d3 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -333,52 +333,65 @@ int xt_check_match(const struct xt_match *match, unsigned short family,
 EXPORT_SYMBOL_GPL(xt_check_match);
 
 #ifdef CONFIG_COMPAT
-int xt_compat_match(void *match, void **dstptr, int *size, int convert)
+int xt_compat_match_offset(struct xt_match *match)
 {
-	struct xt_match *m;
-	struct compat_xt_entry_match *pcompat_m;
-	struct xt_entry_match *pm;
-	u_int16_t msize;
-	int off, ret;
+	u_int16_t csize = match->compatsize ? : match->matchsize;
+	return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_offset);
 
-	ret = 0;
-	m = ((struct xt_entry_match *)match)->u.kernel.match;
-	off = XT_ALIGN(m->matchsize) - COMPAT_XT_ALIGN(m->matchsize);
-	switch (convert) {
-		case COMPAT_TO_USER:
-			pm = (struct xt_entry_match *)match;
-			msize = pm->u.user.match_size;
-			if (copy_to_user(*dstptr, pm, msize)) {
-				ret = -EFAULT;
-				break;
-			}
-			msize -= off;
-			if (put_user(msize, (u_int16_t *)*dstptr))
-				ret = -EFAULT;
-			*size -= off;
-			*dstptr += msize;
-			break;
-		case COMPAT_FROM_USER:
-			pcompat_m = (struct compat_xt_entry_match *)match;
-			pm = (struct xt_entry_match *)*dstptr;
-			msize = pcompat_m->u.user.match_size;
-			memcpy(pm, pcompat_m, msize);
-			msize += off;
-			pm->u.user.match_size = msize;
-			*size += off;
-			*dstptr += msize;
-			break;
-		case COMPAT_CALC_SIZE:
-			*size += off;
-			break;
-		default:
-			ret = -ENOPROTOOPT;
-			break;
+void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
+			       int *size)
+{
+	struct xt_match *match = m->u.kernel.match;
+	struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m;
+	int pad, off = xt_compat_match_offset(match);
+	u_int16_t msize = cm->u.user.match_size;
+
+	m = *dstptr;
+	memcpy(m, cm, sizeof(*cm));
+	if (match->compat_from_user)
+		match->compat_from_user(m->data, cm->data);
+	else
+		memcpy(m->data, cm->data, msize - sizeof(*cm));
+	pad = XT_ALIGN(match->matchsize) - match->matchsize;
+	if (pad > 0)
+		memset(m->data + match->matchsize, 0, pad);
+
+	msize += off;
+	m->u.user.match_size = msize;
+
+	*size += off;
+	*dstptr += msize;
+}
+EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
+
+int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr,
+			    int *size)
+{
+	struct xt_match *match = m->u.kernel.match;
+	struct compat_xt_entry_match __user *cm = *dstptr;
+	int off = xt_compat_match_offset(match);
+	u_int16_t msize = m->u.user.match_size - off;
+
+	if (copy_to_user(cm, m, sizeof(*cm)) ||
+	    put_user(msize, &cm->u.user.match_size))
+	    	return -EFAULT;
+
+	if (match->compat_to_user) {
+		if (match->compat_to_user((void __user *)cm->data, m->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(cm->data, m->data, msize - sizeof(*cm)))
+			return -EFAULT;
 	}
-	return ret;
+
+	*size -= off;
+	*dstptr += msize;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(xt_compat_match);
-#endif
+EXPORT_SYMBOL_GPL(xt_compat_match_to_user);
+#endif /* CONFIG_COMPAT */
 
 int xt_check_target(const struct xt_target *target, unsigned short family,
 		    unsigned int size, const char *table, unsigned int hook_mask,
@@ -410,51 +423,64 @@ int xt_check_target(const struct xt_target *target, unsigned short family,
 EXPORT_SYMBOL_GPL(xt_check_target);
 
 #ifdef CONFIG_COMPAT
-int xt_compat_target(void *target, void **dstptr, int *size, int convert)
+int xt_compat_target_offset(struct xt_target *target)
 {
-	struct xt_target *t;
-	struct compat_xt_entry_target *pcompat;
-	struct xt_entry_target *pt;
-	u_int16_t tsize;
-	int off, ret;
+	u_int16_t csize = target->compatsize ? : target->targetsize;
+	return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize);
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_offset);
 
-	ret = 0;
-	t = ((struct xt_entry_target *)target)->u.kernel.target;
-	off = XT_ALIGN(t->targetsize) - COMPAT_XT_ALIGN(t->targetsize);
-	switch (convert) {
-		case COMPAT_TO_USER:
-			pt = (struct xt_entry_target *)target;
-			tsize = pt->u.user.target_size;
-			if (copy_to_user(*dstptr, pt, tsize)) {
-				ret = -EFAULT;
-				break;
-			}
-			tsize -= off;
-			if (put_user(tsize, (u_int16_t *)*dstptr))
-				ret = -EFAULT;
-			*size -= off;
-			*dstptr += tsize;
-			break;
-		case COMPAT_FROM_USER:
-			pcompat = (struct compat_xt_entry_target *)target;
-			pt = (struct xt_entry_target *)*dstptr;
-			tsize = pcompat->u.user.target_size;
-			memcpy(pt, pcompat, tsize);
-			tsize += off;
-			pt->u.user.target_size = tsize;
-			*size += off;
-			*dstptr += tsize;
-			break;
-		case COMPAT_CALC_SIZE:
-			*size += off;
-			break;
-		default:
-			ret = -ENOPROTOOPT;
-			break;
+void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
+			        int *size)
+{
+	struct xt_target *target = t->u.kernel.target;
+	struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
+	int pad, off = xt_compat_target_offset(target);
+	u_int16_t tsize = ct->u.user.target_size;
+
+	t = *dstptr;
+	memcpy(t, ct, sizeof(*ct));
+	if (target->compat_from_user)
+		target->compat_from_user(t->data, ct->data);
+	else
+		memcpy(t->data, ct->data, tsize - sizeof(*ct));
+	pad = XT_ALIGN(target->targetsize) - target->targetsize;
+	if (pad > 0)
+		memset(t->data + target->targetsize, 0, pad);
+
+	tsize += off;
+	t->u.user.target_size = tsize;
+
+	*size += off;
+	*dstptr += tsize;
+}
+EXPORT_SYMBOL_GPL(xt_compat_target_from_user);
+
+int xt_compat_target_to_user(struct xt_entry_target *t, void __user **dstptr,
+			     int *size)
+{
+	struct xt_target *target = t->u.kernel.target;
+	struct compat_xt_entry_target __user *ct = *dstptr;
+	int off = xt_compat_target_offset(target);
+	u_int16_t tsize = t->u.user.target_size - off;
+
+	if (copy_to_user(ct, t, sizeof(*ct)) ||
+	    put_user(tsize, &ct->u.user.target_size))
+	    	return -EFAULT;
+
+	if (target->compat_to_user) {
+		if (target->compat_to_user((void __user *)ct->data, t->data))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct)))
+			return -EFAULT;
 	}
-	return ret;
+
+	*size -= off;
+	*dstptr += tsize;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(xt_compat_target);
+EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
 #endif
 
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
-- 
cgit v1.2.3-70-g09d2


From edd5a329cf69c112882e03c8ab55e985062a5d2a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:07:39 -0700
Subject: [NETFILTER]: PPTP conntrack: fix whitespace errors

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_pptp.h | 26 +++----
 net/ipv4/netfilter/ip_conntrack_helper_pptp.c    | 76 ++++++++++----------
 net/ipv4/netfilter/ip_conntrack_proto_gre.c      | 28 ++++----
 net/ipv4/netfilter/ip_nat_helper_pptp.c          | 92 ++++++++++++------------
 net/ipv4/netfilter/ip_nat_proto_gre.c            | 20 +++---
 5 files changed, 121 insertions(+), 121 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
index 816144c75de..88f66d3c876 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
@@ -285,19 +285,19 @@ struct PptpSetLinkInfo {
 };
 
 union pptp_ctrl_union {
-		struct PptpStartSessionRequest	sreq;
-		struct PptpStartSessionReply	srep;
-		struct PptpStopSessionRequest	streq;
-		struct PptpStopSessionReply	strep;
-                struct PptpOutCallRequest       ocreq;
-                struct PptpOutCallReply         ocack;
-                struct PptpInCallRequest        icreq;
-                struct PptpInCallReply          icack;
-                struct PptpInCallConnected      iccon;
-		struct PptpClearCallRequest	clrreq;
-                struct PptpCallDisconnectNotify disc;
-                struct PptpWanErrorNotify       wanerr;
-                struct PptpSetLinkInfo          setlink;
+	struct PptpStartSessionRequest	sreq;
+	struct PptpStartSessionReply	srep;
+	struct PptpStopSessionRequest	streq;
+	struct PptpStopSessionReply	strep;
+	struct PptpOutCallRequest	ocreq;
+	struct PptpOutCallReply		ocack;
+	struct PptpInCallRequest	icreq;
+	struct PptpInCallReply		icack;
+	struct PptpInCallConnected	iccon;
+	struct PptpClearCallRequest	clrreq;
+	struct PptpCallDisconnectNotify disc;
+	struct PptpWanErrorNotify	wanerr;
+	struct PptpSetLinkInfo		setlink;
 };
 
 extern int
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index b020a33e65e..6c94dd5d476 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -20,11 +20,11 @@
  * 	 - We can only support one single call within each session
  *
  * TODO:
- *	 - testing of incoming PPTP calls 
+ *	 - testing of incoming PPTP calls
  *
- * Changes: 
+ * Changes:
  * 	2002-02-05 - Version 1.3
- * 	  - Call ip_conntrack_unexpect_related() from 
+ * 	  - Call ip_conntrack_unexpect_related() from
  * 	    pptp_destroy_siblings() to destroy expectations in case
  * 	    CALL_DISCONNECT_NOTIFY or tcp fin packet was seen
  * 	    (Philip Craig <philipc@snapgear.com>)
@@ -141,7 +141,7 @@ static void pptp_expectfn(struct ip_conntrack *ct,
 		invert_tuplepr(&inv_t, &exp->tuple);
 		DEBUGP("trying to unexpect other dir: ");
 		DUMP_TUPLE(&inv_t);
-	
+
 		exp_other = ip_conntrack_expect_find(&inv_t);
 		if (exp_other) {
 			/* delete other expectation.  */
@@ -194,7 +194,7 @@ static void pptp_destroy_siblings(struct ip_conntrack *ct)
 {
 	struct ip_conntrack_tuple t;
 
-	/* Since ct->sibling_list has literally rusted away in 2.6.11, 
+	/* Since ct->sibling_list has literally rusted away in 2.6.11,
 	 * we now need another way to find out about our sibling
 	 * contrack and expects... -HW */
 
@@ -264,7 +264,7 @@ exp_gre(struct ip_conntrack *master,
 	exp_orig->mask.dst.u.gre.key = htons(0xffff);
 	exp_orig->mask.dst.ip = 0xffffffff;
 	exp_orig->mask.dst.protonum = 0xff;
-		
+
 	exp_orig->master = master;
 	exp_orig->expectfn = pptp_expectfn;
 	exp_orig->flags = 0;
@@ -322,7 +322,7 @@ out_unexpect_orig:
 	goto out_put_both;
 }
 
-static inline int 
+static inline int
 pptp_inbound_pkt(struct sk_buff **pskb,
 		 struct tcphdr *tcph,
 		 unsigned int nexthdr_off,
@@ -336,7 +336,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 	struct ip_ct_pptp_master *info = &ct->help.ct_pptp_info;
 	u_int16_t msg;
 	__be16 *cid, *pcid;
-	u_int32_t seq;	
+	u_int32_t seq;
 
 	ctlh = skb_header_pointer(*pskb, nexthdr_off, sizeof(_ctlh), &_ctlh);
 	if (!ctlh) {
@@ -373,7 +373,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		}
 		if (pptpReq->srep.resultCode == PPTP_START_OK)
 			info->sstate = PPTP_SESSION_CONFIRMED;
-		else 
+		else
 			info->sstate = PPTP_SESSION_ERROR;
 		break;
 
@@ -420,22 +420,22 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		pcid = &pptpReq->ocack.peersCallID;
 
 		info->pac_call_id = ntohs(*cid);
-		
+
 		if (htons(info->pns_call_id) != *pcid) {
 			DEBUGP("%s for unknown callid %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
 		}
 
-		DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], 
+		DEBUGP("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg],
 			ntohs(*cid), ntohs(*pcid));
-		
+
 		info->cstate = PPTP_CALL_OUT_CONF;
 
 		seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
 				       + sizeof(struct PptpControlHeader)
 				       + ((void *)pcid - (void *)pptpReq);
-			
+
 		if (exp_gre(ct, seq, *cid, *pcid) != 0)
 			printk("ip_conntrack_pptp: error during exp_gre\n");
 		break;
@@ -479,7 +479,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		cid = &info->pac_call_id;
 
 		if (info->pns_call_id != ntohs(*pcid)) {
-			DEBUGP("%s for unknown CallID %u\n", 
+			DEBUGP("%s for unknown CallID %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
 		}
@@ -491,7 +491,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		seq = ntohl(tcph->seq) + sizeof(struct pptp_pkt_hdr)
 				       + sizeof(struct PptpControlHeader)
 				       + ((void *)pcid - (void *)pptpReq);
-			
+
 		if (exp_gre(ct, seq, *cid, *pcid) != 0)
 			printk("ip_conntrack_pptp: error during exp_gre\n");
 
@@ -554,7 +554,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		return NF_ACCEPT;
 	nexthdr_off += sizeof(_ctlh);
 	datalen -= sizeof(_ctlh);
-	
+
 	reqlen = datalen;
 	if (reqlen > sizeof(*pptpReq))
 		reqlen = sizeof(*pptpReq);
@@ -606,7 +606,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		/* client answers incoming call */
 		if (info->cstate != PPTP_CALL_IN_REQ
 		    && info->cstate != PPTP_CALL_IN_REP) {
-			DEBUGP("%s without incall_req\n", 
+			DEBUGP("%s without incall_req\n",
 				pptp_msg_name[msg]);
 			break;
 		}
@@ -616,7 +616,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		}
 		pcid = &pptpReq->icack.peersCallID;
 		if (info->pac_call_id != ntohs(*pcid)) {
-			DEBUGP("%s for unknown call %u\n", 
+			DEBUGP("%s for unknown call %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
 		}
@@ -644,12 +644,12 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		/* I don't have to explain these ;) */
 		break;
 	default:
-		DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)? 
+		DEBUGP("invalid %s (TY=%d)\n", (msg <= PPTP_MSG_MAX)?
 			pptp_msg_name[msg]:pptp_msg_name[0], msg);
 		/* unknown: no need to create GRE masq table entry */
 		break;
 	}
-	
+
 	if (ip_nat_pptp_hook_outbound)
 		return ip_nat_pptp_hook_outbound(pskb, ct, ctinfo, ctlh,
 						 pptpReq);
@@ -659,7 +659,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 
 
 /* track caller id inside control connection, call expect_related */
-static int 
+static int
 conntrack_pptp_help(struct sk_buff **pskb,
 		    struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
 
@@ -676,12 +676,12 @@ conntrack_pptp_help(struct sk_buff **pskb,
 	int ret;
 
 	/* don't do any tracking before tcp handshake complete */
-	if (ctinfo != IP_CT_ESTABLISHED 
+	if (ctinfo != IP_CT_ESTABLISHED
 	    && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
 		DEBUGP("ctinfo = %u, skipping\n", ctinfo);
 		return NF_ACCEPT;
 	}
-	
+
 	nexthdr_off = (*pskb)->nh.iph->ihl*4;
 	tcph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_tcph), &_tcph);
 	BUG_ON(!tcph);
@@ -735,28 +735,28 @@ conntrack_pptp_help(struct sk_buff **pskb,
 }
 
 /* control protocol helper */
-static struct ip_conntrack_helper pptp = { 
+static struct ip_conntrack_helper pptp = {
 	.list = { NULL, NULL },
-	.name = "pptp", 
+	.name = "pptp",
 	.me = THIS_MODULE,
 	.max_expected = 2,
 	.timeout = 5 * 60,
-	.tuple = { .src = { .ip = 0, 
-		 	    .u = { .tcp = { .port =  
-				    __constant_htons(PPTP_CONTROL_PORT) } } 
-			  }, 
-		   .dst = { .ip = 0, 
+	.tuple = { .src = { .ip = 0,
+		 	    .u = { .tcp = { .port =
+				    __constant_htons(PPTP_CONTROL_PORT) } }
+			  },
+		   .dst = { .ip = 0,
 			    .u = { .all = 0 },
 			    .protonum = IPPROTO_TCP
-			  } 
+			  }
 		 },
-	.mask = { .src = { .ip = 0, 
-			   .u = { .tcp = { .port = __constant_htons(0xffff) } } 
-			 }, 
-		  .dst = { .ip = 0, 
+	.mask = { .src = { .ip = 0,
+			   .u = { .tcp = { .port = __constant_htons(0xffff) } }
+			 },
+		  .dst = { .ip = 0,
 			   .u = { .all = 0 },
-			   .protonum = 0xff 
-		 	 } 
+			   .protonum = 0xff
+		 	 }
 		},
 	.help = conntrack_pptp_help
 };
@@ -768,7 +768,7 @@ extern int __init ip_ct_proto_gre_init(void);
 static int __init ip_conntrack_helper_pptp_init(void)
 {
 	int retcode;
- 
+
 	retcode = ip_ct_proto_gre_init();
 	if (retcode < 0)
 		return retcode;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index 92c6d8b178c..5fe026f467d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -1,15 +1,15 @@
 /*
- * ip_conntrack_proto_gre.c - Version 3.0 
+ * ip_conntrack_proto_gre.c - Version 3.0
  *
  * Connection tracking protocol helper module for GRE.
  *
  * GRE is a generic encapsulation protocol, which is generally not very
  * suited for NAT, as it has no protocol-specific part as port numbers.
  *
- * It has an optional key field, which may help us distinguishing two 
+ * It has an optional key field, which may help us distinguishing two
  * connections between the same two hosts.
  *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
  *
  * PPTP is built on top of a modified version of GRE, and has a mandatory
  * field called "CallID", which serves us for the same purpose as the key
@@ -61,7 +61,7 @@ MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE");
 #define DEBUGP(x, args...)
 #define DUMP_TUPLE_GRE(x)
 #endif
-				
+
 /* GRE KEYMAP HANDLING FUNCTIONS */
 static LIST_HEAD(gre_keymap_list);
 
@@ -88,7 +88,7 @@ static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t)
 		}
 	}
 	read_unlock_bh(&ip_ct_gre_lock);
-	
+
 	DEBUGP("lookup src key 0x%x up key for ", key);
 	DUMP_TUPLE_GRE(t);
 
@@ -107,7 +107,7 @@ ip_ct_gre_keymap_add(struct ip_conntrack *ct,
 		return -1;
 	}
 
-	if (!reply) 
+	if (!reply)
 		exist_km = &ct->help.ct_pptp_info.keymap_orig;
 	else
 		exist_km = &ct->help.ct_pptp_info.keymap_reply;
@@ -118,7 +118,7 @@ ip_ct_gre_keymap_add(struct ip_conntrack *ct,
 			if (gre_key_cmpfn(km, t) && km == *exist_km)
 				return 0;
 		}
-		DEBUGP("trying to override keymap_%s for ct %p\n", 
+		DEBUGP("trying to override keymap_%s for ct %p\n",
 			reply? "reply":"orig", ct);
 		return -EEXIST;
 	}
@@ -152,7 +152,7 @@ void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct)
 
 	write_lock_bh(&ip_ct_gre_lock);
 	if (ct->help.ct_pptp_info.keymap_orig) {
-		DEBUGP("removing %p from list\n", 
+		DEBUGP("removing %p from list\n",
 			ct->help.ct_pptp_info.keymap_orig);
 		list_del(&ct->help.ct_pptp_info.keymap_orig->list);
 		kfree(ct->help.ct_pptp_info.keymap_orig);
@@ -220,7 +220,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb,
 static int gre_print_tuple(struct seq_file *s,
 			   const struct ip_conntrack_tuple *tuple)
 {
-	return seq_printf(s, "srckey=0x%x dstkey=0x%x ", 
+	return seq_printf(s, "srckey=0x%x dstkey=0x%x ",
 			  ntohs(tuple->src.u.gre.key),
 			  ntohs(tuple->dst.u.gre.key));
 }
@@ -250,14 +250,14 @@ static int gre_packet(struct ip_conntrack *ct,
 	} else
 		ip_ct_refresh_acct(ct, conntrackinfo, skb,
 				   ct->proto.gre.timeout);
-	
+
 	return NF_ACCEPT;
 }
 
 /* Called when a new connection for this protocol found. */
 static int gre_new(struct ip_conntrack *ct,
 		   const struct sk_buff *skb)
-{ 
+{
 	DEBUGP(": ");
 	DUMP_TUPLE_GRE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 
@@ -283,9 +283,9 @@ static void gre_destroy(struct ip_conntrack *ct)
 }
 
 /* protocol helper struct */
-static struct ip_conntrack_protocol gre = { 
+static struct ip_conntrack_protocol gre = {
 	.proto		 = IPPROTO_GRE,
-	.name		 = "gre", 
+	.name		 = "gre",
 	.pkt_to_tuple	 = gre_pkt_to_tuple,
 	.invert_tuple	 = gre_invert_tuple,
 	.print_tuple	 = gre_print_tuple,
@@ -323,7 +323,7 @@ void ip_ct_proto_gre_fini(void)
 	}
 	write_unlock_bh(&ip_ct_gre_lock);
 
-	ip_conntrack_protocol_unregister(&gre); 
+	ip_conntrack_protocol_unregister(&gre);
 }
 
 EXPORT_SYMBOL(ip_ct_gre_keymap_add);
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 1d149964dc3..5dde1da1c30 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -32,7 +32,7 @@
  *     2005-06-10 - Version 3.0
  *       - kernel >= 2.6.11 version,
  *	   funded by Oxcoda NetBox Blue (http://www.netboxblue.com/)
- * 
+ *
  */
 
 #include <linux/module.h>
@@ -93,10 +93,10 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
 		DEBUGP("we are PAC->PNS\n");
 		/* build tuple for PNS->PAC */
 		t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
-		t.src.u.gre.key = 
+		t.src.u.gre.key =
 			htons(master->nat.help.nat_pptp_info.pns_call_id);
 		t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
-		t.dst.u.gre.key = 
+		t.dst.u.gre.key =
 			htons(master->nat.help.nat_pptp_info.pac_call_id);
 		t.dst.protonum = IPPROTO_GRE;
 	}
@@ -153,47 +153,47 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 	unsigned int cid_off;
 
 	new_callid = htons(ct_pptp_info->pns_call_id);
-	
+
 	switch (msg = ntohs(ctlh->messageType)) {
-		case PPTP_OUT_CALL_REQUEST:
-			cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
-			/* FIXME: ideally we would want to reserve a call ID
-			 * here.  current netfilter NAT core is not able to do
-			 * this :( For now we use TCP source port. This breaks
-			 * multiple calls within one control session */
-
-			/* save original call ID in nat_info */
-			nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
-
-			/* don't use tcph->source since we are at a DSTmanip
-			 * hook (e.g. PREROUTING) and pkt is not mangled yet */
-			new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
-			/* save new call ID in ct info */
-			ct_pptp_info->pns_call_id = ntohs(new_callid);
-			break;
-		case PPTP_IN_CALL_REPLY:
-			cid_off = offsetof(union pptp_ctrl_union, icreq.callID);
-			break;
-		case PPTP_CALL_CLEAR_REQUEST:
-			cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
-			break;
-		default:
-			DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
-			      (msg <= PPTP_MSG_MAX)? 
-			      pptp_msg_name[msg]:pptp_msg_name[0]);
-			/* fall through */
-
-		case PPTP_SET_LINK_INFO:
-			/* only need to NAT in case PAC is behind NAT box */
-		case PPTP_START_SESSION_REQUEST:
-		case PPTP_START_SESSION_REPLY:
-		case PPTP_STOP_SESSION_REQUEST:
-		case PPTP_STOP_SESSION_REPLY:
-		case PPTP_ECHO_REQUEST:
-		case PPTP_ECHO_REPLY:
-			/* no need to alter packet */
-			return NF_ACCEPT;
+	case PPTP_OUT_CALL_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
+		/* FIXME: ideally we would want to reserve a call ID
+		 * here.  current netfilter NAT core is not able to do
+		 * this :( For now we use TCP source port. This breaks
+		 * multiple calls within one control session */
+
+		/* save original call ID in nat_info */
+		nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;
+
+		/* don't use tcph->source since we are at a DSTmanip
+		 * hook (e.g. PREROUTING) and pkt is not mangled yet */
+		new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+		/* save new call ID in ct info */
+		ct_pptp_info->pns_call_id = ntohs(new_callid);
+		break;
+	case PPTP_IN_CALL_REPLY:
+		cid_off = offsetof(union pptp_ctrl_union, icreq.callID);
+		break;
+	case PPTP_CALL_CLEAR_REQUEST:
+		cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
+		break;
+	default:
+		DEBUGP("unknown outbound packet 0x%04x:%s\n", msg,
+		      (msg <= PPTP_MSG_MAX)?
+		      pptp_msg_name[msg]:pptp_msg_name[0]);
+		/* fall through */
+
+	case PPTP_SET_LINK_INFO:
+		/* only need to NAT in case PAC is behind NAT box */
+	case PPTP_START_SESSION_REQUEST:
+	case PPTP_START_SESSION_REPLY:
+	case PPTP_STOP_SESSION_REQUEST:
+	case PPTP_STOP_SESSION_REPLY:
+	case PPTP_ECHO_REQUEST:
+	case PPTP_ECHO_REPLY:
+		/* no need to alter packet */
+		return NF_ACCEPT;
 	}
 
 	/* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
@@ -216,9 +216,9 @@ static int
 pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 	     struct ip_conntrack_expect *expect_reply)
 {
-	struct ip_ct_pptp_master *ct_pptp_info = 
+	struct ip_ct_pptp_master *ct_pptp_info =
 				&expect_orig->master->help.ct_pptp_info;
-	struct ip_nat_pptp *nat_pptp_info = 
+	struct ip_nat_pptp *nat_pptp_info =
 				&expect_orig->master->nat.help.nat_pptp_info;
 
 	struct ip_conntrack *ct = expect_orig->master;
@@ -324,7 +324,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		break;
 
 	default:
-		DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)? 
+		DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)?
 			pptp_msg_name[msg]:pptp_msg_name[0]);
 		/* fall through */
 
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index 70a65372225..a5226691f02 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -6,10 +6,10 @@
  * GRE is a generic encapsulation protocol, which is generally not very
  * suited for NAT, as it has no protocol-specific part as port numbers.
  *
- * It has an optional key field, which may help us distinguishing two 
+ * It has an optional key field, which may help us distinguishing two
  * connections between the same two hosts.
  *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 
+ * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
  *
  * PPTP is built on top of a modified version of GRE, and has a mandatory
  * field called "CallID", which serves us for the same purpose as the key
@@ -60,7 +60,7 @@ gre_in_range(const struct ip_conntrack_tuple *tuple,
 }
 
 /* generate unique tuple ... */
-static int 
+static int
 gre_unique_tuple(struct ip_conntrack_tuple *tuple,
 		 const struct ip_nat_range *range,
 		 enum ip_nat_manip_type maniptype,
@@ -84,7 +84,7 @@ gre_unique_tuple(struct ip_conntrack_tuple *tuple,
 		range_size = ntohs(range->max.gre.key) - min + 1;
 	}
 
-	DEBUGP("min = %u, range_size = %u\n", min, range_size); 
+	DEBUGP("min = %u, range_size = %u\n", min, range_size);
 
 	for (i = 0; i < range_size; i++, key++) {
 		*keyptr = htons(min + key % range_size);
@@ -117,7 +117,7 @@ gre_manip_pkt(struct sk_buff **pskb,
 	greh = (void *)(*pskb)->data + hdroff;
 	pgreh = (struct gre_hdr_pptp *) greh;
 
-	/* we only have destination manip of a packet, since 'source key' 
+	/* we only have destination manip of a packet, since 'source key'
 	 * is not present in the packet itself */
 	if (maniptype == IP_NAT_MANIP_DST) {
 		/* key manipulation is always dest */
@@ -129,7 +129,7 @@ gre_manip_pkt(struct sk_buff **pskb,
 			}
 			if (greh->csum) {
 				/* FIXME: Never tested this code... */
-				*(gre_csum(greh)) = 
+				*(gre_csum(greh)) =
 					nf_proto_csum_update(*pskb,
 							~*(gre_key(greh)),
 							tuple->dst.u.gre.key,
@@ -138,7 +138,7 @@ gre_manip_pkt(struct sk_buff **pskb,
 			*(gre_key(greh)) = tuple->dst.u.gre.key;
 			break;
 		case GRE_VERSION_PPTP:
-			DEBUGP("call_id -> 0x%04x\n", 
+			DEBUGP("call_id -> 0x%04x\n",
 				ntohs(tuple->dst.u.gre.key));
 			pgreh->call_id = tuple->dst.u.gre.key;
 			break;
@@ -152,8 +152,8 @@ gre_manip_pkt(struct sk_buff **pskb,
 }
 
 /* nat helper struct */
-static struct ip_nat_protocol gre = { 
-	.name		= "GRE", 
+static struct ip_nat_protocol gre = {
+	.name		= "GRE",
 	.protonum	= IPPROTO_GRE,
 	.manip_pkt	= gre_manip_pkt,
 	.in_range	= gre_in_range,
@@ -164,7 +164,7 @@ static struct ip_nat_protocol gre = {
 	.nfattr_to_range	= ip_nat_port_nfattr_to_range,
 #endif
 };
-				  
+
 int __init ip_nat_proto_gre_init(void)
 {
 	return ip_nat_protocol_register(&gre);
-- 
cgit v1.2.3-70-g09d2


From 955b944293dd4c931ec866ebe19a6b2463b8f9a0 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:08:03 -0700
Subject: [NETFILTER]: PPTP conntrack: get rid of unnecessary byte order
 conversions

The conntrack structure contains the call ID in host byte order for no
reason, get rid of back and forth conversions.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_pptp.h   |  8 ++---
 .../linux/netfilter_ipv4/ip_conntrack_proto_gre.h  | 22 ++++++------
 include/linux/netfilter_ipv4/ip_nat_pptp.h         |  4 +--
 net/ipv4/netfilter/ip_conntrack_helper_pptp.c      | 22 ++++++------
 net/ipv4/netfilter/ip_nat_helper_pptp.c            | 42 +++++++++++-----------
 net/ipv4/netfilter/ip_nat_proto_gre.c              |  2 +-
 6 files changed, 50 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
index 88f66d3c876..0d35623f945 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
@@ -31,8 +31,8 @@ struct ip_ct_pptp_master {
 	/* everything below is going to be per-expectation in newnat,
 	 * since there could be more than one call within one session */
 	enum pptp_ctrlcall_state cstate;	/* call state */
-	u_int16_t pac_call_id;			/* call id of PAC, host byte order */
-	u_int16_t pns_call_id;			/* call id of PNS, host byte order */
+	__be16 pac_call_id;			/* call id of PAC, host byte order */
+	__be16 pns_call_id;			/* call id of PNS, host byte order */
 
 	/* in pre-2.6.11 this used to be per-expect. Now it is per-conntrack
 	 * and therefore imposes a fixed limit on the number of maps */
@@ -42,8 +42,8 @@ struct ip_ct_pptp_master {
 /* conntrack_expect private member */
 struct ip_ct_pptp_expect {
 	enum pptp_ctrlcall_state cstate; 	/* call state */
-	u_int16_t pac_call_id;			/* call id of PAC */
-	u_int16_t pns_call_id;			/* call id of PNS */
+	__be16 pac_call_id;			/* call id of PAC */
+	__be16 pns_call_id;			/* call id of PNS */
 };
 
 
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h b/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h
index 8d090ef82f5..1d853aa873e 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_proto_gre.h
@@ -49,18 +49,18 @@ struct gre_hdr {
 #else
 #error "Adjust your <asm/byteorder.h> defines"
 #endif
-	__u16	protocol;
+	__be16	protocol;
 };
 
 /* modified GRE header for PPTP */
 struct gre_hdr_pptp {
-	__u8  flags;		/* bitfield */
-	__u8  version;		/* should be GRE_VERSION_PPTP */
-	__u16 protocol;		/* should be GRE_PROTOCOL_PPTP */
-	__u16 payload_len;	/* size of ppp payload, not inc. gre header */
-	__u16 call_id;		/* peer's call_id for this session */
-	__u32 seq;		/* sequence number.  Present if S==1 */
-	__u32 ack;		/* seq number of highest packet recieved by */
+	__u8   flags;		/* bitfield */
+	__u8   version;		/* should be GRE_VERSION_PPTP */
+	__be16 protocol;	/* should be GRE_PROTOCOL_PPTP */
+	__be16 payload_len;	/* size of ppp payload, not inc. gre header */
+	__be16 call_id;		/* peer's call_id for this session */
+	__be32 seq;		/* sequence number.  Present if S==1 */
+	__be32 ack;		/* seq number of highest packet recieved by */
 				/*  sender in this session */
 };
 
@@ -92,13 +92,13 @@ void ip_ct_gre_keymap_destroy(struct ip_conntrack *ct);
 
 
 /* get pointer to gre key, if present */
-static inline u_int32_t *gre_key(struct gre_hdr *greh)
+static inline __be32 *gre_key(struct gre_hdr *greh)
 {
 	if (!greh->key)
 		return NULL;
 	if (greh->csum || greh->routing)
-		return (u_int32_t *) (greh+sizeof(*greh)+4);
-	return (u_int32_t *) (greh+sizeof(*greh));
+		return (__be32 *) (greh+sizeof(*greh)+4);
+	return (__be32 *) (greh+sizeof(*greh));
 }
 
 /* get pointer ot gre csum, if present */
diff --git a/include/linux/netfilter_ipv4/ip_nat_pptp.h b/include/linux/netfilter_ipv4/ip_nat_pptp.h
index eaf66c2e8f9..36668bf0f37 100644
--- a/include/linux/netfilter_ipv4/ip_nat_pptp.h
+++ b/include/linux/netfilter_ipv4/ip_nat_pptp.h
@@ -4,8 +4,8 @@
 
 /* conntrack private data */
 struct ip_nat_pptp {
-	u_int16_t pns_call_id;		/* NAT'ed PNS call id */
-	u_int16_t pac_call_id;		/* NAT'ed PAC call id */
+	__be16 pns_call_id;		/* NAT'ed PNS call id */
+	__be16 pac_call_id;		/* NAT'ed PAC call id */
 };
 
 #endif /* _NAT_PPTP_H */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 6c94dd5d476..57637ca2b82 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -201,8 +201,8 @@ static void pptp_destroy_siblings(struct ip_conntrack *ct)
 	/* try original (pns->pac) tuple */
 	memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t));
 	t.dst.protonum = IPPROTO_GRE;
-	t.src.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
-	t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
+	t.src.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
+	t.dst.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
 
 	if (!destroy_sibling_or_exp(&t))
 		DEBUGP("failed to timeout original pns->pac ct/exp\n");
@@ -210,8 +210,8 @@ static void pptp_destroy_siblings(struct ip_conntrack *ct)
 	/* try reply (pac->pns) tuple */
 	memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t));
 	t.dst.protonum = IPPROTO_GRE;
-	t.src.u.gre.key = htons(ct->help.ct_pptp_info.pac_call_id);
-	t.dst.u.gre.key = htons(ct->help.ct_pptp_info.pns_call_id);
+	t.src.u.gre.key = ct->help.ct_pptp_info.pac_call_id;
+	t.dst.u.gre.key = ct->help.ct_pptp_info.pns_call_id;
 
 	if (!destroy_sibling_or_exp(&t))
 		DEBUGP("failed to timeout reply pac->pns ct/exp\n");
@@ -419,9 +419,9 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		cid = &pptpReq->ocack.callID;
 		pcid = &pptpReq->ocack.peersCallID;
 
-		info->pac_call_id = ntohs(*cid);
+		info->pac_call_id = *cid;
 
-		if (htons(info->pns_call_id) != *pcid) {
+		if (info->pns_call_id != *pcid) {
 			DEBUGP("%s for unknown callid %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
@@ -454,7 +454,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		pcid = &pptpReq->icack.peersCallID;
 		DEBUGP("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
 		info->cstate = PPTP_CALL_IN_REQ;
-		info->pac_call_id = ntohs(*pcid);
+		info->pac_call_id = *pcid;
 		break;
 
 	case PPTP_IN_CALL_CONNECT:
@@ -478,7 +478,7 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		pcid = &pptpReq->iccon.peersCallID;
 		cid = &info->pac_call_id;
 
-		if (info->pns_call_id != ntohs(*pcid)) {
+		if (info->pns_call_id != *pcid) {
 			DEBUGP("%s for unknown CallID %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
@@ -595,7 +595,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		/* track PNS call id */
 		cid = &pptpReq->ocreq.callID;
 		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*cid));
-		info->pns_call_id = ntohs(*cid);
+		info->pns_call_id = *cid;
 		break;
 	case PPTP_IN_CALL_REPLY:
 		if (reqlen < sizeof(_pptpReq.icack)) {
@@ -615,7 +615,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 			break;
 		}
 		pcid = &pptpReq->icack.peersCallID;
-		if (info->pac_call_id != ntohs(*pcid)) {
+		if (info->pac_call_id != *pcid) {
 			DEBUGP("%s for unknown call %u\n",
 				pptp_msg_name[msg], ntohs(*pcid));
 			break;
@@ -623,7 +623,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		DEBUGP("%s, CID=%X\n", pptp_msg_name[msg], ntohs(*pcid));
 		/* part two of the three-way handshake */
 		info->cstate = PPTP_CALL_IN_REP;
-		info->pns_call_id = ntohs(pptpReq->icack.callID);
+		info->pns_call_id = pptpReq->icack.callID;
 		break;
 
 	case PPTP_CALL_CLEAR_REQUEST:
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 5dde1da1c30..6e8bd6b3431 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -85,19 +85,17 @@ static void pptp_nat_expected(struct ip_conntrack *ct,
 		DEBUGP("we are PNS->PAC\n");
 		/* therefore, build tuple for PAC->PNS */
 		t.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
-		t.src.u.gre.key = htons(master->help.ct_pptp_info.pac_call_id);
+		t.src.u.gre.key = master->help.ct_pptp_info.pac_call_id;
 		t.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
-		t.dst.u.gre.key = htons(master->help.ct_pptp_info.pns_call_id);
+		t.dst.u.gre.key = master->help.ct_pptp_info.pns_call_id;
 		t.dst.protonum = IPPROTO_GRE;
 	} else {
 		DEBUGP("we are PAC->PNS\n");
 		/* build tuple for PNS->PAC */
 		t.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
-		t.src.u.gre.key =
-			htons(master->nat.help.nat_pptp_info.pns_call_id);
+		t.src.u.gre.key = master->nat.help.nat_pptp_info.pns_call_id;
 		t.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
-		t.dst.u.gre.key =
-			htons(master->nat.help.nat_pptp_info.pac_call_id);
+		t.dst.u.gre.key = master->nat.help.nat_pptp_info.pac_call_id;
 		t.dst.protonum = IPPROTO_GRE;
 	}
 
@@ -149,10 +147,11 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 {
 	struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
 	struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
-	u_int16_t msg, new_callid;
+	u_int16_t msg;
+	__be16 new_callid;
 	unsigned int cid_off;
 
-	new_callid = htons(ct_pptp_info->pns_call_id);
+	new_callid = ct_pptp_info->pns_call_id;
 
 	switch (msg = ntohs(ctlh->messageType)) {
 	case PPTP_OUT_CALL_REQUEST:
@@ -170,7 +169,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 		new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
 
 		/* save new call ID in ct info */
-		ct_pptp_info->pns_call_id = ntohs(new_callid);
+		ct_pptp_info->pns_call_id = new_callid;
 		break;
 	case PPTP_IN_CALL_REPLY:
 		cid_off = offsetof(union pptp_ctrl_union, icreq.callID);
@@ -235,14 +234,14 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 
 	/* alter expectation for PNS->PAC direction */
 	invert_tuplepr(&inv_t, &expect_orig->tuple);
-	expect_orig->saved_proto.gre.key = htons(ct_pptp_info->pns_call_id);
-	expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
-	expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+	expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
+	expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
+	expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
 	expect_orig->dir = IP_CT_DIR_ORIGINAL;
 	inv_t.src.ip = reply_t->src.ip;
 	inv_t.dst.ip = reply_t->dst.ip;
-	inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
-	inv_t.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+	inv_t.src.u.gre.key = nat_pptp_info->pac_call_id;
+	inv_t.dst.u.gre.key = ct_pptp_info->pns_call_id;
 
 	if (!ip_conntrack_expect_related(expect_orig)) {
 		DEBUGP("successfully registered expect\n");
@@ -253,14 +252,14 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 
 	/* alter expectation for PAC->PNS direction */
 	invert_tuplepr(&inv_t, &expect_reply->tuple);
-	expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id);
-	expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id);
-	expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id);
+	expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
+	expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
+	expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
 	expect_reply->dir = IP_CT_DIR_REPLY;
 	inv_t.src.ip = orig_t->src.ip;
 	inv_t.dst.ip = orig_t->dst.ip;
-	inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id);
-	inv_t.dst.u.gre.key = htons(ct_pptp_info->pac_call_id);
+	inv_t.src.u.gre.key = nat_pptp_info->pns_call_id;
+	inv_t.dst.u.gre.key = ct_pptp_info->pac_call_id;
 
 	if (!ip_conntrack_expect_related(expect_reply)) {
 		DEBUGP("successfully registered expect\n");
@@ -297,10 +296,11 @@ pptp_inbound_pkt(struct sk_buff **pskb,
 		 union pptp_ctrl_union *pptpReq)
 {
 	struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
-	u_int16_t msg, new_cid = 0, new_pcid;
+	u_int16_t msg, new_cid = 0;
+	__be16 new_pcid;
 	unsigned int pcid_off, cid_off = 0;
 
-	new_pcid = htons(nat_pptp_info->pns_call_id);
+	new_pcid = nat_pptp_info->pns_call_id;
 
 	switch (msg = ntohs(ctlh->messageType)) {
 	case PPTP_OUT_CALL_REPLY:
diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c
index a5226691f02..bf91f9312b3 100644
--- a/net/ipv4/netfilter/ip_nat_proto_gre.c
+++ b/net/ipv4/netfilter/ip_nat_proto_gre.c
@@ -67,7 +67,7 @@ gre_unique_tuple(struct ip_conntrack_tuple *tuple,
 		 const struct ip_conntrack *conntrack)
 {
 	static u_int16_t key;
-	u_int16_t *keyptr;
+	__be16 *keyptr;
 	unsigned int min, i, range_size;
 
 	if (maniptype == IP_NAT_MANIP_SRC)
-- 
cgit v1.2.3-70-g09d2


From 6013c0a13e335674a783215e182c367406294392 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:08:56 -0700
Subject: [NETFILTER]: PPTP conntrack: fix header definitions

Fix a few header definitions to match RFC2637. Most importantly the
PptpOutCallRequest header included an invalid padding field and a
size check was disabled because of this.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_pptp.h | 9 +++++----
 net/ipv4/netfilter/ip_conntrack_helper_pptp.c    | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
index 0d35623f945..620bf06fabc 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
@@ -107,8 +107,7 @@ struct PptpControlHeader {
 
 struct PptpStartSessionRequest {
 	__be16	protocolVersion;
-	__u8	reserved1;
-	__u8	reserved2;
+	__u16	reserved1;
 	__be32	framingCapability;
 	__be32	bearerCapability;
 	__be16	maxChannels;
@@ -143,6 +142,8 @@ struct PptpStartSessionReply {
 
 struct PptpStopSessionRequest {
 	__u8	reason;
+	__u8	reserved1;
+	__u16	reserved2;
 };
 
 /* PptpStopSessionResultCode */
@@ -152,6 +153,7 @@ struct PptpStopSessionRequest {
 struct PptpStopSessionReply {
 	__u8	resultCode;
 	__u8	generalErrorCode;
+	__u16	reserved1;
 };
 
 struct PptpEchoRequest {
@@ -188,9 +190,8 @@ struct PptpOutCallRequest {
 	__be32	framingType;
 	__be16	packetWindow;
 	__be16	packetProcDelay;
-	__u16	reserved1;
 	__be16	phoneNumberLength;
-	__u16	reserved2;
+	__u16	reserved1;
 	__u8	phoneNumber[64];
 	__u8	subAddress[64];
 };
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 0510ee50dc6..1a8da9015d8 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -569,7 +569,7 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 	case PPTP_OUT_CALL_REQUEST:
 		if (reqlen < sizeof(_pptpReq.ocreq)) {
 			DEBUGP("%s: short packet\n", pptp_msg_name[msg]);
-			/* FIXME: break; */
+			break;
 		}
 
 		/* client initiating connection to server */
-- 
cgit v1.2.3-70-g09d2


From cf9f81523ef3e95d9f222c896d266e4562999150 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:09:34 -0700
Subject: [NETFILTER]: PPTP conntrack: simplify expectation handling

Remove duplicated expectation handling in the NAT helper and simplify
the remains in the conntrack helper.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_pptp.h |  2 +-
 net/ipv4/netfilter/ip_conntrack_helper_pptp.c    | 92 ++++++++----------------
 net/ipv4/netfilter/ip_nat_helper_pptp.c          | 58 +--------------
 3 files changed, 35 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
index 620bf06fabc..2644b1faddd 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_pptp.h
@@ -315,7 +315,7 @@ extern int
 			  struct PptpControlHeader *ctlh,
 			  union pptp_ctrl_union *pptpReq);
 
-extern int
+extern void
 (*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *exp_orig,
 			    struct ip_conntrack_expect *exp_reply);
 
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 5f7af6ef388..57eac6e3871 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -80,7 +80,7 @@ int
 			  struct PptpControlHeader *ctlh,
 			  union pptp_ctrl_union *pptpReq);
 
-int
+void
 (*ip_nat_pptp_hook_exp_gre)(struct ip_conntrack_expect *expect_orig,
 			    struct ip_conntrack_expect *expect_reply);
 
@@ -219,93 +219,63 @@ static void pptp_destroy_siblings(struct ip_conntrack *ct)
 
 /* expect GRE connections (PNS->PAC and PAC->PNS direction) */
 static inline int
-exp_gre(struct ip_conntrack *master,
+exp_gre(struct ip_conntrack *ct,
 	__be16 callid,
 	__be16 peer_callid)
 {
-	struct ip_conntrack_tuple inv_tuple;
-	struct ip_conntrack_tuple exp_tuples[] = {
-		/* tuple in original direction, PNS->PAC */
-		{ .src = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip,
-			   .u = { .gre = { .key = peer_callid } }
-			 },
-		  .dst = { .ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip,
-			   .u = { .gre = { .key = callid } },
-			   .protonum = IPPROTO_GRE
-			 },
-		 },
-		/* tuple in reply direction, PAC->PNS */
-		{ .src = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
-			   .u = { .gre = { .key = callid } }
-			 },
-		  .dst = { .ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
-			   .u = { .gre = { .key = peer_callid } },
-			   .protonum = IPPROTO_GRE
-			 },
-		 }
-	};
 	struct ip_conntrack_expect *exp_orig, *exp_reply;
 	int ret = 1;
 
-	exp_orig = ip_conntrack_expect_alloc(master);
+	exp_orig = ip_conntrack_expect_alloc(ct);
 	if (exp_orig == NULL)
 		goto out;
 
-	exp_reply = ip_conntrack_expect_alloc(master);
+	exp_reply = ip_conntrack_expect_alloc(ct);
 	if (exp_reply == NULL)
 		goto out_put_orig;
 
-	memcpy(&exp_orig->tuple, &exp_tuples[0], sizeof(exp_orig->tuple));
+	/* original direction, PNS->PAC */
+	exp_orig->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
+	exp_orig->tuple.src.u.gre.key = peer_callid;
+	exp_orig->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+	exp_orig->tuple.dst.u.gre.key = callid;
+	exp_orig->tuple.dst.protonum = IPPROTO_GRE;
 
 	exp_orig->mask.src.ip = 0xffffffff;
 	exp_orig->mask.src.u.all = 0;
-	exp_orig->mask.dst.u.all = 0;
 	exp_orig->mask.dst.u.gre.key = htons(0xffff);
 	exp_orig->mask.dst.ip = 0xffffffff;
 	exp_orig->mask.dst.protonum = 0xff;
 
-	exp_orig->master = master;
+	exp_orig->master = ct;
 	exp_orig->expectfn = pptp_expectfn;
 	exp_orig->flags = 0;
 
 	/* both expectations are identical apart from tuple */
 	memcpy(exp_reply, exp_orig, sizeof(*exp_reply));
-	memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple));
-
-	if (ip_nat_pptp_hook_exp_gre)
-		ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
-	else {
 
-		DEBUGP("calling expect_related PNS->PAC");
-		DUMP_TUPLE(&exp_orig->tuple);
+	/* reply direction, PAC->PNS */
+	exp_reply->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
+	exp_reply->tuple.src.u.gre.key = callid;
+	exp_reply->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
+	exp_reply->tuple.dst.u.gre.key = peer_callid;
+	exp_reply->tuple.dst.protonum = IPPROTO_GRE;
 
-		if (ip_conntrack_expect_related(exp_orig) != 0) {
-			DEBUGP("cannot expect_related()\n");
-			goto out_put_both;
-		}
-
-		DEBUGP("calling expect_related PAC->PNS");
-		DUMP_TUPLE(&exp_reply->tuple);
-
-		if (ip_conntrack_expect_related(exp_reply) != 0) {
-			DEBUGP("cannot expect_related()\n");
-			goto out_unexpect_orig;
-		}
-
-		/* Add GRE keymap entries */
-		if (ip_ct_gre_keymap_add(master, &exp_reply->tuple, 0) != 0) {
-			DEBUGP("cannot keymap_add() exp\n");
-			goto out_unexpect_both;
-		}
-
-		invert_tuplepr(&inv_tuple, &exp_reply->tuple);
-		if (ip_ct_gre_keymap_add(master, &inv_tuple, 1) != 0) {
-			ip_ct_gre_keymap_destroy(master);
-			DEBUGP("cannot keymap_add() exp_inv\n");
-			goto out_unexpect_both;
-		}
-		ret = 0;
+	if (ip_nat_pptp_hook_exp_gre)
+		ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply);
+	if (ip_conntrack_expect_related(exp_orig) != 0)
+		goto out_put_both;
+	if (ip_conntrack_expect_related(exp_reply) != 0)
+		goto out_unexpect_orig;
+
+	/* Add GRE keymap entries */
+	if (ip_ct_gre_keymap_add(ct, &exp_orig->tuple, 0) != 0)
+		goto out_unexpect_both;
+	if (ip_ct_gre_keymap_add(ct, &exp_reply->tuple, 1) != 0) {
+		ip_ct_gre_keymap_destroy(ct);
+		goto out_unexpect_both;
 	}
+	ret = 0;
 
 out_put_both:
 	ip_conntrack_expect_put(exp_reply);
diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c
index 0f5e753b481..84f6bd09fcd 100644
--- a/net/ipv4/netfilter/ip_nat_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c
@@ -211,80 +211,28 @@ pptp_outbound_pkt(struct sk_buff **pskb,
 	return NF_ACCEPT;
 }
 
-static int
+static void
 pptp_exp_gre(struct ip_conntrack_expect *expect_orig,
 	     struct ip_conntrack_expect *expect_reply)
 {
-	struct ip_ct_pptp_master *ct_pptp_info =
-				&expect_orig->master->help.ct_pptp_info;
-	struct ip_nat_pptp *nat_pptp_info =
-				&expect_orig->master->nat.help.nat_pptp_info;
-
 	struct ip_conntrack *ct = expect_orig->master;
-
-	struct ip_conntrack_tuple inv_t;
-	struct ip_conntrack_tuple *orig_t, *reply_t;
+	struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info;
+	struct ip_nat_pptp *nat_pptp_info = &ct->nat.help.nat_pptp_info;
 
 	/* save original PAC call ID in nat_info */
 	nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
 
-	/* alter expectation */
-	orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-
 	/* alter expectation for PNS->PAC direction */
-	invert_tuplepr(&inv_t, &expect_orig->tuple);
 	expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
 	expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
 	expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
 	expect_orig->dir = IP_CT_DIR_ORIGINAL;
-	inv_t.src.ip = reply_t->src.ip;
-	inv_t.dst.ip = reply_t->dst.ip;
-	inv_t.src.u.gre.key = nat_pptp_info->pac_call_id;
-	inv_t.dst.u.gre.key = ct_pptp_info->pns_call_id;
-
-	if (!ip_conntrack_expect_related(expect_orig)) {
-		DEBUGP("successfully registered expect\n");
-	} else {
-		DEBUGP("can't expect_related(expect_orig)\n");
-		return 1;
-	}
 
 	/* alter expectation for PAC->PNS direction */
-	invert_tuplepr(&inv_t, &expect_reply->tuple);
 	expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
 	expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
 	expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
 	expect_reply->dir = IP_CT_DIR_REPLY;
-	inv_t.src.ip = orig_t->src.ip;
-	inv_t.dst.ip = orig_t->dst.ip;
-	inv_t.src.u.gre.key = nat_pptp_info->pns_call_id;
-	inv_t.dst.u.gre.key = ct_pptp_info->pac_call_id;
-
-	if (!ip_conntrack_expect_related(expect_reply)) {
-		DEBUGP("successfully registered expect\n");
-	} else {
-		DEBUGP("can't expect_related(expect_reply)\n");
-		ip_conntrack_unexpect_related(expect_orig);
-		return 1;
-	}
-
-	if (ip_ct_gre_keymap_add(ct, &expect_reply->tuple, 0) < 0) {
-		DEBUGP("can't register original keymap\n");
-		ip_conntrack_unexpect_related(expect_orig);
-		ip_conntrack_unexpect_related(expect_reply);
-		return 1;
-	}
-
-	if (ip_ct_gre_keymap_add(ct, &inv_t, 1) < 0) {
-		DEBUGP("can't register reply keymap\n");
-		ip_conntrack_unexpect_related(expect_orig);
-		ip_conntrack_unexpect_related(expect_reply);
-		ip_ct_gre_keymap_destroy(ct);
-		return 1;
-	}
-
-	return 0;
 }
 
 /* inbound packets == from PAC to PNS */
-- 
cgit v1.2.3-70-g09d2


From 4c5de695cf7f71c85ad8cfff509f6475b8bd4d27 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Wed, 20 Sep 2006 12:11:30 -0700
Subject: [NETFILTER]: PPTP conntrack: fix another GRE keymap leak

When the master PPTP connection times out while still having unfullfilled
expectations (and a GRE keymap entry) associated with it, the keymap entry
is not destroyed.

Add a destroy callback to struct ip_conntrack_helper and use it to destroy
PPTP siblings when the master is destroyed.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack_helper.h |  2 ++
 net/ipv4/netfilter/ip_conntrack_core.c             |  5 +++++
 net/ipv4/netfilter/ip_conntrack_helper_pptp.c      | 12 ++----------
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper.h b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
index 8d69279ccfe..77fe868d36f 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_helper.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
@@ -25,6 +25,8 @@ struct ip_conntrack_helper
 		    struct ip_conntrack *ct,
 		    enum ip_conntrack_info conntrackinfo);
 
+	void (*destroy)(struct ip_conntrack *ct);
+
 	int (*to_nfattr)(struct sk_buff *skb, const struct ip_conntrack *ct);
 };
 
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 2b6f24fc727..c432b316360 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -307,6 +307,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 {
 	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
 	struct ip_conntrack_protocol *proto;
+	struct ip_conntrack_helper *helper;
 
 	DEBUGP("destroy_conntrack(%p)\n", ct);
 	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
@@ -315,6 +316,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	ip_conntrack_event(IPCT_DESTROY, ct);
 	set_bit(IPS_DYING_BIT, &ct->status);
 
+	helper = ct->helper;
+	if (helper && helper->destroy)
+		helper->destroy(ct);
+
 	/* To make sure we don't get any weird locking issues here:
 	 * destroy_conntrack() MUST NOT be called with a write lock
 	 * to ip_conntrack_lock!!! -HW */
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
index 98267b0d2a4..fb0aee69172 100644
--- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
+++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c
@@ -553,15 +553,6 @@ conntrack_pptp_help(struct sk_buff **pskb,
 	nexthdr_off += tcph->doff * 4;
  	datalen = tcplen - tcph->doff * 4;
 
-	if (tcph->fin || tcph->rst) {
-		DEBUGP("RST/FIN received, timeouting GRE\n");
-		/* can't do this after real newnat */
-		info->cstate = PPTP_CALL_NONE;
-
-		/* untrack this call id, unexpect GRE packets */
-		pptp_destroy_siblings(ct);
-	}
-
 	pptph = skb_header_pointer(*pskb, nexthdr_off, sizeof(_pptph), &_pptph);
 	if (!pptph) {
 		DEBUGP("no full PPTP header, can't track\n");
@@ -640,7 +631,8 @@ static struct ip_conntrack_helper pptp = {
 			   .protonum = 0xff
 		 	 }
 		},
-	.help = conntrack_pptp_help
+	.help = conntrack_pptp_help,
+	.destroy = pptp_destroy_siblings,
 };
 
 extern void ip_ct_proto_gre_fini(void);
-- 
cgit v1.2.3-70-g09d2


From 62dd93181aaa1d5a501a9cebcb254f44b8a48af7 Mon Sep 17 00:00:00 2001
From: Ville Nuorvala <vnuorval@tcs.hut.fi>
Date: Fri, 22 Sep 2006 14:43:19 -0700
Subject: [IPV6] NDISC: Set per-entry is_router flag in Proxy NA.

We have sent NA with router flag from the node-wide forwarding
configuration.  This is not appropriate for proxy NA, and it should be
set according to each proxy entry's configuration.

This is used by Mobile IPv6 home agent to support physical home link
in acting as a proxy router for mobile node which is not a router,
for example.

Based on MIPL2 kernel patch.

Signed-off-by: Ville Nuorvala <vnuorval@tcs.hut.fi>
Signed-off-by: Masahide NAKAMURA <nakam@linux-ipv6.org>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 include/net/neighbour.h |  1 +
 net/core/neighbour.c    | 11 ++++++++---
 net/ipv6/ndisc.c        | 14 +++++++++++---
 3 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index bd187daffdb..c8aacbd2e33 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -126,6 +126,7 @@ struct pneigh_entry
 {
 	struct pneigh_entry	*next;
 	struct net_device		*dev;
+	u8			flags;
 	u8			key[0];
 };
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a45bd2124d6..b6c69e1463e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1544,9 +1544,14 @@ int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
 
 		if (ndm->ndm_flags & NTF_PROXY) {
-			err = 0;
-			if (pneigh_lookup(tbl, dst, dev, 1) == NULL)
-				err = -ENOBUFS;
+			struct pneigh_entry *pn;
+
+			err = -ENOBUFS;
+			pn = pneigh_lookup(tbl, dst, dev, 1);
+			if (pn) {
+				pn->flags = ndm->ndm_flags;
+				err = 0;
+			}
 			goto out_dev_put;
 		}
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0e0d6ce6902..ddf038636f0 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -736,8 +736,10 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev = NULL;
 	struct neighbour *neigh;
+	struct pneigh_entry *pneigh = NULL;
 	int dad = ipv6_addr_any(saddr);
 	int inc;
+	int is_router;
 
 	if (ipv6_addr_is_multicast(&msg->target)) {
 		ND_PRINTK2(KERN_WARNING 
@@ -822,7 +824,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 
 		if (ipv6_chk_acast_addr(dev, &msg->target) ||
 		    (idev->cnf.forwarding && 
-		     pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) {
+		     (pneigh = pneigh_lookup(&nd_tbl,
+					     &msg->target, dev, 0)) != NULL)) {
 			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
 			    skb->pkt_type != PACKET_HOST &&
 			    inc != 0 &&
@@ -843,12 +846,17 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 			goto out;
 	}
 
+	if (pneigh)
+		is_router = pneigh->flags & NTF_ROUTER;
+	else
+		is_router = idev->cnf.forwarding;
+
 	if (dad) {
 		struct in6_addr maddr;
 
 		ipv6_addr_all_nodes(&maddr);
 		ndisc_send_na(dev, NULL, &maddr, &msg->target,
-			      idev->cnf.forwarding, 0, (ifp != NULL), 1);
+			      is_router, 0, (ifp != NULL), 1);
 		goto out;
 	}
 
@@ -869,7 +877,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 			     NEIGH_UPDATE_F_OVERRIDE);
 	if (neigh || !dev->hard_header) {
 		ndisc_send_na(dev, neigh, saddr, &msg->target,
-			      idev->cnf.forwarding, 
+			      is_router,
 			      1, (ifp != NULL && inc), inc);
 		if (neigh)
 			neigh_release(neigh);
-- 
cgit v1.2.3-70-g09d2


From fbea49e1e2404baa2d88ab47e2db89e49551b53b Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 22 Sep 2006 14:43:49 -0700
Subject: [IPV6] NDISC: Add proxy_ndp sysctl.

We do not always need proxy NDP functionality even we
enable forwarding.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  3 +++
 include/linux/ipv6.h                   |  2 ++
 include/linux/sysctl.h                 |  1 +
 net/ipv6/addrconf.c                    | 11 +++++++++++
 net/ipv6/ip6_output.c                  |  4 +++-
 net/ipv6/ndisc.c                       |  8 +++++++-
 6 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 307cd4ec8ed..935e298f674 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -765,6 +765,9 @@ conf/all/forwarding - BOOLEAN
 
 	This referred to as global forwarding.
 
+proxy_ndp - BOOLEAN
+	Do proxy ndp.
+
 conf/interface/*:
 	Change special settings per interface.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1d6d3ccc941..caca57df0d7 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -176,6 +176,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_rt_info_max_plen;
 #endif
 #endif
+	__s32		proxy_ndp;
 	void		*sysctl;
 };
 
@@ -203,6 +204,7 @@ enum {
 	DEVCONF_ACCEPT_RA_RTR_PREF,
 	DEVCONF_RTR_PROBE_INTERVAL,
 	DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
+	DEVCONF_PROXY_NDP,
 	DEVCONF_MAX
 };
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index af61d923540..736ed917a4f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -556,6 +556,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_RTR_PREF=20,
 	NET_IPV6_RTR_PROBE_INTERVAL=21,
 	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22,
+	NET_IPV6_PROXY_NDP=23,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1e5a296d0a8..825a291d5aa 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -175,6 +175,7 @@ struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_ra_rt_info_max_plen = 0,
 #endif
 #endif
+	.proxy_ndp		= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -205,6 +206,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.accept_ra_rt_info_max_plen = 0,
 #endif
 #endif
+	.proxy_ndp		= 0,
 };
 
 /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
@@ -3337,6 +3339,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
 #endif
 #endif
+	array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
 }
 
 /* Maximum length of ifinfomsg attributes */
@@ -3859,6 +3862,14 @@ static struct addrconf_sysctl_table
 		},
 #endif
 #endif
+		{
+			.ctl_name	=	NET_IPV6_PROXY_NDP,
+			.procname	=	"proxy_ndp",
+			.data		=	&ipv6_devconf.proxy_ndp,
+			.maxlen		=	sizeof(int),
+			.mode		=	0644,
+			.proc_handler	=	&proc_dointvec,
+		},
 		{
 			.ctl_name	=	0,	/* sentinel */
 		}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b2be749d221..66716911962 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -412,7 +412,9 @@ int ip6_forward(struct sk_buff *skb)
 		return -ETIMEDOUT;
 	}
 
-	if (pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) {
+	/* XXX: idev->cnf.proxy_ndp? */
+	if (ipv6_devconf.proxy_ndp &&
+	    pneigh_lookup(&nd_tbl, &hdr->daddr, skb->dev, 0)) {
 		int proxied = ip6_forward_proxy_check(skb);
 		if (proxied > 0)
 			return ip6_input(skb);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index ddf038636f0..76517a5f657 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -824,6 +824,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 
 		if (ipv6_chk_acast_addr(dev, &msg->target) ||
 		    (idev->cnf.forwarding && 
+		     (ipv6_devconf.proxy_ndp || idev->cnf.proxy_ndp) &&
 		     (pneigh = pneigh_lookup(&nd_tbl,
 					     &msg->target, dev, 0)) != NULL)) {
 			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
@@ -966,8 +967,13 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		 * has already sent a NA to us.
 		 */
 		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
-		    pneigh_lookup(&nd_tbl, &msg->target, dev, 0))
+		    ipv6_devconf.forwarding && ipv6_devconf.proxy_ndp &&
+		    pneigh_lookup(&nd_tbl, &msg->target, dev, 0)) {
+			/* XXX: idev->cnf.prixy_ndp */
+			WARN_ON(skb->dst != NULL &&
+				((struct rt6_info *)skb->dst)->rt6i_idev);
 			goto out;
+		}
 
 		neigh_update(neigh, lladdr,
 			     msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
-- 
cgit v1.2.3-70-g09d2


From 8814c4b533817df825485ff32ce6ac406c3a54d1 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Fri, 22 Sep 2006 14:44:24 -0700
Subject: [IPV6] ADDRCONF: Convert addrconf_lock to RCU.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 10 ++++------
 include/net/if_inet6.h |  1 +
 net/core/pktgen.c      |  4 ++--
 net/ipv6/addrconf.c    | 46 ++++++++++++++++++++++++----------------------
 net/ipv6/anycast.c     |  4 ++--
 net/ipv6/ipv6_syms.c   |  1 -
 net/sctp/ipv6.c        |  6 +++---
 7 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 5fc8627435e..aa2ed8f0a9d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -133,20 +133,18 @@ extern int unregister_inet6addr_notifier(struct notifier_block *nb);
 static inline struct inet6_dev *
 __in6_dev_get(struct net_device *dev)
 {
-	return (struct inet6_dev *)dev->ip6_ptr;
+	return rcu_dereference(dev->ip6_ptr);
 }
 
-extern rwlock_t addrconf_lock;
-
 static inline struct inet6_dev *
 in6_dev_get(struct net_device *dev)
 {
 	struct inet6_dev *idev = NULL;
-	read_lock(&addrconf_lock);
-	idev = dev->ip6_ptr;
+	rcu_read_lock();
+	idev = __in6_dev_get(dev);
 	if (idev)
 		atomic_inc(&idev->refcnt);
-	read_unlock(&addrconf_lock);
+	rcu_read_unlock();
 	return idev;
 }
 
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index e459e1a0ae4..34489c13c11 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -189,6 +189,7 @@ struct inet6_dev
 	struct ipv6_devconf	cnf;
 	struct ipv6_devstat	stats;
 	unsigned long		tstamp; /* ipv6InterfaceTable update timestamp */
+	struct rcu_head		rcu;
 };
 
 extern struct ipv6_devconf ipv6_devconf;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 6a7320b39ed..72145d4a260 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1786,7 +1786,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 			 * use ipv6_get_lladdr if/when it's get exported
 			 */
 
-			read_lock(&addrconf_lock);
+			rcu_read_lock();
 			if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) {
 				struct inet6_ifaddr *ifp;
 
@@ -1805,7 +1805,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 				}
 				read_unlock_bh(&idev->lock);
 			}
-			read_unlock(&addrconf_lock);
+			rcu_read_unlock();
 			if (err)
 				printk("pktgen: ERROR: IPv6 link address not availble.\n");
 		}
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 825a291d5aa..c09ebb7bb98 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -119,9 +119,6 @@ static int ipv6_count_addresses(struct inet6_dev *idev);
 static struct inet6_ifaddr		*inet6_addr_lst[IN6_ADDR_HSIZE];
 static DEFINE_RWLOCK(addrconf_hash_lock);
 
-/* Protects inet6 devices */
-DEFINE_RWLOCK(addrconf_lock);
-
 static void addrconf_verify(unsigned long);
 
 static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
@@ -318,6 +315,12 @@ static void addrconf_mod_timer(struct inet6_ifaddr *ifp,
 
 /* Nobody refers to this device, we may destroy it. */
 
+static void in6_dev_finish_destroy_rcu(struct rcu_head *head)
+{
+	struct inet6_dev *idev = container_of(head, struct inet6_dev, rcu);
+	kfree(idev);
+}
+
 void in6_dev_finish_destroy(struct inet6_dev *idev)
 {
 	struct net_device *dev = idev->dev;
@@ -332,7 +335,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev)
 		return;
 	}
 	snmp6_free_dev(idev);
-	kfree(idev);
+	call_rcu(&idev->rcu, in6_dev_finish_destroy_rcu);
 }
 
 static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
@@ -408,9 +411,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	if (netif_carrier_ok(dev))
 		ndev->if_flags |= IF_READY;
 
-	write_lock_bh(&addrconf_lock);
-	dev->ip6_ptr = ndev;
-	write_unlock_bh(&addrconf_lock);
+	/* protected by rtnl_lock */
+	rcu_assign_pointer(dev->ip6_ptr, ndev);
 
 	ipv6_mc_init_dev(ndev);
 	ndev->tstamp = jiffies;
@@ -474,7 +476,7 @@ static void addrconf_forward_change(void)
 
 	read_lock(&dev_base_lock);
 	for (dev=dev_base; dev; dev=dev->next) {
-		read_lock(&addrconf_lock);
+		rcu_read_lock();
 		idev = __in6_dev_get(dev);
 		if (idev) {
 			int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding);
@@ -482,7 +484,7 @@ static void addrconf_forward_change(void)
 			if (changed)
 				dev_forward_change(idev);
 		}
-		read_unlock(&addrconf_lock);
+		rcu_read_unlock();
 	}
 	read_unlock(&dev_base_lock);
 }
@@ -543,7 +545,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
 	int hash;
 	int err = 0;
 
-	read_lock_bh(&addrconf_lock);
+	rcu_read_lock_bh();
 	if (idev->dead) {
 		err = -ENODEV;			/*XXX*/
 		goto out2;
@@ -612,7 +614,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
 	in6_ifa_hold(ifa);
 	write_unlock(&idev->lock);
 out2:
-	read_unlock_bh(&addrconf_lock);
+	rcu_read_unlock_bh();
 
 	if (likely(err == 0))
 		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
@@ -915,7 +917,7 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
 	memset(&hiscore, 0, sizeof(hiscore));
 
 	read_lock(&dev_base_lock);
-	read_lock(&addrconf_lock);
+	rcu_read_lock();
 
 	for (dev = dev_base; dev; dev=dev->next) {
 		struct inet6_dev *idev;
@@ -1127,7 +1129,7 @@ record_it:
 		}
 		read_unlock_bh(&idev->lock);
 	}
-	read_unlock(&addrconf_lock);
+	rcu_read_unlock();
 	read_unlock(&dev_base_lock);
 
 	if (!ifa_result)
@@ -1151,7 +1153,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr)
 	struct inet6_dev *idev;
 	int err = -EADDRNOTAVAIL;
 
-	read_lock(&addrconf_lock);
+	rcu_read_lock();
 	if ((idev = __in6_dev_get(dev)) != NULL) {
 		struct inet6_ifaddr *ifp;
 
@@ -1165,7 +1167,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr)
 		}
 		read_unlock_bh(&idev->lock);
 	}
-	read_unlock(&addrconf_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -1466,7 +1468,7 @@ static void ipv6_regen_rndid(unsigned long data)
 	struct inet6_dev *idev = (struct inet6_dev *) data;
 	unsigned long expires;
 
-	read_lock_bh(&addrconf_lock);
+	rcu_read_lock_bh();
 	write_lock_bh(&idev->lock);
 
 	if (idev->dead)
@@ -1490,7 +1492,7 @@ static void ipv6_regen_rndid(unsigned long data)
 
 out:
 	write_unlock_bh(&idev->lock);
-	read_unlock_bh(&addrconf_lock);
+	rcu_read_unlock_bh();
 	in6_dev_put(idev);
 }
 
@@ -2342,10 +2344,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	           Do not dev_put!
 	 */
 	if (how == 1) {
-		write_lock_bh(&addrconf_lock);
-		dev->ip6_ptr = NULL;
 		idev->dead = 1;
-		write_unlock_bh(&addrconf_lock);
+
+		/* protected by rtnl_lock */
+		rcu_assign_pointer(dev->ip6_ptr, NULL);
 
 		/* Step 1.5: remove snmp6 entry */
 		snmp6_unregister_dev(idev);
@@ -3573,10 +3575,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 {
-	read_lock_bh(&addrconf_lock);
+	rcu_read_lock_bh();
 	if (likely(ifp->idev->dead == 0))
 		__ipv6_ifa_notify(event, ifp);
-	read_unlock_bh(&addrconf_lock);
+	rcu_read_unlock_bh();
 }
 
 #ifdef CONFIG_SYSCTL
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index b80fc502ca0..a9604764e01 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -56,7 +56,7 @@ ip6_onlink(struct in6_addr *addr, struct net_device *dev)
 	int	onlink;
 
 	onlink = 0;
-	read_lock(&addrconf_lock);
+	rcu_read_lock();
 	idev = __in6_dev_get(dev);
 	if (idev) {
 		read_lock_bh(&idev->lock);
@@ -68,7 +68,7 @@ ip6_onlink(struct in6_addr *addr, struct net_device *dev)
 		}
 		read_unlock_bh(&idev->lock);
 	}
-	read_unlock(&addrconf_lock);
+	rcu_read_unlock();
 	return onlink;
 }
 
diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 7b7b90d9c3d..0e8e0676a03 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -14,7 +14,6 @@ EXPORT_SYMBOL(ndisc_mc_map);
 EXPORT_SYMBOL(register_inet6addr_notifier);
 EXPORT_SYMBOL(unregister_inet6addr_notifier);
 EXPORT_SYMBOL(ip6_route_output);
-EXPORT_SYMBOL(addrconf_lock);
 EXPORT_SYMBOL(ipv6_setsockopt);
 EXPORT_SYMBOL(ipv6_getsockopt);
 EXPORT_SYMBOL(inet6_register_protosw);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fd87e3ceb56..249e5033c1a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -321,9 +321,9 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
 	struct inet6_ifaddr *ifp;
 	struct sctp_sockaddr_entry *addr;
 
-	read_lock(&addrconf_lock);
+	rcu_read_lock();
 	if ((in6_dev = __in6_dev_get(dev)) == NULL) {
-		read_unlock(&addrconf_lock);
+		rcu_read_unlock();
 		return;
 	}
 
@@ -342,7 +342,7 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist,
 	}
 
 	read_unlock(&in6_dev->lock);
-	read_unlock(&addrconf_lock);
+	rcu_read_unlock();
 }
 
 /* Initialize a sockaddr_storage from in incoming skb. */
-- 
cgit v1.2.3-70-g09d2


From 55ebaef1d5db9c1c76ba01a87fd986db5dee550d Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Fri, 22 Sep 2006 14:45:27 -0700
Subject: [IPV6] ADDRCONF: Allow non-DAD'able addresses.

IFA_F_NODAD flag, similar to IN6_IFF_NODAD in BSDs, is introduced
to skip DAD.

This flag should be set to Mobile IPv6 Home Address(es) on Mobile
Node because DAD would fail if we should perform DAD; our Home Agent
protects our Home Address(es).

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_addr.h |  1 +
 net/ipv6/addrconf.c     | 31 ++++++++++++++++---------------
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index e1590454db5..ca24b9de13f 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h
@@ -38,6 +38,7 @@ enum
 #define IFA_F_SECONDARY		0x01
 #define IFA_F_TEMPORARY		IFA_F_SECONDARY
 
+#define	IFA_F_NODAD		0x02
 #define IFA_F_DEPRECATED	0x20
 #define IFA_F_TENTATIVE		0x40
 #define IFA_F_PERMANENT		0x80
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c09ebb7bb98..adb583a2615 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1873,12 +1873,11 @@ err_exit:
  *	Manual configuration of address on an interface
  */
 static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen,
-			  __u32 prefered_lft, __u32 valid_lft)
+			  __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft)
 {
 	struct inet6_ifaddr *ifp;
 	struct inet6_dev *idev;
 	struct net_device *dev;
-	__u8 ifa_flags = 0;
 	int scope;
 
 	ASSERT_RTNL();
@@ -1971,7 +1970,7 @@ int addrconf_add_ifaddr(void __user *arg)
 
 	rtnl_lock();
 	err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen,
-			     INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
+			     IFA_F_PERMANENT, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
 	rtnl_unlock();
 	return err;
 }
@@ -2514,7 +2513,8 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 	spin_lock_bh(&ifp->lock);
 
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
-	    !(ifp->flags&IFA_F_TENTATIVE)) {
+	    !(ifp->flags&IFA_F_TENTATIVE) ||
+	    ifp->flags & IFA_F_NODAD) {
 		ifp->flags &= ~IFA_F_TENTATIVE;
 		spin_unlock_bh(&ifp->lock);
 		read_unlock_bh(&idev->lock);
@@ -2912,28 +2912,25 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen);
 }
 
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 prefered_lft,
-			     u32 valid_lft)
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
+			     u32 prefered_lft, u32 valid_lft)
 {
-	int ifa_flags = 0;
-
 	if (!valid_lft || (prefered_lft > valid_lft))
 		return -EINVAL;
 
 	if (valid_lft == INFINITY_LIFE_TIME)
-		ifa_flags = IFA_F_PERMANENT;
+		ifa_flags |= IFA_F_PERMANENT;
 	else if (valid_lft >= 0x7FFFFFFF/HZ)
 		valid_lft = 0x7FFFFFFF/HZ;
 
 	if (prefered_lft == 0)
-		ifa_flags = IFA_F_DEPRECATED;
+		ifa_flags |= IFA_F_DEPRECATED;
 	else if ((prefered_lft >= 0x7FFFFFFF/HZ) &&
 		 (prefered_lft != INFINITY_LIFE_TIME))
 		prefered_lft = 0x7FFFFFFF/HZ;
 
 	spin_lock_bh(&ifp->lock);
-	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED|IFA_F_PERMANENT)) | ifa_flags;
-
+	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD)) | ifa_flags;
 	ifp->tstamp = jiffies;
 	ifp->valid_lft = valid_lft;
 	ifp->prefered_lft = prefered_lft;
@@ -2955,7 +2952,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct in6_addr *pfx;
 	struct inet6_ifaddr *ifa;
 	struct net_device *dev;
-	u32 valid_lft, preferred_lft;
+	u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
+	u8 ifa_flags;
 	int err;
 
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy);
@@ -2982,6 +2980,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	if (dev == NULL)
 		return -ENODEV;
 
+	/* We ignore other flags so far. */
+	ifa_flags = ifm->ifa_flags & IFA_F_NODAD;
+
 	ifa = ipv6_get_ifaddr(pfx, dev, 1);
 	if (ifa == NULL) {
 		/*
@@ -2989,14 +2990,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		 * userspace alreay relies on not having to provide this.
 		 */
 		return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen,
-				      preferred_lft, valid_lft);
+				      ifa_flags, preferred_lft, valid_lft);
 	}
 
 	if (nlh->nlmsg_flags & NLM_F_EXCL ||
 	    !(nlh->nlmsg_flags & NLM_F_REPLACE))
 		err = -EEXIST;
 	else
-		err = inet6_addr_modify(ifa, preferred_lft, valid_lft);
+		err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft);
 
 	in6_ifa_put(ifa);
 
-- 
cgit v1.2.3-70-g09d2


From 3b9f9a1c3903b64c38505f9fed3bb11e48dbc931 Mon Sep 17 00:00:00 2001
From: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Date: Fri, 22 Sep 2006 14:45:56 -0700
Subject: [IPV6] ADDRCONF: Mobile IPv6 Home Address support.

IFA_F_HOMEADDRESS is introduced for Mobile IPv6 Home Addresses on
Mobile Node.

The IFA_F_HOMEADDRESS flag should be set for Mobile IPv6 Home
Addresses for 2 purposes. 1) We need to check this on receipt of
Type 2 Routing Header (RFC3775 Secion 6.4), 2) We prefer Home
Address(es) in source address selection (RFC3484 Section 5 Rule 4).

Signed-off-by: Noriaki TAKAMIYA <takamiya@po.ntts.co.jp>
Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_addr.h |  1 +
 include/net/addrconf.h  |  6 +-----
 net/ipv6/addrconf.c     | 44 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index ca24b9de13f..dbe8f6120a4 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h
@@ -39,6 +39,7 @@ enum
 #define IFA_F_TEMPORARY		IFA_F_SECONDARY
 
 #define	IFA_F_NODAD		0x02
+#define	IFA_F_HOMEADDRESS	0x10
 #define IFA_F_DEPRECATED	0x20
 #define IFA_F_TENTATIVE		0x40
 #define IFA_F_PERMANENT		0x80
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index aa2ed8f0a9d..44f1b673f91 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -61,12 +61,8 @@ extern int			addrconf_set_dstaddr(void __user *arg);
 extern int			ipv6_chk_addr(struct in6_addr *addr,
 					      struct net_device *dev,
 					      int strict);
-/* XXX: this is a placeholder till addrconf supports */
 #ifdef CONFIG_IPV6_MIP6
-static inline int ipv6_chk_home_addr(struct in6_addr *addr)
-{
-	return 0;
-}
+extern int			ipv6_chk_home_addr(struct in6_addr *addr);
 #endif
 extern struct inet6_ifaddr *	ipv6_get_ifaddr(struct in6_addr *addr,
 						struct net_device *dev,
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index adb583a2615..c1867635239 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1038,9 +1038,27 @@ int ipv6_dev_get_saddr(struct net_device *daddr_dev,
 					continue;
 			}
 
-			/* Rule 4: Prefer home address -- not implemented yet */
+			/* Rule 4: Prefer home address */
+#ifdef CONFIG_IPV6_MIP6
+			if (hiscore.rule < 4) {
+				if (ifa_result->flags & IFA_F_HOMEADDRESS)
+					hiscore.attrs |= IPV6_SADDR_SCORE_HOA;
+				hiscore.rule++;
+			}
+			if (ifa->flags & IFA_F_HOMEADDRESS) {
+				score.attrs |= IPV6_SADDR_SCORE_HOA;
+				if (!(ifa_result->flags & IFA_F_HOMEADDRESS)) {
+					score.rule = 4;
+					goto record_it;
+				}
+			} else {
+				if (hiscore.attrs & IPV6_SADDR_SCORE_HOA)
+					continue;
+			}
+#else
 			if (hiscore.rule < 4)
 				hiscore.rule++;
+#endif
 
 			/* Rule 5: Prefer outgoing interface */
 			if (hiscore.rule < 5) {
@@ -2759,6 +2777,26 @@ void if6_proc_exit(void)
 }
 #endif	/* CONFIG_PROC_FS */
 
+#ifdef CONFIG_IPV6_MIP6
+/* Check if address is a home address configured on any interface. */
+int ipv6_chk_home_addr(struct in6_addr *addr)
+{
+	int ret = 0;
+	struct inet6_ifaddr * ifp;
+	u8 hash = ipv6_addr_hash(addr);
+	read_lock_bh(&addrconf_hash_lock);
+	for (ifp = inet6_addr_lst[hash]; ifp; ifp = ifp->lst_next) {
+		if (ipv6_addr_cmp(&ifp->addr, addr) == 0 &&
+		    (ifp->flags & IFA_F_HOMEADDRESS)) {
+			ret = 1;
+			break;
+		}
+	}
+	read_unlock_bh(&addrconf_hash_lock);
+	return ret;
+}
+#endif
+
 /*
  *	Periodic address status verification
  */
@@ -2930,7 +2968,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags,
 		prefered_lft = 0x7FFFFFFF/HZ;
 
 	spin_lock_bh(&ifp->lock);
-	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD)) | ifa_flags;
+	ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags;
 	ifp->tstamp = jiffies;
 	ifp->valid_lft = valid_lft;
 	ifp->prefered_lft = prefered_lft;
@@ -2981,7 +3019,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		return -ENODEV;
 
 	/* We ignore other flags so far. */
-	ifa_flags = ifm->ifa_flags & IFA_F_NODAD;
+	ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS);
 
 	ifa = ipv6_get_ifaddr(pfx, dev, 1);
 	if (ifa == NULL) {
-- 
cgit v1.2.3-70-g09d2


From 1c3c07e9f6cc50dab2aeb8051325e317d4f6c70e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Jul 2006 11:28:18 -0400
Subject: NFS: Add a new ACCESS rpc call cache to the linux nfs client

The current access cache only allows one entry at a time to be cached for each
inode. Add a per-inode red-black tree in order to allow more than one to
be cached at a time.

Should significantly cut down the time spent in path traversal for shared
directories such as ${PATH}, /usr/share, etc.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 133 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/nfs/inode.c         |  13 ++---
 include/linux/nfs_fs.h |   5 +-
 3 files changed, 124 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e7ffb4deb3e..094afded2b1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1638,35 +1638,134 @@ out:
 	return error;
 }
 
-int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static void nfs_access_free_entry(struct nfs_access_entry *entry)
+{
+	put_rpccred(entry->cred);
+	kfree(entry);
+}
+
+static void __nfs_access_zap_cache(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
+	struct rb_root *root_node = &nfsi->access_cache;
+	struct rb_node *n, *dispose = NULL;
+	struct nfs_access_entry *entry;
+
+	/* Unhook entries from the cache */
+	while ((n = rb_first(root_node)) != NULL) {
+		entry = rb_entry(n, struct nfs_access_entry, rb_node);
+		rb_erase(n, root_node);
+		n->rb_left = dispose;
+		dispose = n;
+	}
+	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+	spin_unlock(&inode->i_lock);
 
-	if (cache->cred != cred
-			|| time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
-			|| (nfsi->cache_validity & NFS_INO_INVALID_ACCESS))
-		return -ENOENT;
-	memcpy(res, cache, sizeof(*res));
-	return 0;
+	/* Now kill them all! */
+	while (dispose != NULL) {
+		n = dispose;
+		dispose = n->rb_left;
+		nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node));
+	}
 }
 
-void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+void nfs_access_zap_cache(struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_access_entry *cache = &nfsi->cache_access;
+	spin_lock(&inode->i_lock);
+	/* This will release the spinlock */
+	__nfs_access_zap_cache(inode);
+}
 
-	if (cache->cred != set->cred) {
-		if (cache->cred)
-			put_rpccred(cache->cred);
-		cache->cred = get_rpccred(set->cred);
+static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred)
+{
+	struct rb_node *n = NFS_I(inode)->access_cache.rb_node;
+	struct nfs_access_entry *entry;
+
+	while (n != NULL) {
+		entry = rb_entry(n, struct nfs_access_entry, rb_node);
+
+		if (cred < entry->cred)
+			n = n->rb_left;
+		else if (cred > entry->cred)
+			n = n->rb_right;
+		else
+			return entry;
 	}
-	/* FIXME: replace current access_cache BKL reliance with inode->i_lock */
+	return NULL;
+}
+
+int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache;
+	int err = -ENOENT;
+
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+	if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+		goto out_zap;
+	cache = nfs_access_search_rbtree(inode, cred);
+	if (cache == NULL)
+		goto out;
+	if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)))
+		goto out_stale;
+	res->jiffies = cache->jiffies;
+	res->cred = cache->cred;
+	res->mask = cache->mask;
+	err = 0;
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+out_stale:
+	rb_erase(&cache->rb_node, &nfsi->access_cache);
+	spin_unlock(&inode->i_lock);
+	nfs_access_free_entry(cache);
+	return -ENOENT;
+out_zap:
+	/* This will release the spinlock */
+	__nfs_access_zap_cache(inode);
+	return -ENOENT;
+}
+
+static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
+{
+	struct rb_root *root_node = &NFS_I(inode)->access_cache;
+	struct rb_node **p = &root_node->rb_node;
+	struct rb_node *parent = NULL;
+	struct nfs_access_entry *entry;
+
+	spin_lock(&inode->i_lock);
+	while (*p != NULL) {
+		parent = *p;
+		entry = rb_entry(parent, struct nfs_access_entry, rb_node);
+
+		if (set->cred < entry->cred)
+			p = &parent->rb_left;
+		else if (set->cred > entry->cred)
+			p = &parent->rb_right;
+		else
+			goto found;
+	}
+	rb_link_node(&set->rb_node, parent, p);
+	rb_insert_color(&set->rb_node, root_node);
 	spin_unlock(&inode->i_lock);
+	return;
+found:
+	rb_replace_node(parent, &set->rb_node, root_node);
+	spin_unlock(&inode->i_lock);
+	nfs_access_free_entry(entry);
+}
+
+void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
+{
+	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+	if (cache == NULL)
+		return;
+	RB_CLEAR_NODE(&cache->rb_node);
 	cache->jiffies = set->jiffies;
+	cache->cred = get_rpccred(set->cred);
 	cache->mask = set->mask;
+
+	nfs_access_add_rbtree(inode, cache);
 }
 
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d349fb2245d..b94ab060bb1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -76,19 +76,14 @@ int nfs_write_inode(struct inode *inode, int sync)
 
 void nfs_clear_inode(struct inode *inode)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct rpc_cred *cred;
-
 	/*
 	 * The following should never happen...
 	 */
 	BUG_ON(nfs_have_writebacks(inode));
-	BUG_ON (!list_empty(&nfsi->open_files));
+	BUG_ON(!list_empty(&NFS_I(inode)->open_files));
+	BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0);
 	nfs_zap_acl_cache(inode);
-	cred = nfsi->cache_access.cred;
-	if (cred)
-		put_rpccred(cred);
-	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
+	nfs_access_zap_cache(inode);
 }
 
 /**
@@ -290,7 +285,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = jiffies;
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		nfsi->cache_access.cred = NULL;
+		nfsi->access_cache = RB_ROOT;
 
 		unlock_new_inode(inode);
 	} else
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6c2066caeaa..cc013ed2e52 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -42,6 +42,7 @@
 #include <linux/in.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/rbtree.h>
 #include <linux/rwsem.h>
 #include <linux/wait.h>
 
@@ -69,6 +70,7 @@
  * NFSv3/v4 Access mode cache entry
  */
 struct nfs_access_entry {
+	struct rb_node		rb_node;
 	unsigned long		jiffies;
 	struct rpc_cred *	cred;
 	int			mask;
@@ -145,7 +147,7 @@ struct nfs_inode {
 	 */
 	atomic_t		data_updates;
 
-	struct nfs_access_entry	cache_access;
+	struct rb_root		access_cache;
 #ifdef CONFIG_NFS_V3_ACL
 	struct posix_acl	*acl_access;
 	struct posix_acl	*acl_default;
@@ -297,6 +299,7 @@ extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int nfs_permission(struct inode *, int, struct nameidata *);
 extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
 extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
+extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_open(struct inode *, struct file *);
 extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
-- 
cgit v1.2.3-70-g09d2


From cfcea3e8c66c2dcde98d5c2693d4bff50b5cac97 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Jul 2006 11:28:18 -0400
Subject: NFS: Add a global LRU list for the ACCESS cache

...in order to allow the addition of a memory shrinker.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 35 ++++++++++++++++++++++++++++++++++-
 fs/nfs/inode.c         |  2 ++
 include/linux/nfs_fs.h |  4 ++++
 3 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 094afded2b1..bf4f5ffda70 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1638,10 +1638,17 @@ out:
 	return error;
 }
 
+static DEFINE_SPINLOCK(nfs_access_lru_lock);
+static LIST_HEAD(nfs_access_lru_list);
+static atomic_long_t nfs_access_nr_entries;
+
 static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
 	put_rpccred(entry->cred);
 	kfree(entry);
+	smp_mb__before_atomic_dec();
+	atomic_long_dec(&nfs_access_nr_entries);
+	smp_mb__after_atomic_dec();
 }
 
 static void __nfs_access_zap_cache(struct inode *inode)
@@ -1655,6 +1662,7 @@ static void __nfs_access_zap_cache(struct inode *inode)
 	while ((n = rb_first(root_node)) != NULL) {
 		entry = rb_entry(n, struct nfs_access_entry, rb_node);
 		rb_erase(n, root_node);
+		list_del(&entry->lru);
 		n->rb_left = dispose;
 		dispose = n;
 	}
@@ -1671,6 +1679,13 @@ static void __nfs_access_zap_cache(struct inode *inode)
 
 void nfs_access_zap_cache(struct inode *inode)
 {
+	/* Remove from global LRU init */
+	if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+		spin_lock(&nfs_access_lru_lock);
+		list_del_init(&NFS_I(inode)->access_cache_inode_lru);
+		spin_unlock(&nfs_access_lru_lock);
+	}
+
 	spin_lock(&inode->i_lock);
 	/* This will release the spinlock */
 	__nfs_access_zap_cache(inode);
@@ -1711,12 +1726,14 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
 	res->mask = cache->mask;
+	list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
 	err = 0;
 out:
 	spin_unlock(&inode->i_lock);
 	return err;
 out_stale:
 	rb_erase(&cache->rb_node, &nfsi->access_cache);
+	list_del(&cache->lru);
 	spin_unlock(&inode->i_lock);
 	nfs_access_free_entry(cache);
 	return -ENOENT;
@@ -1728,7 +1745,8 @@ out_zap:
 
 static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
 {
-	struct rb_root *root_node = &NFS_I(inode)->access_cache;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct rb_root *root_node = &nfsi->access_cache;
 	struct rb_node **p = &root_node->rb_node;
 	struct rb_node *parent = NULL;
 	struct nfs_access_entry *entry;
@@ -1747,10 +1765,13 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *
 	}
 	rb_link_node(&set->rb_node, parent, p);
 	rb_insert_color(&set->rb_node, root_node);
+	list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
 	spin_unlock(&inode->i_lock);
 	return;
 found:
 	rb_replace_node(parent, &set->rb_node, root_node);
+	list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
+	list_del(&entry->lru);
 	spin_unlock(&inode->i_lock);
 	nfs_access_free_entry(entry);
 }
@@ -1766,6 +1787,18 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 	cache->mask = set->mask;
 
 	nfs_access_add_rbtree(inode, cache);
+
+	/* Update accounting */
+	smp_mb__before_atomic_inc();
+	atomic_long_inc(&nfs_access_nr_entries);
+	smp_mb__after_atomic_inc();
+
+	/* Add inode to global LRU list */
+	if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) {
+		spin_lock(&nfs_access_lru_lock);
+		list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list);
+		spin_unlock(&nfs_access_lru_lock);
+	}
 }
 
 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b94ab060bb1..6ed018c9aad 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1104,6 +1104,8 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		INIT_LIST_HEAD(&nfsi->dirty);
 		INIT_LIST_HEAD(&nfsi->commit);
 		INIT_LIST_HEAD(&nfsi->open_files);
+		INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+		INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 		INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
 		atomic_set(&nfsi->data_updates, 0);
 		nfsi->ndirty = 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index cc013ed2e52..a36e01cd632 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -71,6 +71,7 @@
  */
 struct nfs_access_entry {
 	struct rb_node		rb_node;
+	struct list_head	lru;
 	unsigned long		jiffies;
 	struct rpc_cred *	cred;
 	int			mask;
@@ -148,6 +149,8 @@ struct nfs_inode {
 	atomic_t		data_updates;
 
 	struct rb_root		access_cache;
+	struct list_head	access_cache_entry_lru;
+	struct list_head	access_cache_inode_lru;
 #ifdef CONFIG_NFS_V3_ACL
 	struct posix_acl	*acl_access;
 	struct posix_acl	*acl_default;
@@ -201,6 +204,7 @@ struct nfs_inode {
 #define NFS_INO_REVALIDATING	(0)		/* revalidating attrs */
 #define NFS_INO_ADVISE_RDPLUS	(1)		/* advise readdirplus */
 #define NFS_INO_STALE		(2)		/* possible stale inode */
+#define NFS_INO_ACL_LRU_SET	(3)		/* Inode is on the LRU list */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
-- 
cgit v1.2.3-70-g09d2


From 770bfad846ab6628444428467b11fa6773ae9ea1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:07 -0400
Subject: NFS: Add dentry materialisation op

The attached patch adds a new directory cache management function that prepares
a disconnected anonymous function to be connected into the dentry tree. The
anonymous dentry is transferred the name and parentage from another dentry.

The following changes were made in [try #2]:

 (*) d_materialise_dentry() now switches the parentage of the two nodes around
     correctly when one or other of them is self-referential.

The following changes were made in [try #7]:

 (*) d_instantiate_unique() has had the interior part split out as function
     __d_instantiate_unique(). Callers of this latter function must be holding
     the appropriate locks.

 (*) _d_rehash() has been added as a wrapper around __d_rehash() to call it
     with the most obvious hash list (the one from the name). d_rehash() now
     calls _d_rehash().

 (*) d_materialise_dentry() is now __d_materialise_dentry() and is static.

 (*) d_materialise_unique() added to perform the combination of d_find_alias(),
     d_materialise_dentry() and d_add_unique() that the NFS client was doing
     twice, all within a single dcache_lock critical section. This reduces the
     number of times two different spinlocks were being accessed.

The following further changes were made:

 (*) Add the dentries onto their parents d_subdirs lists.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/dcache.c            | 164 ++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/dcache.h |   1 +
 2 files changed, 151 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 1b4a3a34ec5..17b392a2049 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -828,17 +828,19 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
  * (or otherwise set) by the caller to indicate that it is now
  * in use by the dcache.
  */
-struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
+static struct dentry *__d_instantiate_unique(struct dentry *entry,
+					     struct inode *inode)
 {
 	struct dentry *alias;
 	int len = entry->d_name.len;
 	const char *name = entry->d_name.name;
 	unsigned int hash = entry->d_name.hash;
 
-	BUG_ON(!list_empty(&entry->d_alias));
-	spin_lock(&dcache_lock);
-	if (!inode)
-		goto do_negative;
+	if (!inode) {
+		entry->d_inode = NULL;
+		return NULL;
+	}
+
 	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
 		struct qstr *qstr = &alias->d_name;
 
@@ -851,19 +853,35 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
 		if (memcmp(qstr->name, name, len))
 			continue;
 		dget_locked(alias);
-		spin_unlock(&dcache_lock);
-		BUG_ON(!d_unhashed(alias));
-		iput(inode);
 		return alias;
 	}
+
 	list_add(&entry->d_alias, &inode->i_dentry);
-do_negative:
 	entry->d_inode = inode;
 	fsnotify_d_instantiate(entry, inode);
-	spin_unlock(&dcache_lock);
-	security_d_instantiate(entry, inode);
 	return NULL;
 }
+
+struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *result;
+
+	BUG_ON(!list_empty(&entry->d_alias));
+
+	spin_lock(&dcache_lock);
+	result = __d_instantiate_unique(entry, inode);
+	spin_unlock(&dcache_lock);
+
+	if (!result) {
+		security_d_instantiate(entry, inode);
+		return NULL;
+	}
+
+	BUG_ON(!d_unhashed(result));
+	iput(inode);
+	return result;
+}
+
 EXPORT_SYMBOL(d_instantiate_unique);
 
 /**
@@ -1235,6 +1253,11 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list)
  	hlist_add_head_rcu(&entry->d_hash, list);
 }
 
+static void _d_rehash(struct dentry * entry)
+{
+	__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
+}
+
 /**
  * d_rehash	- add an entry back to the hash
  * @entry: dentry to add to the hash
@@ -1244,11 +1267,9 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list)
  
 void d_rehash(struct dentry * entry)
 {
-	struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
-
 	spin_lock(&dcache_lock);
 	spin_lock(&entry->d_lock);
-	__d_rehash(entry, list);
+	_d_rehash(entry);
 	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
@@ -1386,6 +1407,120 @@ already_unhashed:
 	spin_unlock(&dcache_lock);
 }
 
+/*
+ * Prepare an anonymous dentry for life in the superblock's dentry tree as a
+ * named dentry in place of the dentry to be replaced.
+ */
+static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
+{
+	struct dentry *dparent, *aparent;
+
+	switch_names(dentry, anon);
+	do_switch(dentry->d_name.len, anon->d_name.len);
+	do_switch(dentry->d_name.hash, anon->d_name.hash);
+
+	dparent = dentry->d_parent;
+	aparent = anon->d_parent;
+
+	dentry->d_parent = (aparent == anon) ? dentry : aparent;
+	list_del(&dentry->d_u.d_child);
+	if (!IS_ROOT(dentry))
+		list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
+	else
+		INIT_LIST_HEAD(&dentry->d_u.d_child);
+
+	anon->d_parent = (dparent == dentry) ? anon : dparent;
+	list_del(&anon->d_u.d_child);
+	if (!IS_ROOT(anon))
+		list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs);
+	else
+		INIT_LIST_HEAD(&anon->d_u.d_child);
+
+	anon->d_flags &= ~DCACHE_DISCONNECTED;
+}
+
+/**
+ * d_materialise_unique - introduce an inode into the tree
+ * @dentry: candidate dentry
+ * @inode: inode to bind to the dentry, to which aliases may be attached
+ *
+ * Introduces an dentry into the tree, substituting an extant disconnected
+ * root directory alias in its place if there is one
+ */
+struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
+{
+	struct dentry *alias, *actual;
+
+	BUG_ON(!d_unhashed(dentry));
+
+	spin_lock(&dcache_lock);
+
+	if (!inode) {
+		actual = dentry;
+		dentry->d_inode = NULL;
+		goto found_lock;
+	}
+
+	/* See if a disconnected directory already exists as an anonymous root
+	 * that we should splice into the tree instead */
+	if (S_ISDIR(inode->i_mode) && (alias = __d_find_alias(inode, 1))) {
+		spin_lock(&alias->d_lock);
+
+		/* Is this a mountpoint that we could splice into our tree? */
+		if (IS_ROOT(alias))
+			goto connect_mountpoint;
+
+		if (alias->d_name.len == dentry->d_name.len &&
+		    alias->d_parent == dentry->d_parent &&
+		    memcmp(alias->d_name.name,
+			   dentry->d_name.name,
+			   dentry->d_name.len) == 0)
+			goto replace_with_alias;
+
+		spin_unlock(&alias->d_lock);
+
+		/* Doh! Seem to be aliasing directories for some reason... */
+		dput(alias);
+	}
+
+	/* Add a unique reference */
+	actual = __d_instantiate_unique(dentry, inode);
+	if (!actual)
+		actual = dentry;
+	else if (unlikely(!d_unhashed(actual)))
+		goto shouldnt_be_hashed;
+
+found_lock:
+	spin_lock(&actual->d_lock);
+found:
+	_d_rehash(actual);
+	spin_unlock(&actual->d_lock);
+	spin_unlock(&dcache_lock);
+
+	if (actual == dentry) {
+		security_d_instantiate(dentry, inode);
+		return NULL;
+	}
+
+	iput(inode);
+	return actual;
+
+	/* Convert the anonymous/root alias into an ordinary dentry */
+connect_mountpoint:
+	__d_materialise_dentry(dentry, alias);
+
+	/* Replace the candidate dentry with the alias in the tree */
+replace_with_alias:
+	__d_drop(alias);
+	actual = alias;
+	goto found;
+
+shouldnt_be_hashed:
+	spin_unlock(&dcache_lock);
+	BUG();
+	goto shouldnt_be_hashed;
+}
+
 /**
  * d_path - return the path of a dentry
  * @dentry: dentry to report
@@ -1784,6 +1919,7 @@ EXPORT_SYMBOL(d_instantiate);
 EXPORT_SYMBOL(d_invalidate);
 EXPORT_SYMBOL(d_lookup);
 EXPORT_SYMBOL(d_move);
+EXPORT_SYMBOL_GPL(d_materialise_unique);
 EXPORT_SYMBOL(d_path);
 EXPORT_SYMBOL(d_prune_aliases);
 EXPORT_SYMBOL(d_rehash);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 471781ffeab..44605be5940 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -221,6 +221,7 @@ static inline int dname_external(struct dentry *dentry)
  */
 extern void d_instantiate(struct dentry *, struct inode *);
 extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *);
+extern struct dentry * d_materialise_unique(struct dentry *, struct inode *);
 extern void d_delete(struct dentry *);
 
 /* allocate/de-allocate */
-- 
cgit v1.2.3-70-g09d2


From adfa6f980bd46974e6b32b22dd0c45e3f52063f4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:08 -0400
Subject: NFS: Rename struct nfs4_client to struct nfs_client

Rename struct nfs4_client to struct nfs_client so that it can become the basis
for a general client record for NFS2 and NFS3 in addition to NFS4.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c         |  2 +-
 fs/nfs/callback_proc.c    |  4 ++--
 fs/nfs/delegation.c       | 24 +++++++++++------------
 fs/nfs/delegation.h       | 10 +++++-----
 fs/nfs/idmap.c            | 12 ++++++------
 fs/nfs/nfs4_fs.h          | 30 ++++++++++++++--------------
 fs/nfs/nfs4proc.c         | 32 +++++++++++++++---------------
 fs/nfs/nfs4renewd.c       |  8 ++++----
 fs/nfs/nfs4state.c        | 50 +++++++++++++++++++++++------------------------
 fs/nfs/nfs4xdr.c          | 18 ++++++++---------
 fs/nfs/super.c            |  4 ++--
 include/linux/nfs_fs_sb.h |  2 +-
 include/linux/nfs_idmap.h | 14 ++++++-------
 13 files changed, 105 insertions(+), 105 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b1f7dc41539..1b596b6d9dc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -167,7 +167,7 @@ void nfs_callback_down(void)
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
 	struct in_addr *addr = &rqstp->rq_addr.sin_addr;
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 
 	/* Don't talk to strangers */
 	clp = nfs4_find_client(addr);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 7719483ecdf..55d6e2ec157 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -15,7 +15,7 @@
  
 unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	struct nfs_delegation *delegation;
 	struct nfs_inode *nfsi;
 	struct inode *inode;
@@ -56,7 +56,7 @@ out:
 
 unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	struct inode *inode;
 	unsigned res;
 	
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 9540a316c05..5a1105c258b 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -114,7 +114,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
  */
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
-	struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int status = 0;
@@ -176,7 +176,7 @@ static void nfs_msync_inode(struct inode *inode)
  */
 int __nfs_inode_return_delegation(struct inode *inode)
 {
-	struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int res = 0;
@@ -208,7 +208,7 @@ int __nfs_inode_return_delegation(struct inode *inode)
  */
 void nfs_return_all_delegations(struct super_block *sb)
 {
-	struct nfs4_client *clp = NFS_SB(sb)->nfs4_state;
+	struct nfs_client *clp = NFS_SB(sb)->nfs4_state;
 	struct nfs_delegation *delegation;
 	struct inode *inode;
 
@@ -232,7 +232,7 @@ restart:
 
 int nfs_do_expire_all_delegations(void *ptr)
 {
-	struct nfs4_client *clp = ptr;
+	struct nfs_client *clp = ptr;
 	struct nfs_delegation *delegation;
 	struct inode *inode;
 
@@ -258,7 +258,7 @@ out:
 	module_put_and_exit(0);
 }
 
-void nfs_expire_all_delegations(struct nfs4_client *clp)
+void nfs_expire_all_delegations(struct nfs_client *clp)
 {
 	struct task_struct *task;
 
@@ -276,7 +276,7 @@ void nfs_expire_all_delegations(struct nfs4_client *clp)
 /*
  * Return all delegations following an NFS4ERR_CB_PATH_DOWN error.
  */
-void nfs_handle_cb_pathdown(struct nfs4_client *clp)
+void nfs_handle_cb_pathdown(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
 	struct inode *inode;
@@ -299,7 +299,7 @@ restart:
 
 struct recall_threadargs {
 	struct inode *inode;
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	const nfs4_stateid *stateid;
 
 	struct completion started;
@@ -310,7 +310,7 @@ static int recall_thread(void *data)
 {
 	struct recall_threadargs *args = (struct recall_threadargs *)data;
 	struct inode *inode = igrab(args->inode);
-	struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 
@@ -371,7 +371,7 @@ out_module_put:
 /*
  * Retrieve the inode associated with a delegation
  */
-struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle)
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle)
 {
 	struct nfs_delegation *delegation;
 	struct inode *res = NULL;
@@ -389,7 +389,7 @@ struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nf
 /*
  * Mark all delegations as needing to be reclaimed
  */
-void nfs_delegation_mark_reclaim(struct nfs4_client *clp)
+void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
 	spin_lock(&clp->cl_lock);
@@ -401,7 +401,7 @@ void nfs_delegation_mark_reclaim(struct nfs4_client *clp)
 /*
  * Reap all unclaimed delegations after reboot recovery is done
  */
-void nfs_delegation_reap_unclaimed(struct nfs4_client *clp)
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation, *n;
 	LIST_HEAD(head);
@@ -423,7 +423,7 @@ void nfs_delegation_reap_unclaimed(struct nfs4_client *clp)
 
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
-	struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int res = 0;
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 3858694652f..2cfd4b24c7f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -29,13 +29,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
 int __nfs_inode_return_delegation(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
 
-struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
+struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
 void nfs_return_all_delegations(struct super_block *sb);
-void nfs_expire_all_delegations(struct nfs4_client *clp);
-void nfs_handle_cb_pathdown(struct nfs4_client *clp);
+void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_handle_cb_pathdown(struct nfs_client *clp);
 
-void nfs_delegation_mark_reclaim(struct nfs4_client *clp);
-void nfs_delegation_reap_unclaimed(struct nfs4_client *clp);
+void nfs_delegation_mark_reclaim(struct nfs_client *clp);
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
 /* NFSv4 delegation-related procedures */
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 873deb96a6c..d05148ec941 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -109,7 +109,7 @@ static struct rpc_pipe_ops idmap_upcall_ops = {
 };
 
 void
-nfs_idmap_new(struct nfs4_client *clp)
+nfs_idmap_new(struct nfs_client *clp)
 {
 	struct idmap *idmap;
 
@@ -138,7 +138,7 @@ nfs_idmap_new(struct nfs4_client *clp)
 }
 
 void
-nfs_idmap_delete(struct nfs4_client *clp)
+nfs_idmap_delete(struct nfs_client *clp)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
@@ -491,27 +491,27 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
 	return (hash);
 }
 
-int nfs_map_name_to_uid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
 }
 
-int nfs_map_group_to_gid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 
-int nfs_map_uid_to_name(struct nfs4_client *clp, __u32 uid, char *buf)
+int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
 	return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 }
-int nfs_map_gid_to_group(struct nfs4_client *clp, __u32 uid, char *buf)
+int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf)
 {
 	struct idmap *idmap = clp->cl_idmap;
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9a102860df3..4e334cb4849 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -43,9 +43,9 @@ enum nfs4_client_state {
 };
 
 /*
- * The nfs4_client identifies our client state to the server.
+ * The nfs_client identifies our client state to the server.
  */
-struct nfs4_client {
+struct nfs_client {
 	struct list_head	cl_servers;	/* Global list of servers */
 	struct in_addr		cl_addr;	/* Server identifier */
 	u64			cl_clientid;	/* constant */
@@ -127,7 +127,7 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status
 struct nfs4_state_owner {
 	spinlock_t	     so_lock;
 	struct list_head     so_list;	 /* per-clientid list of state_owners */
-	struct nfs4_client   *so_client;
+	struct nfs_client    *so_client;
 	u32                  so_id;      /* 32-bit identifier, unique */
 	atomic_t	     so_count;
 
@@ -210,10 +210,10 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 
 /* nfs4proc.c */
 extern int nfs4_map_errors(int err);
-extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *);
-extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *);
-extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *);
-extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
+extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
+extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
+extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
@@ -231,19 +231,19 @@ extern const u32 nfs4_fsinfo_bitmap[2];
 extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
-extern void nfs4_schedule_state_renewal(struct nfs4_client *);
+extern void nfs4_schedule_state_renewal(struct nfs_client *);
 extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
-extern void nfs4_kill_renewd(struct nfs4_client *);
+extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(void *);
 
 /* nfs4state.c */
 extern void init_nfsv4_state(struct nfs_server *);
 extern void destroy_nfsv4_state(struct nfs_server *);
-extern struct nfs4_client *nfs4_get_client(struct in_addr *);
-extern void nfs4_put_client(struct nfs4_client *clp);
-extern struct nfs4_client *nfs4_find_client(struct in_addr *);
-struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp);
-extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
+extern struct nfs_client *nfs4_get_client(struct in_addr *);
+extern void nfs4_put_client(struct nfs_client *clp);
+extern struct nfs_client *nfs4_find_client(struct in_addr *);
+struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
+extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
@@ -252,7 +252,7 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state
 extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct nfs4_state *, mode_t);
 extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
-extern void nfs4_schedule_state_recovery(struct nfs4_client *);
+extern void nfs4_schedule_state_recovery(struct nfs_client *);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b14145b7b87..168f3ffb059 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,7 +64,7 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
+static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
 
 /* Prevent leaks of NFSv4 errors into userland */
 int nfs4_map_errors(int err)
@@ -195,7 +195,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
 
 static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	spin_lock(&clp->cl_lock);
 	if (time_before(clp->cl_last_renewal,timestamp))
 		clp->cl_last_renewal = timestamp;
@@ -792,7 +792,7 @@ out:
 
 int nfs4_recover_expired_lease(struct nfs_server *server)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 
 	if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		nfs4_schedule_state_recovery(clp);
@@ -867,7 +867,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
 {
 	struct nfs_delegation *delegation;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs4_state_owner *sp = NULL;
 	struct nfs4_state *state = NULL;
@@ -953,7 +953,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
 	struct nfs_server       *server = NFS_SERVER(dir);
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	struct nfs4_opendata *opendata;
 	int                     status;
 
@@ -2521,7 +2521,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how)
  */
 static void nfs4_renew_done(struct rpc_task *task, void *data)
 {
-	struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
+	struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp;
 	unsigned long timestamp = (unsigned long)data;
 
 	if (task->tk_status < 0) {
@@ -2543,7 +2543,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
 	.rpc_call_done = nfs4_renew_done,
 };
 
-int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred)
+int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -2555,7 +2555,7 @@ int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred)
 			&nfs4_renew_ops, (void *)jiffies);
 }
 
-int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred)
+int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -2791,7 +2791,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 
 	if (!clp || task->tk_status >= 0)
 		return 0;
@@ -2828,7 +2828,7 @@ static int nfs4_wait_bit_interruptible(void *word)
 	return 0;
 }
 
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
+static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
 {
 	sigset_t oldset;
 	int res;
@@ -2871,7 +2871,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
  */
 int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	int ret = errorcode;
 
 	exception->retry = 0;
@@ -2898,7 +2898,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct
 	return nfs4_map_errors(ret);
 }
 
-int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
+int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
 {
 	nfs4_verifier sc_verifier;
 	struct nfs4_setclientid setclientid = {
@@ -2945,7 +2945,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p
 	return status;
 }
 
-static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred)
+static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct nfs_fsinfo fsinfo;
 	struct rpc_message msg = {
@@ -2969,7 +2969,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cr
 	return status;
 }
 
-int nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred)
+int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	long timeout;
 	int err;
@@ -3106,7 +3106,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	struct nfs_lockt_args arg = {
 		.fh = NFS_FH(inode),
 		.fl = request,
@@ -3513,7 +3513,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_client;
 	unsigned char fl_flags = request->fl_flags;
 	int status;
 
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 5d764d8e6d8..208764069f6 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -61,7 +61,7 @@
 void
 nfs4_renew_state(void *data)
 {
-	struct nfs4_client *clp = (struct nfs4_client *)data;
+	struct nfs_client *clp = (struct nfs_client *)data;
 	struct rpc_cred *cred;
 	long lease, timeout;
 	unsigned long last, now;
@@ -108,7 +108,7 @@ out:
 
 /* Must be called with clp->cl_sem locked for writes */
 void
-nfs4_schedule_state_renewal(struct nfs4_client *clp)
+nfs4_schedule_state_renewal(struct nfs_client *clp)
 {
 	long timeout;
 
@@ -127,7 +127,7 @@ nfs4_schedule_state_renewal(struct nfs4_client *clp)
 void
 nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 
 	if (!clp)
 		return;
@@ -140,7 +140,7 @@ nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 
 /* Must be called with clp->cl_sem locked for writes */
 void
-nfs4_kill_renewd(struct nfs4_client *clp)
+nfs4_kill_renewd(struct nfs_client *clp)
 {
 	down_read(&clp->cl_sem);
 	if (!list_empty(&clp->cl_superblocks)) {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 090a36b07a2..c0b6439f1f7 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -83,10 +83,10 @@ destroy_nfsv4_state(struct nfs_server *server)
  * Since these are allocated/deallocated very rarely, we don't
  * bother putting them in a slab cache...
  */
-static struct nfs4_client *
+static struct nfs_client *
 nfs4_alloc_client(struct in_addr *addr)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 
 	if (nfs_callback_up() < 0)
 		return NULL;
@@ -111,7 +111,7 @@ nfs4_alloc_client(struct in_addr *addr)
 }
 
 static void
-nfs4_free_client(struct nfs4_client *clp)
+nfs4_free_client(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 
@@ -130,9 +130,9 @@ nfs4_free_client(struct nfs4_client *clp)
 	nfs_callback_down();
 }
 
-static struct nfs4_client *__nfs4_find_client(struct in_addr *addr)
+static struct nfs_client *__nfs4_find_client(struct in_addr *addr)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
 		if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) {
 			atomic_inc(&clp->cl_count);
@@ -142,19 +142,19 @@ static struct nfs4_client *__nfs4_find_client(struct in_addr *addr)
 	return NULL;
 }
 
-struct nfs4_client *nfs4_find_client(struct in_addr *addr)
+struct nfs_client *nfs4_find_client(struct in_addr *addr)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	spin_lock(&state_spinlock);
 	clp = __nfs4_find_client(addr);
 	spin_unlock(&state_spinlock);
 	return clp;
 }
 
-struct nfs4_client *
+struct nfs_client *
 nfs4_get_client(struct in_addr *addr)
 {
-	struct nfs4_client *clp, *new = NULL;
+	struct nfs_client *clp, *new = NULL;
 
 	spin_lock(&state_spinlock);
 	for (;;) {
@@ -180,7 +180,7 @@ nfs4_get_client(struct in_addr *addr)
 }
 
 void
-nfs4_put_client(struct nfs4_client *clp)
+nfs4_put_client(struct nfs_client *clp)
 {
 	if (!atomic_dec_and_lock(&clp->cl_count, &state_spinlock))
 		return;
@@ -192,7 +192,7 @@ nfs4_put_client(struct nfs4_client *clp)
 	nfs4_free_client(clp);
 }
 
-static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred)
+static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
 			nfs_callback_tcpport, cred);
@@ -204,13 +204,13 @@ static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred)
 }
 
 u32
-nfs4_alloc_lockowner_id(struct nfs4_client *clp)
+nfs4_alloc_lockowner_id(struct nfs_client *clp)
 {
 	return clp->cl_lockowner_id ++;
 }
 
 static struct nfs4_state_owner *
-nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred)
+nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct nfs4_state_owner *sp = NULL;
 
@@ -224,7 +224,7 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred)
 	return sp;
 }
 
-struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp)
+struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rpc_cred *cred = NULL;
@@ -238,7 +238,7 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp)
 	return cred;
 }
 
-struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp)
+struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 
@@ -251,7 +251,7 @@ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp)
 }
 
 static struct nfs4_state_owner *
-nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred)
+nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	struct nfs4_state_owner *sp, *res = NULL;
 
@@ -294,7 +294,7 @@ nfs4_alloc_state_owner(void)
 void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs4_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_client;
 	spin_lock(&clp->cl_lock);
 	list_del_init(&sp->so_list);
 	spin_unlock(&clp->cl_lock);
@@ -306,7 +306,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
  */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	struct nfs4_state_owner *sp, *new;
 
 	get_rpccred(cred);
@@ -337,7 +337,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
  */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs4_client *clp = sp->so_client;
+	struct nfs_client *clp = sp->so_client;
 	struct rpc_cred *cred = sp->so_cred;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -540,7 +540,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 {
 	struct nfs4_lock_state *lsp;
-	struct nfs4_client *clp = state->owner->so_client;
+	struct nfs_client *clp = state->owner->so_client;
 
 	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL);
 	if (lsp == NULL)
@@ -752,7 +752,7 @@ out:
 
 static int reclaimer(void *);
 
-static inline void nfs4_clear_recover_bit(struct nfs4_client *clp)
+static inline void nfs4_clear_recover_bit(struct nfs_client *clp)
 {
 	smp_mb__before_clear_bit();
 	clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
@@ -764,7 +764,7 @@ static inline void nfs4_clear_recover_bit(struct nfs4_client *clp)
 /*
  * State recovery routine
  */
-static void nfs4_recover_state(struct nfs4_client *clp)
+static void nfs4_recover_state(struct nfs_client *clp)
 {
 	struct task_struct *task;
 
@@ -782,7 +782,7 @@ static void nfs4_recover_state(struct nfs4_client *clp)
 /*
  * Schedule a state recovery attempt
  */
-void nfs4_schedule_state_recovery(struct nfs4_client *clp)
+void nfs4_schedule_state_recovery(struct nfs_client *clp)
 {
 	if (!clp)
 		return;
@@ -879,7 +879,7 @@ out_err:
 	return status;
 }
 
-static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
+static void nfs4_state_mark_reclaim(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state;
@@ -903,7 +903,7 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp)
 
 static int reclaimer(void *ptr)
 {
-	struct nfs4_client *clp = ptr;
+	struct nfs_client *clp = ptr;
 	struct nfs4_state_owner *sp;
 	struct nfs4_state_recovery_ops *ops;
 	struct rpc_cred *cred;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 1dee6ef7e5a..04748ab9ed5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1160,7 +1160,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
 	return 0;
 }
 
-static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client_stateid)
+static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
 {
 	uint32_t *p;
 
@@ -1246,7 +1246,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
 	return 0;
 }
 
-static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_client *client_state)
+static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
 {
         uint32_t *p;
 
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp)
+static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2132,7 +2132,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 }
 
 /* Dummy routine */
-static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
+static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 {
 	uint32_t *p;
 	unsigned int strlen;
@@ -2636,7 +2636,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 	return 0;
 }
 
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
 {
 	uint32_t len, *p;
 
@@ -2660,7 +2660,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 	return 0;
 }
 
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
 {
 	uint32_t len, *p;
 
@@ -3565,7 +3565,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
 	return 0;
 }
 
-static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_client *clp)
+static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 {
 	uint32_t *p;
 	uint32_t opnum;
@@ -4335,7 +4335,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
  * a SETCLIENTID request
  */
 static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
-		struct nfs4_client *clp)
+		struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 63497345806..d03ede5b1ac 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1099,7 +1099,7 @@ static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
 static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
 {
-	struct nfs4_client *clp;
+	struct nfs_client *clp;
 	struct rpc_xprt *xprt = NULL;
 	struct rpc_clnt *clnt = NULL;
 	int err = -EIO;
@@ -1416,7 +1416,7 @@ static inline char *nfs4_dup_path(const struct dentry *dentry)
 static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
 {
 	const struct dentry *dentry = data->dentry;
-	struct nfs4_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs4_state;
 	struct super_block *sb;
 
 	server->fsid = data->fattr->fsid;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6b4a13c7947..4db90df2aed 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -43,7 +43,7 @@ struct nfs_server {
 	 */
 	char			ip_addr[16];
 	char *			mnt_path;
-	struct nfs4_client *	nfs4_state;	/* all NFSv4 state starts here */
+	struct nfs_client *	nfs4_state;	/* all NFSv4 state starts here */
 	struct list_head	nfs4_siblings;	/* List of other nfs_server structs
 						 * that share the same clientid
 						 */
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index 102e5609429..678fe68982e 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -62,15 +62,15 @@ struct idmap_msg {
 #ifdef __KERNEL__
 
 /* Forward declaration to make this header independent of others */
-struct nfs4_client;
+struct nfs_client;
 
-void nfs_idmap_new(struct nfs4_client *);
-void nfs_idmap_delete(struct nfs4_client *);
+void nfs_idmap_new(struct nfs_client *);
+void nfs_idmap_delete(struct nfs_client *);
 
-int nfs_map_name_to_uid(struct nfs4_client *, const char *, size_t, __u32 *);
-int nfs_map_group_to_gid(struct nfs4_client *, const char *, size_t, __u32 *);
-int nfs_map_uid_to_name(struct nfs4_client *, __u32, char *);
-int nfs_map_gid_to_group(struct nfs4_client *, __u32, char *);
+int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *);
+int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *);
+int nfs_map_uid_to_name(struct nfs_client *, __u32, char *);
+int nfs_map_gid_to_group(struct nfs_client *, __u32, char *);
 
 extern unsigned int nfs_idmap_cache_timeout;
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3-70-g09d2


From 7539bbab8062aadc1db95a22b377146843cfa88f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:09 -0400
Subject: NFS: Rename nfs_server::nfs4_state

Rename nfs_server::nfs4_state to nfs_client as it will be used to represent the
client state for NFS2 and NFS3 also.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/delegation.c       | 12 ++++++------
 fs/nfs/nfs4proc.c         | 26 +++++++++++++-------------
 fs/nfs/nfs4renewd.c       |  2 +-
 fs/nfs/nfs4state.c        | 10 +++++-----
 fs/nfs/nfs4xdr.c          | 10 +++++-----
 fs/nfs/super.c            |  6 +++---
 include/linux/nfs_fs_sb.h |  2 +-
 7 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5a1105c258b..cfe239736ac 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -52,7 +52,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 			case -NFS4ERR_EXPIRED:
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 			case -NFS4ERR_STALE_CLIENTID:
-				nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs4_state);
+				nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client);
 				goto out_err;
 		}
 	}
@@ -114,7 +114,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
  */
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int status = 0;
@@ -176,7 +176,7 @@ static void nfs_msync_inode(struct inode *inode)
  */
 int __nfs_inode_return_delegation(struct inode *inode)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int res = 0;
@@ -208,7 +208,7 @@ int __nfs_inode_return_delegation(struct inode *inode)
  */
 void nfs_return_all_delegations(struct super_block *sb)
 {
-	struct nfs_client *clp = NFS_SB(sb)->nfs4_state;
+	struct nfs_client *clp = NFS_SB(sb)->nfs_client;
 	struct nfs_delegation *delegation;
 	struct inode *inode;
 
@@ -310,7 +310,7 @@ static int recall_thread(void *data)
 {
 	struct recall_threadargs *args = (struct recall_threadargs *)data;
 	struct inode *inode = igrab(args->inode);
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 
@@ -423,7 +423,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs4_state;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_delegation *delegation;
 	int res = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 168f3ffb059..b46597fc81e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -195,7 +195,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
 
 static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	spin_lock(&clp->cl_lock);
 	if (time_before(clp->cl_last_renewal,timestamp))
 		clp->cl_last_renewal = timestamp;
@@ -252,7 +252,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	atomic_inc(&sp->so_count);
 	p->o_arg.fh = NFS_FH(dir);
 	p->o_arg.open_flags = flags,
-	p->o_arg.clientid = server->nfs4_state->cl_clientid;
+	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id = sp->so_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
@@ -550,7 +550,7 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
 				/* Don't recall a delegation if it was lost */
-				nfs4_schedule_state_recovery(server->nfs4_state);
+				nfs4_schedule_state_recovery(server->nfs_client);
 				return err;
 		}
 		err = nfs4_handle_exception(server, err, &exception);
@@ -792,7 +792,7 @@ out:
 
 int nfs4_recover_expired_lease(struct nfs_server *server)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 
 	if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
 		nfs4_schedule_state_recovery(clp);
@@ -867,7 +867,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred
 {
 	struct nfs_delegation *delegation;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs4_state_owner *sp = NULL;
 	struct nfs4_state *state = NULL;
@@ -953,7 +953,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
 	struct nfs4_state_owner  *sp;
 	struct nfs4_state     *state = NULL;
 	struct nfs_server       *server = NFS_SERVER(dir);
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_opendata *opendata;
 	int                     status;
 
@@ -1133,7 +1133,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(server->nfs4_state);
+			nfs4_schedule_state_recovery(server->nfs_client);
 			break;
 		default:
 			if (nfs4_async_handle_error(task, server) == -EAGAIN) {
@@ -2791,7 +2791,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 
 	if (!clp || task->tk_status >= 0)
 		return 0;
@@ -2871,7 +2871,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
  */
 int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	int ret = errorcode;
 
 	exception->retry = 0;
@@ -3077,7 +3077,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 		switch (err) {
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
-				nfs4_schedule_state_recovery(server->nfs4_state);
+				nfs4_schedule_state_recovery(server->nfs_client);
 			case 0:
 				return 0;
 		}
@@ -3106,7 +3106,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_lockt_args arg = {
 		.fh = NFS_FH(inode),
 		.fl = request,
@@ -3231,7 +3231,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 			break;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(calldata->server->nfs4_state);
+			nfs4_schedule_state_recovery(calldata->server->nfs_client);
 			break;
 		default:
 			if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) {
@@ -3343,7 +3343,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	if (p->arg.lock_seqid == NULL)
 		goto out_free;
 	p->arg.lock_stateid = &lsp->ls_stateid;
-	p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid;
+	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_id;
 	p->lsp = lsp;
 	atomic_inc(&lsp->ls_count);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 208764069f6..ff947ecb8b8 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,7 +127,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
 void
 nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 
 	if (!clp)
 		return;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c0b6439f1f7..fa51a7d4c02 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -61,7 +61,7 @@ static LIST_HEAD(nfs4_clientid_list);
 void
 init_nfsv4_state(struct nfs_server *server)
 {
-	server->nfs4_state = NULL;
+	server->nfs_client = NULL;
 	INIT_LIST_HEAD(&server->nfs4_siblings);
 }
 
@@ -70,9 +70,9 @@ destroy_nfsv4_state(struct nfs_server *server)
 {
 	kfree(server->mnt_path);
 	server->mnt_path = NULL;
-	if (server->nfs4_state) {
-		nfs4_put_client(server->nfs4_state);
-		server->nfs4_state = NULL;
+	if (server->nfs_client) {
+		nfs4_put_client(server->nfs_client);
+		server->nfs_client = NULL;
 	}
 }
 
@@ -306,7 +306,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
  */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state_owner *sp, *new;
 
 	get_rpccred(cred);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 04748ab9ed5..99926067eca 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -529,7 +529,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs4_state, iap->ia_uid, owner_name);
+		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name);
 		if (owner_namelen < 0) {
 			printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n",
 			       iap->ia_uid);
@@ -541,7 +541,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs4_state, iap->ia_gid, owner_group);
+		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group);
 		if (owner_grouplen < 0) {
 			printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n",
 			       iap->ia_gid);
@@ -3051,9 +3051,9 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	fattr->mode |= fmode;
 	if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_owner(xdr, bitmap, server->nfs4_state, &fattr->uid)) != 0)
+	if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_group(xdr, bitmap, server->nfs4_state, &fattr->gid)) != 0)
+	if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
 		goto xdr_error;
@@ -3254,7 +3254,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 			if (decode_space_limit(xdr, &res->maxsize) < 0)
 				return -EIO;
 	}
-	return decode_ace(xdr, NULL, res->server->nfs4_state);
+	return decode_ace(xdr, NULL, res->server->nfs_client);
 }
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d03ede5b1ac..ab4c78ee840 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1141,7 +1141,7 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
 	clnt = rpc_clone_client(clp->cl_rpcclient);
 	if (!IS_ERR(clnt))
-		server->nfs4_state = clp;
+		server->nfs_client = clp;
 	up_write(&clp->cl_sem);
 	clp = NULL;
 
@@ -1151,7 +1151,7 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 		return clnt;
 	}
 
-	if (server->nfs4_state->cl_idmap == NULL) {
+	if (server->nfs_client->cl_idmap == NULL) {
 		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -1416,7 +1416,7 @@ static inline char *nfs4_dup_path(const struct dentry *dentry)
 static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
 {
 	const struct dentry *dentry = data->dentry;
-	struct nfs_client *clp = server->nfs4_state;
+	struct nfs_client *clp = server->nfs_client;
 	struct super_block *sb;
 
 	server->fsid = data->fattr->fsid;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 4db90df2aed..fc20d6b934f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -43,7 +43,7 @@ struct nfs_server {
 	 */
 	char			ip_addr[16];
 	char *			mnt_path;
-	struct nfs_client *	nfs4_state;	/* all NFSv4 state starts here */
+	struct nfs_client *	nfs_client;	/* all NFSv4 state starts here */
 	struct list_head	nfs4_siblings;	/* List of other nfs_server structs
 						 * that share the same clientid
 						 */
-- 
cgit v1.2.3-70-g09d2


From b7162792b5c0e0f6e91b8997f8e6bbc76ec5420a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:09 -0400
Subject: NFS: Return an error when starting the idmapping pipe

Return an error when starting the idmapping pipe so that we can detect it
failing.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c            | 12 ++++++++----
 fs/nfs/super.c            |  3 ++-
 include/linux/nfs_idmap.h |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index d05148ec941..231c20ffc0f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -108,15 +108,17 @@ static struct rpc_pipe_ops idmap_upcall_ops = {
         .destroy_msg    = idmap_pipe_destroy_msg,
 };
 
-void
+int
 nfs_idmap_new(struct nfs_client *clp)
 {
 	struct idmap *idmap;
+	int error;
 
 	if (clp->cl_idmap != NULL)
-		return;
+		return 0;
+
         if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
-                return;
+                return -ENOMEM;
 
 	snprintf(idmap->idmap_path, sizeof(idmap->idmap_path),
 	    "%s/idmap", clp->cl_rpcclient->cl_pathname);
@@ -124,8 +126,9 @@ nfs_idmap_new(struct nfs_client *clp)
         idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path,
 	    idmap, &idmap_upcall_ops, 0);
         if (IS_ERR(idmap->idmap_dentry)) {
+		error = PTR_ERR(idmap->idmap_dentry);
 		kfree(idmap);
-		return;
+		return error;
 	}
 
         mutex_init(&idmap->idmap_lock);
@@ -135,6 +138,7 @@ nfs_idmap_new(struct nfs_client *clp)
 	idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP;
 
 	clp->cl_idmap = idmap;
+	return 0;
 }
 
 void
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ab4c78ee840..3ee85c4e65d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1136,7 +1136,8 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 		clnt->cl_softrtry = 1;
 		clp->cl_rpcclient = clnt;
 		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-		nfs_idmap_new(clp);
+		if (nfs_idmap_new(clp) < 0)
+			goto out_fail;
 	}
 	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
 	clnt = rpc_clone_client(clp->cl_rpcclient);
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index 678fe68982e..15a9f3b7289 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -64,7 +64,7 @@ struct idmap_msg {
 /* Forward declaration to make this header independent of others */
 struct nfs_client;
 
-void nfs_idmap_new(struct nfs_client *);
+int nfs_idmap_new(struct nfs_client *);
 void nfs_idmap_delete(struct nfs_client *);
 
 int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *);
-- 
cgit v1.2.3-70-g09d2


From 2b3de4411b3ccaeb00018c99d1bbe7203554cf7f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:09 -0400
Subject: NFS: Add a lookupfh NFS RPC op

Add a lookup filehandle NFS RPC op so that a file handle can be looked up
without requiring dentries and inodes and other VFS stuff when doing an NFS4
pathwalk during mounting.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs_xdr.h |  3 +++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b46597fc81e..de2006f754e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1583,6 +1583,52 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	return status;
 }
 
+static int _nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
+		struct qstr *name, struct nfs_fh *fhandle,
+		struct nfs_fattr *fattr)
+{
+	int		       status;
+	struct nfs4_lookup_arg args = {
+		.bitmask = server->attr_bitmask,
+		.dir_fh = dirfh,
+		.name = name,
+	};
+	struct nfs4_lookup_res res = {
+		.server = server,
+		.fattr = fattr,
+		.fh = fhandle,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+
+	nfs_fattr_init(fattr);
+
+	dprintk("NFS call  lookupfh %s\n", name->name);
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("NFS reply lookupfh: %d\n", status);
+	if (status == -NFS4ERR_MOVED)
+		status = -EREMOTE;
+	return status;
+}
+
+static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
+			      struct qstr *name, struct nfs_fh *fhandle,
+			      struct nfs_fattr *fattr)
+{
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = nfs4_handle_exception(server,
+				_nfs4_proc_lookupfh(server, dirfh, name,
+						    fhandle, fattr),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
 static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
@@ -3723,6 +3769,7 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.getroot	= nfs4_proc_get_root,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
+	.lookupfh	= nfs4_proc_lookupfh,
 	.lookup		= nfs4_proc_lookup,
 	.access		= nfs4_proc_access,
 	.readlink	= nfs4_proc_readlink,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 41e5a19199e..26879771831 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -770,6 +770,9 @@ struct nfs_rpc_ops {
 
 	int	(*getroot) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fsinfo *);
+	int	(*lookupfh)(struct nfs_server *, struct nfs_fh *,
+			    struct qstr *, struct nfs_fh *,
+			    struct nfs_fattr *);
 	int	(*getattr) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fattr *);
 	int	(*setattr) (struct dentry *, struct nfs_fattr *,
-- 
cgit v1.2.3-70-g09d2


From e9326dcab413848e70ab746c7c5363da13e5f801 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:10 -0400
Subject: NFS: Add a server capabilities NFS RPC op

Add a set_capabilities NFS RPC op so that the server capabilities can be set.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c       | 1 +
 include/linux/nfs_xdr.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index de2006f754e..850f0851023 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3790,6 +3790,7 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.statfs		= nfs4_proc_statfs,
 	.fsinfo		= nfs4_proc_fsinfo,
 	.pathconf	= nfs4_proc_pathconf,
+	.set_capabilities = nfs4_server_capabilities,
 	.decode_dirent	= nfs4_decode_dirent,
 	.read_setup	= nfs4_proc_read_setup,
 	.read_done	= nfs4_read_done,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 26879771831..dd9ae6761f7 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -809,6 +809,7 @@ struct nfs_rpc_ops {
 			    struct nfs_fsinfo *);
 	int	(*pathconf) (struct nfs_server *, struct nfs_fh *,
 			     struct nfs_pathconf *);
+	int	(*set_capabilities)(struct nfs_server *, struct nfs_fh *);
 	u32 *	(*decode_dirent)(u32 *, struct nfs_entry *, int plus);
 	void	(*read_setup)   (struct nfs_read_data *);
 	int	(*read_done)  (struct rpc_task *, struct nfs_read_data *);
-- 
cgit v1.2.3-70-g09d2


From 24c8dbbb5f777187d660393599641ab3307b4b97 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:10 -0400
Subject: NFS: Generalise the nfs_client structure

Generalise the nfs_client structure by:

 (1) Moving nfs_client to a more general place (nfs_fs_sb.h).

 (2) Renaming its maintenance routines to be non-NFS4 specific.

 (3) Move those maintenance routines to a new non-NFS4 specific file (client.c)
     and move the declarations to internal.h.

 (4) Make nfs_find/get_client() take a full sockaddr_in to include the port
     number (will be required for NFS2/3).

 (5) Make nfs_find/get_client() take the NFS protocol version (again will be
     required to differentiate NFS2, 3 & 4 client records).

Also:

 (6) Make nfs_client construction proceed akin to inodes, marking them as under
     construction and providing a function to indicate completion.

 (7) Make nfs_get_client() wait interruptibly if it finds a client that it can
     share, but that client is currently being constructed.

 (8) Make nfs4_create_client() use (6) and (7) instead of locking cl_sem.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile           |   6 +-
 fs/nfs/callback.c         |   9 +-
 fs/nfs/callback_proc.c    |   9 +-
 fs/nfs/client.c           | 312 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/delegation.c       |   9 +-
 fs/nfs/internal.h         |   6 +
 fs/nfs/nfs4_fs.h          |  52 --------
 fs/nfs/nfs4proc.c         |   2 +-
 fs/nfs/nfs4state.c        | 128 +------------------
 fs/nfs/super.c            |  53 ++++----
 include/linux/nfs_fs.h    |   1 +
 include/linux/nfs_fs_sb.h |  60 +++++++++
 12 files changed, 425 insertions(+), 222 deletions(-)
 create mode 100644 fs/nfs/client.c

(limited to 'include')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 0b572a0c196..3b993a6f816 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,9 +4,9 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
-			   proc.o read.o symlink.o unlink.o write.o \
-			   namespace.o
+nfs-y 			:= client.o dir.o file.o inode.o super.o nfs2xdr.o \
+			   pagelist.o proc.o read.o symlink.o unlink.o \
+			   write.o namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 1b596b6d9dc..a3ee11364db 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -19,6 +19,7 @@
 
 #include "nfs4_fs.h"
 #include "callback.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
@@ -166,15 +167,15 @@ void nfs_callback_down(void)
 
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-	struct in_addr *addr = &rqstp->rq_addr.sin_addr;
+	struct sockaddr_in *addr = &rqstp->rq_addr;
 	struct nfs_client *clp;
 
 	/* Don't talk to strangers */
-	clp = nfs4_find_client(addr);
+	clp = nfs_find_client(addr, 4);
 	if (clp == NULL)
 		return SVC_DROP;
-	dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr));
-	nfs4_put_client(clp);
+	dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr->sin_addr));
+	nfs_put_client(clp);
 	switch (rqstp->rq_authop->flavour) {
 		case RPC_AUTH_NULL:
 			if (rqstp->rq_proc != CB_NULL)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 55d6e2ec157..97cf8f71451 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -10,6 +10,7 @@
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
  
@@ -22,7 +23,7 @@ unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres
 	
 	res->bitmap[0] = res->bitmap[1] = 0;
 	res->status = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs4_find_client(&args->addr->sin_addr);
+	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
 	inode = nfs_delegation_find_inode(clp, &args->fh);
@@ -48,7 +49,7 @@ out_iput:
 	up_read(&nfsi->rwsem);
 	iput(inode);
 out_putclient:
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status));
 	return res->status;
@@ -61,7 +62,7 @@ unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	unsigned res;
 	
 	res = htonl(NFS4ERR_BADHANDLE);
-	clp = nfs4_find_client(&args->addr->sin_addr);
+	clp = nfs_find_client(args->addr, 4);
 	if (clp == NULL)
 		goto out;
 	inode = nfs_delegation_find_inode(clp, &args->fh);
@@ -80,7 +81,7 @@ unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	}
 	iput(inode);
 out_putclient:
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
 	return res;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
new file mode 100644
index 00000000000..cb5e92463bd
--- /dev/null
+++ b/fs/nfs/client.c
@@ -0,0 +1,312 @@
+/* client.c: NFS client sharing and management code
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+
+#include <asm/system.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+
+static DEFINE_SPINLOCK(nfs_client_lock);
+static LIST_HEAD(nfs_client_list);
+static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
+
+/*
+ * Allocate a shared client record
+ *
+ * Since these are allocated/deallocated very rarely, we don't
+ * bother putting them in a slab cache...
+ */
+static struct nfs_client *nfs_alloc_client(const char *hostname,
+					   const struct sockaddr_in *addr,
+					   int nfsversion)
+{
+	struct nfs_client *clp;
+	int error;
+
+	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
+		goto error_0;
+
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
+		__set_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
+		goto error_1;
+	}
+
+	if (nfsversion == 4) {
+		if (nfs_callback_up() < 0)
+			goto error_2;
+		__set_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+	}
+
+	atomic_set(&clp->cl_count, 1);
+	clp->cl_cons_state = NFS_CS_INITING;
+
+	clp->cl_nfsversion = nfsversion;
+	memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+
+	if (hostname) {
+		clp->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+		if (!clp->cl_hostname)
+			goto error_3;
+	}
+
+	INIT_LIST_HEAD(&clp->cl_superblocks);
+	clp->cl_rpcclient = ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_NFS_V4
+	init_rwsem(&clp->cl_sem);
+	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_state_owners);
+	INIT_LIST_HEAD(&clp->cl_unused);
+	spin_lock_init(&clp->cl_lock);
+	INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
+	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
+	clp->cl_boot_time = CURRENT_TIME;
+	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
+#endif
+
+	return clp;
+
+error_3:
+	nfs_callback_down();
+	__clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state);
+error_2:
+	rpciod_down();
+	__clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state);
+error_1:
+	kfree(clp);
+error_0:
+	return NULL;
+}
+
+/*
+ * Destroy a shared client record
+ */
+static void nfs_free_client(struct nfs_client *clp)
+{
+	dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
+
+#ifdef CONFIG_NFS_V4
+	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) {
+		while (!list_empty(&clp->cl_unused)) {
+			struct nfs4_state_owner *sp;
+
+			sp = list_entry(clp->cl_unused.next,
+					struct nfs4_state_owner,
+					so_list);
+			list_del(&sp->so_list);
+			kfree(sp);
+		}
+		BUG_ON(!list_empty(&clp->cl_state_owners));
+		nfs_idmap_delete(clp);
+	}
+#endif
+
+	/* -EIO all pending I/O */
+	if (!IS_ERR(clp->cl_rpcclient))
+		rpc_shutdown_client(clp->cl_rpcclient);
+
+	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
+		nfs_callback_down();
+
+	if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state))
+	rpciod_down();
+
+	kfree(clp->cl_hostname);
+	kfree(clp);
+
+	dprintk("<-- nfs_free_client()\n");
+}
+
+/*
+ * Release a reference to a shared client record
+ */
+void nfs_put_client(struct nfs_client *clp)
+{
+	dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count));
+
+	if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) {
+		list_del(&clp->cl_share_link);
+		spin_unlock(&nfs_client_lock);
+
+		BUG_ON(!list_empty(&clp->cl_superblocks));
+
+		nfs_free_client(clp);
+	}
+}
+
+/*
+ * Find a client by address
+ * - caller must hold nfs_client_lock
+ */
+static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+{
+	struct nfs_client *clp;
+
+	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+		/* Different NFS versions cannot share the same nfs_client */
+		if (clp->cl_nfsversion != nfsversion)
+			continue;
+
+		if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr,
+			   sizeof(clp->cl_addr.sin_addr)) != 0)
+			continue;
+
+		if (clp->cl_addr.sin_port == addr->sin_port)
+			goto found;
+	}
+
+	return NULL;
+
+found:
+	atomic_inc(&clp->cl_count);
+	return clp;
+}
+
+/*
+ * Find a client by IP address and protocol version
+ * - returns NULL if no such client
+ */
+struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+{
+	struct nfs_client *clp;
+
+	spin_lock(&nfs_client_lock);
+	clp = __nfs_find_client(addr, nfsversion);
+	spin_unlock(&nfs_client_lock);
+
+	BUG_ON(clp->cl_cons_state == 0);
+
+	return clp;
+}
+
+/*
+ * Look up a client by IP address and protocol version
+ * - creates a new record if one doesn't yet exist
+ */
+struct nfs_client *nfs_get_client(const char *hostname,
+				  const struct sockaddr_in *addr,
+				  int nfsversion)
+{
+	struct nfs_client *clp, *new = NULL;
+	int error;
+
+	dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n",
+		hostname ?: "", NIPQUAD(addr->sin_addr),
+		addr->sin_port, nfsversion);
+
+	/* see if the client already exists */
+	do {
+		spin_lock(&nfs_client_lock);
+
+		clp = __nfs_find_client(addr, nfsversion);
+		if (clp)
+			goto found_client;
+		if (new)
+			goto install_client;
+
+		spin_unlock(&nfs_client_lock);
+
+		new = nfs_alloc_client(hostname, addr, nfsversion);
+	} while (new);
+
+	return ERR_PTR(-ENOMEM);
+
+	/* install a new client and return with it unready */
+install_client:
+	clp = new;
+	list_add(&clp->cl_share_link, &nfs_client_list);
+	spin_unlock(&nfs_client_lock);
+	dprintk("--> nfs_get_client() = %p [new]\n", clp);
+	return clp;
+
+	/* found an existing client
+	 * - make sure it's ready before returning
+	 */
+found_client:
+	spin_unlock(&nfs_client_lock);
+
+	if (new)
+		nfs_free_client(new);
+
+	if (clp->cl_cons_state == NFS_CS_INITING) {
+		DECLARE_WAITQUEUE(myself, current);
+
+		add_wait_queue(&nfs_client_active_wq, &myself);
+
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (signal_pending(current) ||
+			    clp->cl_cons_state > NFS_CS_READY)
+				break;
+			schedule();
+		}
+
+		remove_wait_queue(&nfs_client_active_wq, &myself);
+
+		if (signal_pending(current)) {
+			nfs_put_client(clp);
+			return ERR_PTR(-ERESTARTSYS);
+		}
+	}
+
+	if (clp->cl_cons_state < NFS_CS_READY) {
+		error = clp->cl_cons_state;
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
+
+	dprintk("--> nfs_get_client() = %p [share]\n", clp);
+	return clp;
+}
+
+/*
+ * Mark a server as ready or failed
+ */
+void nfs_mark_client_ready(struct nfs_client *clp, int state)
+{
+	clp->cl_cons_state = state;
+	wake_up_all(&nfs_client_active_wq);
+}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cfe239736ac..57133678db1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -18,6 +18,7 @@
 
 #include "nfs4_fs.h"
 #include "delegation.h"
+#include "internal.h"
 
 static struct nfs_delegation *nfs_alloc_delegation(void)
 {
@@ -145,7 +146,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 					sizeof(delegation->stateid)) != 0 ||
 				delegation->type != nfsi->delegation->type) {
 			printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
-					__FUNCTION__, NIPQUAD(clp->cl_addr));
+					__FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr));
 			status = -EIO;
 		}
 	}
@@ -254,7 +255,7 @@ restart:
 	}
 out:
 	spin_unlock(&clp->cl_lock);
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 	module_put_and_exit(0);
 }
 
@@ -266,10 +267,10 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
 	atomic_inc(&clp->cl_count);
 	task = kthread_run(nfs_do_expire_all_delegations, clp,
 			"%u.%u.%u.%u-delegreturn",
-			NIPQUAD(clp->cl_addr));
+			NIPQUAD(clp->cl_addr.sin_addr));
 	if (!IS_ERR(task))
 		return;
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 	module_put(THIS_MODULE);
 }
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4802157963f..ac370d5d449 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -15,6 +15,12 @@ struct nfs_clone_mount {
 	rpc_authflavor_t authflavor;
 };
 
+/* client.c */
+extern void nfs_put_client(struct nfs_client *);
+extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
+extern struct nfs_client *nfs_get_client(const char *, const struct sockaddr_in *, int);
+extern void nfs_mark_client_ready(struct nfs_client *, int);
+
 /* nfs4namespace.c */
 #ifdef CONFIG_NFS_V4
 extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4e334cb4849..e7879245361 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,55 +42,6 @@ enum nfs4_client_state {
 	NFS4CLNT_LEASE_EXPIRED,
 };
 
-/*
- * The nfs_client identifies our client state to the server.
- */
-struct nfs_client {
-	struct list_head	cl_servers;	/* Global list of servers */
-	struct in_addr		cl_addr;	/* Server identifier */
-	u64			cl_clientid;	/* constant */
-	nfs4_verifier		cl_confirm;
-	unsigned long		cl_state;
-
-	u32			cl_lockowner_id;
-
-	/*
-	 * The following rwsem ensures exclusive access to the server
-	 * while we recover the state following a lease expiration.
-	 */
-	struct rw_semaphore	cl_sem;
-
-	struct list_head	cl_delegations;
-	struct list_head	cl_state_owners;
-	struct list_head	cl_unused;
-	int			cl_nunused;
-	spinlock_t		cl_lock;
-	atomic_t		cl_count;
-
-	struct rpc_clnt *	cl_rpcclient;
-
-	struct list_head	cl_superblocks;	/* List of nfs_server structs */
-
-	unsigned long		cl_lease_time;
-	unsigned long		cl_last_renewal;
-	struct work_struct	cl_renewd;
-	struct work_struct	cl_recoverd;
-
-	struct rpc_wait_queue	cl_rpcwaitq;
-
-	/* used for the setclientid verifier */
-	struct timespec		cl_boot_time;
-
-	/* idmapper */
-	struct idmap *		cl_idmap;
-
-	/* Our own IP address, as a null-terminated string.
-	 * This is used to generate the clientid, and the callback address.
-	 */
-	char			cl_ipaddr[16];
-	unsigned char		cl_id_uniquifier;
-};
-
 /*
  * struct rpc_sequence ensures that RPC calls are sent in the exact
  * order that they appear on the list.
@@ -239,9 +190,6 @@ extern void nfs4_renew_state(void *);
 /* nfs4state.c */
 extern void init_nfsv4_state(struct nfs_server *);
 extern void destroy_nfsv4_state(struct nfs_server *);
-extern struct nfs_client *nfs4_get_client(struct in_addr *);
-extern void nfs4_put_client(struct nfs_client *clp);
-extern struct nfs_client *nfs4_find_client(struct in_addr *);
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
 extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 850f0851023..803c31b88bb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2968,7 +2968,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 	for(;;) {
 		setclientid.sc_name_len = scnprintf(setclientid.sc_name,
 				sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u",
-				clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr),
+				clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr),
 				cred->cr_ops->cr_name,
 				clp->cl_id_uniquifier);
 		setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index fa51a7d4c02..058811e3955 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -50,12 +50,12 @@
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
+#include "internal.h"
 
 #define OPENOWNER_POOL_SIZE	8
 
 const nfs4_stateid zero_stateid;
 
-static DEFINE_SPINLOCK(state_spinlock);
 static LIST_HEAD(nfs4_clientid_list);
 
 void
@@ -71,127 +71,11 @@ destroy_nfsv4_state(struct nfs_server *server)
 	kfree(server->mnt_path);
 	server->mnt_path = NULL;
 	if (server->nfs_client) {
-		nfs4_put_client(server->nfs_client);
+		nfs_put_client(server->nfs_client);
 		server->nfs_client = NULL;
 	}
 }
 
-/*
- * nfs4_get_client(): returns an empty client structure
- * nfs4_put_client(): drops reference to client structure
- *
- * Since these are allocated/deallocated very rarely, we don't
- * bother putting them in a slab cache...
- */
-static struct nfs_client *
-nfs4_alloc_client(struct in_addr *addr)
-{
-	struct nfs_client *clp;
-
-	if (nfs_callback_up() < 0)
-		return NULL;
-	if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) {
-		nfs_callback_down();
-		return NULL;
-	}
-	memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
-	init_rwsem(&clp->cl_sem);
-	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_state_owners);
-	INIT_LIST_HEAD(&clp->cl_unused);
-	spin_lock_init(&clp->cl_lock);
-	atomic_set(&clp->cl_count, 1);
-	INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
-	INIT_LIST_HEAD(&clp->cl_superblocks);
-	rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
-	clp->cl_rpcclient = ERR_PTR(-EINVAL);
-	clp->cl_boot_time = CURRENT_TIME;
-	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
-	return clp;
-}
-
-static void
-nfs4_free_client(struct nfs_client *clp)
-{
-	struct nfs4_state_owner *sp;
-
-	while (!list_empty(&clp->cl_unused)) {
-		sp = list_entry(clp->cl_unused.next,
-				struct nfs4_state_owner,
-				so_list);
-		list_del(&sp->so_list);
-		kfree(sp);
-	}
-	BUG_ON(!list_empty(&clp->cl_state_owners));
-	nfs_idmap_delete(clp);
-	if (!IS_ERR(clp->cl_rpcclient))
-		rpc_shutdown_client(clp->cl_rpcclient);
-	kfree(clp);
-	nfs_callback_down();
-}
-
-static struct nfs_client *__nfs4_find_client(struct in_addr *addr)
-{
-	struct nfs_client *clp;
-	list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
-		if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) {
-			atomic_inc(&clp->cl_count);
-			return clp;
-		}
-	}
-	return NULL;
-}
-
-struct nfs_client *nfs4_find_client(struct in_addr *addr)
-{
-	struct nfs_client *clp;
-	spin_lock(&state_spinlock);
-	clp = __nfs4_find_client(addr);
-	spin_unlock(&state_spinlock);
-	return clp;
-}
-
-struct nfs_client *
-nfs4_get_client(struct in_addr *addr)
-{
-	struct nfs_client *clp, *new = NULL;
-
-	spin_lock(&state_spinlock);
-	for (;;) {
-		clp = __nfs4_find_client(addr);
-		if (clp != NULL)
-			break;
-		clp = new;
-		if (clp != NULL) {
-			list_add(&clp->cl_servers, &nfs4_clientid_list);
-			new = NULL;
-			break;
-		}
-		spin_unlock(&state_spinlock);
-		new = nfs4_alloc_client(addr);
-		spin_lock(&state_spinlock);
-		if (new == NULL)
-			break;
-	}
-	spin_unlock(&state_spinlock);
-	if (new)
-		nfs4_free_client(new);
-	return clp;
-}
-
-void
-nfs4_put_client(struct nfs_client *clp)
-{
-	if (!atomic_dec_and_lock(&clp->cl_count, &state_spinlock))
-		return;
-	list_del(&clp->cl_servers);
-	spin_unlock(&state_spinlock);
-	BUG_ON(!list_empty(&clp->cl_superblocks));
-	rpc_wake_up(&clp->cl_rpcwaitq);
-	nfs4_kill_renewd(clp);
-	nfs4_free_client(clp);
-}
-
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
@@ -771,11 +655,11 @@ static void nfs4_recover_state(struct nfs_client *clp)
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
 	task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim",
-			NIPQUAD(clp->cl_addr));
+			NIPQUAD(clp->cl_addr.sin_addr));
 	if (!IS_ERR(task))
 		return;
 	nfs4_clear_recover_bit(clp);
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 	module_put(THIS_MODULE);
 }
 
@@ -970,12 +854,12 @@ out:
 	if (status == -NFS4ERR_CB_PATH_DOWN)
 		nfs_handle_cb_pathdown(clp);
 	nfs4_clear_recover_bit(clp);
-	nfs4_put_client(clp);
+	nfs_put_client(clp);
 	module_put_and_exit(0);
 	return 0;
 out_error:
 	printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
-				NIPQUAD(clp->cl_addr.s_addr), -status);
+				NIPQUAD(clp->cl_addr.sin_addr), -status);
 	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 	goto out;
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3ee85c4e65d..f97d7d9c5c3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1104,47 +1104,46 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 	struct rpc_clnt *clnt = NULL;
 	int err = -EIO;
 
-	clp = nfs4_get_client(&server->addr.sin_addr);
+	clp = nfs_get_client(server->hostname, &server->addr, 4);
 	if (!clp) {
 		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
 		return ERR_PTR(err);
 	}
 
 	/* Now create transport and client */
-	down_write(&clp->cl_sem);
-	if (IS_ERR(clp->cl_rpcclient)) {
+	if (clp->cl_cons_state == NFS_CS_INITING) {
 		xprt = xprt_create_proto(proto, &server->addr, timeparms);
 		if (IS_ERR(xprt)) {
-			up_write(&clp->cl_sem);
 			err = PTR_ERR(xprt);
 			dprintk("%s: cannot create RPC transport. Error = %d\n",
 					__FUNCTION__, err);
-			goto out_fail;
+			goto client_init_error;
 		}
 		/* Bind to a reserved port! */
 		xprt->resvport = 1;
 		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
 				server->rpc_ops->version, flavor);
 		if (IS_ERR(clnt)) {
-			up_write(&clp->cl_sem);
 			err = PTR_ERR(clnt);
 			dprintk("%s: cannot create RPC client. Error = %d\n",
 					__FUNCTION__, err);
-			goto out_fail;
+			goto client_init_error;
 		}
 		clnt->cl_intr     = 1;
 		clnt->cl_softrtry = 1;
 		clp->cl_rpcclient = clnt;
 		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-		if (nfs_idmap_new(clp) < 0)
-			goto out_fail;
+		err = nfs_idmap_new(clp);
+		if (err < 0) {
+			dprintk("%s: failed to create idmapper.\n",
+				__FUNCTION__);
+			goto client_init_error;
+		}
+		__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+		nfs_mark_client_ready(clp, 0);
 	}
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+
 	clnt = rpc_clone_client(clp->cl_rpcclient);
-	if (!IS_ERR(clnt))
-		server->nfs_client = clp;
-	up_write(&clp->cl_sem);
-	clp = NULL;
 
 	if (IS_ERR(clnt)) {
 		dprintk("%s: cannot create RPC client. Error = %d\n",
@@ -1152,11 +1151,6 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 		return clnt;
 	}
 
-	if (server->nfs_client->cl_idmap == NULL) {
-		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	if (clnt->cl_auth->au_flavor != flavor) {
 		struct rpc_auth *auth;
 
@@ -1166,11 +1160,16 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 			return (struct rpc_clnt *)auth;
 		}
 	}
+
+	server->nfs_client = clp;
+	down_write(&clp->cl_sem);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
 	return clnt;
 
- out_fail:
-	if (clp)
-		nfs4_put_client(clp);
+client_init_error:
+	nfs_mark_client_ready(clp, err);
+	nfs_put_client(clp);
 	return ERR_PTR(err);
 }
 
@@ -1329,14 +1328,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		goto out_free;
 	}
 
-	/* Fire up rpciod if not yet running */
-	error = rpciod_up();
-	if (error < 0) {
-		dprintk("%s: couldn't start rpciod! Error = %d\n",
-				__FUNCTION__, error);
-		goto out_free;
-	}
-
 	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
@@ -1383,8 +1374,6 @@ static void nfs4_kill_super(struct super_block *sb)
 
 	destroy_nfsv4_state(server);
 
-	rpciod_down();
-
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a36e01cd632..70e1dc9162e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -586,6 +586,7 @@ extern void * nfs_root_data(void);
 #define NFSDBG_FILE		0x0040
 #define NFSDBG_ROOT		0x0080
 #define NFSDBG_CALLBACK		0x0100
+#define NFSDBG_CLIENT		0x0200
 #define NFSDBG_ALL		0xFFFF
 
 #ifdef __KERNEL__
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index fc20d6b934f..a727657e0ad 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -6,6 +6,66 @@
 
 struct nfs_iostats;
 
+/*
+ * The nfs_client identifies our client state to the server.
+ */
+struct nfs_client {
+	atomic_t		cl_count;
+	int			cl_cons_state;	/* current construction state (-ve: init error) */
+#define NFS_CS_READY		0		/* ready to be used */
+#define NFS_CS_INITING		1		/* busy initialising */
+	int			cl_nfsversion;	/* NFS protocol version */
+	unsigned long		cl_res_state;	/* NFS resources state */
+#define NFS_CS_RPCIOD		0		/* - rpciod started */
+#define NFS_CS_CALLBACK		1		/* - callback started */
+#define NFS_CS_IDMAP		2		/* - idmap started */
+	struct sockaddr_in	cl_addr;	/* server identifier */
+	char *			cl_hostname;	/* hostname of server */
+	struct list_head	cl_share_link;	/* link in global client list */
+	struct list_head	cl_superblocks;	/* List of nfs_server structs */
+
+	struct rpc_clnt *	cl_rpcclient;
+
+#ifdef CONFIG_NFS_V4
+	u64			cl_clientid;	/* constant */
+	nfs4_verifier		cl_confirm;
+	unsigned long		cl_state;
+
+	u32			cl_lockowner_id;
+
+	/*
+	 * The following rwsem ensures exclusive access to the server
+	 * while we recover the state following a lease expiration.
+	 */
+	struct rw_semaphore	cl_sem;
+
+	struct list_head	cl_delegations;
+	struct list_head	cl_state_owners;
+	struct list_head	cl_unused;
+	int			cl_nunused;
+	spinlock_t		cl_lock;
+
+	unsigned long		cl_lease_time;
+	unsigned long		cl_last_renewal;
+	struct work_struct	cl_renewd;
+	struct work_struct	cl_recoverd;
+
+	struct rpc_wait_queue	cl_rpcwaitq;
+
+	/* used for the setclientid verifier */
+	struct timespec		cl_boot_time;
+
+	/* idmapper */
+	struct idmap *		cl_idmap;
+
+	/* Our own IP address, as a null-terminated string.
+	 * This is used to generate the clientid, and the callback address.
+	 */
+	char			cl_ipaddr[16];
+	unsigned char		cl_id_uniquifier;
+#endif
+};
+
 /*
  * NFS client parameters stored in the superblock.
  */
-- 
cgit v1.2.3-70-g09d2


From 509de8111656a7d89b4a1a5f430f4460ce510f0f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:11 -0400
Subject: NFS: Add extra const qualifiers

Add some extra const qualifiers into NFS.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/namespace.c        |  3 ++-
 fs/nfs/nfs3proc.c         |  2 +-
 fs/nfs/nfs4namespace.c    |  8 ++++----
 fs/nfs/nfs4proc.c         |  2 +-
 fs/nfs/proc.c             |  2 +-
 fs/nfs/super.c            | 10 +++++-----
 include/linux/nfs_fs_sb.h |  2 +-
 include/linux/nfs_xdr.h   |  6 +++---
 8 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 86b3169c8ca..85d9ed1dcf4 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -172,7 +172,8 @@ void nfs_release_automount_timer(void)
 /*
  * Clone a mountpoint of the appropriate type
  */
-static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
+					   const char *devname,
 					   struct nfs_clone_mount *mountdata)
 {
 #ifdef CONFIG_NFS_V4
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7143b1f82ce..3e5371241ce 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -886,7 +886,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 	return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl);
 }
 
-struct nfs_rpc_ops	nfs_v3_clientops = {
+const struct nfs_rpc_ops nfs_v3_clientops = {
 	.version	= 3,			/* protocol version */
 	.dentry_ops	= &nfs_dentry_operations,
 	.dir_inode_ops	= &nfs3_dir_inode_operations,
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index ea38d27b74e..faed9bcba50 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -23,7 +23,7 @@
 /*
  * Check if fs_root is valid
  */
-static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
+static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
 					 char *buffer, ssize_t buflen)
 {
 	char *end = buffer + buflen;
@@ -34,7 +34,7 @@ static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
 
 	n = pathname->ncomponents;
 	while (--n >= 0) {
-		struct nfs4_string *component = &pathname->components[n];
+		const struct nfs4_string *component = &pathname->components[n];
 		buflen -= component->len + 1;
 		if (buflen < 0)
 			goto Elong;
@@ -60,7 +60,7 @@ Elong:
  */
 static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 					    const struct dentry *dentry,
-					    struct nfs4_fs_locations *locations)
+					    const struct nfs4_fs_locations *locations)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	struct nfs_clone_mount mountdata = {
@@ -108,7 +108,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 
 	loc = 0;
 	while (loc < locations->nlocations && IS_ERR(mnt)) {
-		struct nfs4_fs_location *location = &locations->locations[loc];
+		const struct nfs4_fs_location *location = &locations->locations[loc];
 		char *mnt_path;
 
 		if (location == NULL || location->nservers <= 0 ||
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 803c31b88bb..061be713b20 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3761,7 +3761,7 @@ static struct inode_operations nfs4_file_inode_operations = {
 	.listxattr	= nfs4_listxattr,
 };
 
-struct nfs_rpc_ops	nfs_v4_clientops = {
+const struct nfs_rpc_ops nfs_v4_clientops = {
 	.version	= 4,			/* protocol version */
 	.dentry_ops	= &nfs4_dentry_operations,
 	.dir_inode_ops	= &nfs4_dir_inode_operations,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b3899ea3229..77676903e0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -671,7 +671,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 }
 
 
-struct nfs_rpc_ops	nfs_v2_clientops = {
+const struct nfs_rpc_ops nfs_v2_clientops = {
 	.version	= 2,		       /* protocol version */
 	.dentry_ops	= &nfs_dentry_operations,
 	.dir_inode_ops	= &nfs_dir_inode_operations,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a41d516ed59..c97f3096795 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -329,10 +329,10 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
  */
 static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
 {
-	static struct proc_nfs_info {
+	static const struct proc_nfs_info {
 		int flag;
-		char *str;
-		char *nostr;
+		const char *str;
+		const char *nostr;
 	} nfs_info[] = {
 		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
 		{ NFS_MOUNT_INTR, ",intr", "" },
@@ -342,9 +342,9 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ NFS_MOUNT_NOACL, ",noacl", "" },
 		{ 0, NULL, NULL }
 	};
-	struct proc_nfs_info *nfs_infop;
+	const struct proc_nfs_info *nfs_infop;
 	char buf[12];
-	char *proto;
+	const char *proto;
 
 	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
 	seq_printf(m, ",rsize=%d", nfss->rsize);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a727657e0ad..95f32d5f6e9 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -73,7 +73,7 @@ struct nfs_server {
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
-	struct nfs_rpc_ops *	rpc_ops;	/* NFS protocol vector */
+	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	int			flags;		/* various flags */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index dd9ae6761f7..2426b11b6cc 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -833,9 +833,9 @@ struct nfs_rpc_ops {
 /*
  * Function vectors etc. for the NFS client
  */
-extern struct nfs_rpc_ops	nfs_v2_clientops;
-extern struct nfs_rpc_ops	nfs_v3_clientops;
-extern struct nfs_rpc_ops	nfs_v4_clientops;
+extern const struct nfs_rpc_ops	nfs_v2_clientops;
+extern const struct nfs_rpc_ops	nfs_v3_clientops;
+extern const struct nfs_rpc_ops	nfs_v4_clientops;
 extern struct rpc_version	nfs_version2;
 extern struct rpc_version	nfs_version3;
 extern struct rpc_version	nfs_version4;
-- 
cgit v1.2.3-70-g09d2


From 27951bd26031f6c27d38df9e94623bbe208a2464 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:11 -0400
Subject: NFS: Maintain a common server record for NFS2/3 as well as for NFS4

Maintain a common server record for NFS2/3 as well as for NFS4 so that common
stuff can be moved there from struct nfs_server.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c            | 21 ++++++++++++++++++++-
 include/linux/nfs_fs_sb.h |  2 +-
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index c97f3096795..d1b4a5b36e3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -658,11 +658,19 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned
 static struct rpc_clnt *
 nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 {
+	struct nfs_client	*clp;
 	struct rpc_timeout	timeparms;
 	struct rpc_xprt		*xprt = NULL;
 	struct rpc_clnt		*clnt = NULL;
 	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
 
+	clp = nfs_get_client(server->hostname, &server->addr,
+			     server->rpc_ops->version);
+	if (!clp) {
+		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
+		return ERR_PTR(PTR_ERR(clp));
+	}
+
 	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
 
 	server->retrans_timeo = timeparms.to_initval;
@@ -673,6 +681,8 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 	if (IS_ERR(xprt)) {
 		dprintk("%s: cannot create RPC transport. Error = %ld\n",
 				__FUNCTION__, PTR_ERR(xprt));
+		nfs_mark_client_ready(clp, PTR_ERR(xprt));
+		nfs_put_client(clp);
 		return (struct rpc_clnt *)xprt;
 	}
 	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
@@ -686,9 +696,13 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 	clnt->cl_intr     = 1;
 	clnt->cl_softrtry = 1;
 
+	nfs_mark_client_ready(clp, 0);
+	server->nfs_client = clp;
 	return clnt;
 
 out_fail:
+	nfs_mark_client_ready(clp, PTR_ERR(xprt));
+	nfs_put_client(clp);
 	return clnt;
 }
 
@@ -764,6 +778,7 @@ static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
 	if (server == NULL)
 		goto out_err;
 	memcpy(server, parent, sizeof(*server));
+	atomic_inc(&server->nfs_client->cl_count);
 	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
 	len = strlen(hostname) + 1;
 	server->hostname = kmalloc(len, GFP_KERNEL);
@@ -796,6 +811,7 @@ out_deactivate:
 out_rpciod_down:
 	rpciod_down();
 	kfree(server->hostname);
+	nfs_put_client(server->nfs_client);
 	kfree(server);
 	return simple_set_mnt(mnt, sb);
 kill_rpciod:
@@ -803,6 +819,7 @@ kill_rpciod:
 free_hostname:
 	kfree(server->hostname);
 free_server:
+	nfs_put_client(server->nfs_client);
 	kfree(server);
 out_err:
 	return error;
@@ -1071,6 +1088,7 @@ static void nfs_kill_super(struct super_block *s)
 
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
+	nfs_put_client(server->nfs_client);
 	kfree(server);
 	nfs_release_automount_timer();
 }
@@ -1421,7 +1439,6 @@ static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_c
 	nfs4_server_capabilities(server, &server->fh);
 
 	down_write(&clp->cl_sem);
-	atomic_inc(&clp->cl_count);
 	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
 	up_write(&clp->cl_sem);
 	return sb;
@@ -1476,6 +1493,8 @@ static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nf
 	retrans = 1;
 	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
 
+	nfs_put_client(server->nfs_client);
+	server->nfs_client = NULL;
 	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
 	if (IS_ERR((err = server->client)))
 		goto out_err;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 95f32d5f6e9..e7d7662f51f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -70,6 +70,7 @@ struct nfs_client {
  * NFS client parameters stored in the superblock.
  */
 struct nfs_server {
+	struct nfs_client *	nfs_client;	/* shared client and NFS4 state */
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
@@ -103,7 +104,6 @@ struct nfs_server {
 	 */
 	char			ip_addr[16];
 	char *			mnt_path;
-	struct nfs_client *	nfs_client;	/* all NFSv4 state starts here */
 	struct list_head	nfs4_siblings;	/* List of other nfs_server structs
 						 * that share the same clientid
 						 */
-- 
cgit v1.2.3-70-g09d2


From 8fa5c000d7f986ef9cdc6d95f9f7fcee20e0a7d6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:12 -0400
Subject: NFS: Move rpc_ops from nfs_server to nfs_client

Move the rpc_ops from the nfs_server struct to the nfs_client struct as they're
common to all server records of a particular NFS protocol version.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c              |  2 +-
 fs/nfs/inode.c            |  4 ++--
 fs/nfs/namespace.c        |  6 +++--
 fs/nfs/nfs4proc.c         |  2 +-
 fs/nfs/super.c            | 59 ++++++++++++++++++++++++++---------------------
 include/linux/nfs_fs.h    |  2 +-
 include/linux/nfs_fs_sb.h |  2 +-
 7 files changed, 43 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 067d144d141..19362712452 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1147,7 +1147,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 	}
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
-		error = server->rpc_ops->getattr(server, fhandle, fattr);
+		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
 		if (error < 0)
 			goto out_err;
 	}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6ed018c9aad..771c3b83375 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -237,13 +237,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
-		inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops;
+		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
 		if (S_ISREG(inode->i_mode)) {
 			inode->i_fop = &nfs_file_operations;
 			inode->i_data.a_ops = &nfs_file_aops;
 			inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 		} else if (S_ISDIR(inode->i_mode)) {
-			inode->i_op = NFS_SB(sb)->rpc_ops->dir_inode_ops;
+			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 			inode->i_fop = &nfs_dir_operations;
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 85d9ed1dcf4..d8b8d56266c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -104,7 +104,9 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_follow;
 	/* Look it up again */
 	parent = dget_parent(nd->dentry);
-	err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
+						  &nd->dentry->d_name,
+						  &fh, &fattr);
 	dput(parent);
 	if (err != 0)
 		goto out_err;
@@ -178,7 +180,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 {
 #ifdef CONFIG_NFS_V4
 	struct vfsmount *mnt = NULL;
-	switch (server->rpc_ops->version) {
+	switch (server->nfs_client->cl_nfsversion) {
 		case 2:
 		case 3:
 			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b731b194527..1573eeb07ce 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -758,7 +758,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	}
 	nfs_confirm_seqid(&data->owner->so_seqid, 0);
 	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
-		return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
+		return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr);
 	return 0;
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d1b4a5b36e3..e1e5eab0259 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -252,7 +252,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 	lock_kernel();
 
-	error = server->rpc_ops->statfs(server, fh, &res);
+	error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
 	buf->f_type = NFS_SUPER_MAGIC;
 	if (error < 0)
 		goto out_err;
@@ -343,10 +343,11 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 		{ 0, NULL, NULL }
 	};
 	const struct proc_nfs_info *nfs_infop;
+	struct nfs_client *clp = nfss->nfs_client;
 	char buf[12];
 	const char *proto;
 
-	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
+	seq_printf(m, ",vers=%d", clp->rpc_ops->version);
 	seq_printf(m, ",rsize=%d", nfss->rsize);
 	seq_printf(m, ",wsize=%d", nfss->wsize);
 	if (nfss->acregmin != 3*HZ || showdefaults)
@@ -427,7 +428,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",namelen=%d", nfss->namelen);
 
 #ifdef CONFIG_NFS_V4
-	if (nfss->rpc_ops->version == 4) {
+	if (nfss->nfs_client->cl_nfsversion == 4) {
 		seq_printf(m, "\n\tnfsv4:\t");
 		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
 		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
@@ -503,7 +504,7 @@ nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *f
 	struct nfs_server	*server = NFS_SB(sb);
 	int			error;
 
-	error = server->rpc_ops->getroot(server, rootfh, fsinfo);
+	error = server->nfs_client->rpc_ops->getroot(server, rootfh, fsinfo);
 	if (error < 0) {
 		dprintk("nfs_get_root: getattr error = %d\n", -error);
 		return ERR_PTR(error);
@@ -553,14 +554,14 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 		no_root_error = -ENOMEM;
 		goto out_no_root;
 	}
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_root->d_op = server->nfs_client->rpc_ops->dentry_ops;
 
 	/* mount time stamp, in seconds */
 	server->mount_time = jiffies;
 
 	/* Get some general file system info */
 	if (server->namelen == 0 &&
-	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
+	    server->nfs_client->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
 		server->namelen = pathinfo.max_namelen;
 	/* Work out a lot of parameters */
 	if (server->rsize == 0)
@@ -663,9 +664,14 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 	struct rpc_xprt		*xprt = NULL;
 	struct rpc_clnt		*clnt = NULL;
 	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
+	int			nfsversion = 2;
 
-	clp = nfs_get_client(server->hostname, &server->addr,
-			     server->rpc_ops->version);
+#ifdef CONFIG_NFS_V3
+	if (server->flags & NFS_MOUNT_VER3)
+		nfsversion = 3;
+#endif
+
+	clp = nfs_get_client(server->hostname, &server->addr, nfsversion);
 	if (!clp) {
 		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
 		return ERR_PTR(PTR_ERR(clp));
@@ -676,6 +682,19 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 	server->retrans_timeo = timeparms.to_initval;
 	server->retrans_count = timeparms.to_retries;
 
+	/* Check NFS protocol revision and initialize RPC op vector
+	 * and file handle pool. */
+#ifdef CONFIG_NFS_V3
+	if (nfsversion == 3) {
+		clp->rpc_ops = &nfs_v3_clientops;
+		server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		clp->rpc_ops = &nfs_v2_clientops;
+	}
+#else
+	clp->rpc_ops = &nfs_v2_clientops;
+#endif
+
 	/* create transport and client */
 	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
 	if (IS_ERR(xprt)) {
@@ -686,7 +705,7 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 		return (struct rpc_clnt *)xprt;
 	}
 	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				 server->rpc_ops->version, data->pseudoflavor);
+				 clp->cl_nfsversion, data->pseudoflavor);
 	if (IS_ERR(clnt)) {
 		dprintk("%s: cannot create RPC client. Error = %ld\n",
 				__FUNCTION__, PTR_ERR(xprt));
@@ -750,7 +769,7 @@ static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_cl
 	fsinfo.fattr = data->fattr;
 	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
 		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_root->d_op = server->nfs_client->rpc_ops->dentry_ops;
 	sb->s_flags |= MS_ACTIVE;
 	return server;
 out_put_root:
@@ -865,19 +884,6 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 		return -ENOMEM;
 	strcpy(server->hostname, data->hostname);
 
-	/* Check NFS protocol revision and initialize RPC op vector
-	 * and file handle pool. */
-#ifdef CONFIG_NFS_V3
-	if (server->flags & NFS_MOUNT_VER3) {
-		server->rpc_ops = &nfs_v3_clientops;
-		server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		server->rpc_ops = &nfs_v2_clientops;
-	}
-#else
-	server->rpc_ops = &nfs_v2_clientops;
-#endif
-
 	/* Fill in pseudoflavor for mount version < 5 */
 	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
 		data->pseudoflavor = RPC_AUTH_UNIX;
@@ -888,6 +894,7 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 	server->client = nfs_create_client(server, data);
 	if (IS_ERR(server->client))
 		return PTR_ERR(server->client);
+
 	/* RFC 2623, sec 2.3.2 */
 	if (authflavor != RPC_AUTH_UNIX) {
 		struct rpc_auth *auth;
@@ -1129,6 +1136,8 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 
 	/* Now create transport and client */
 	if (clp->cl_cons_state == NFS_CS_INITING) {
+		clp->rpc_ops = &nfs_v4_clientops;
+
 		xprt = xprt_create_proto(proto, &server->addr, timeparms);
 		if (IS_ERR(xprt)) {
 			err = PTR_ERR(xprt);
@@ -1139,7 +1148,7 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 		/* Bind to a reserved port! */
 		xprt->resvport = 1;
 		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				server->rpc_ops->version, flavor);
+				clp->cl_nfsversion, flavor);
 		if (IS_ERR(clnt)) {
 			err = PTR_ERR(clnt);
 			dprintk("%s: cannot create RPC client. Error = %d\n",
@@ -1215,8 +1224,6 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 	server->acdirmin = data->acdirmin*HZ;
 	server->acdirmax = data->acdirmax*HZ;
 
-	server->rpc_ops = &nfs_v4_clientops;
-
 	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
 
 	server->retrans_timeo = timeparms.to_initval;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 70e1dc9162e..51e9bd90ded 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -215,7 +215,7 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 #define NFS_FH(inode)			(&NFS_I(inode)->fh)
 #define NFS_SERVER(inode)		(NFS_SB(inode->i_sb))
 #define NFS_CLIENT(inode)		(NFS_SERVER(inode)->client)
-#define NFS_PROTO(inode)		(NFS_SERVER(inode)->rpc_ops)
+#define NFS_PROTO(inode)		(NFS_SERVER(inode)->nfs_client->rpc_ops)
 #define NFS_ADDR(inode)			(RPC_PEERADDR(NFS_CLIENT(inode)))
 #define NFS_COOKIEVERF(inode)		(NFS_I(inode)->cookieverf)
 #define NFS_READTIME(inode)		(NFS_I(inode)->read_cache_jiffies)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e7d7662f51f..aae7c117597 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -25,6 +25,7 @@ struct nfs_client {
 	struct list_head	cl_superblocks;	/* List of nfs_server structs */
 
 	struct rpc_clnt *	cl_rpcclient;
+	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
 
 #ifdef CONFIG_NFS_V4
 	u64			cl_clientid;	/* constant */
@@ -74,7 +75,6 @@ struct nfs_server {
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
-	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	int			flags;		/* various flags */
-- 
cgit v1.2.3-70-g09d2


From 5006a76cca8f86c6975c16fcf67e83b8b0eee2b6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:12 -0400
Subject: NFS: Eliminate client_sys in favour of cl_rpcclient

Eliminate nfs_server::client_sys in favour of nfs_client::cl_rpcclient as we
only really need one per server that we're talking to since it doesn't have any
security on it.

The retransmission management variables are also moved to the common struct as
they're required to set up the cl_rpcclient connection.

The NFS2/3 client and client_acl connections are thenceforth derived by cloning
the cl_rpcclient connection and post-applying the authorisation flavour.

The code for setting up the initial common connection has been moved to
client.c as nfs_create_rpc_client().  All the NFS program definition tables are
also moved there as that's where they're now required rather than super.c.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 119 +++++++++++++++++++++++++
 fs/nfs/internal.h         |   2 +
 fs/nfs/nfs3proc.c         |   6 +-
 fs/nfs/proc.c             |   4 +-
 fs/nfs/super.c            | 222 +++++++++++-----------------------------------
 include/linux/nfs_fs_sb.h |   5 +-
 6 files changed, 179 insertions(+), 179 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index cb5e92463bd..c08cab935ad 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -50,6 +50,48 @@ static DEFINE_SPINLOCK(nfs_client_lock);
 static LIST_HEAD(nfs_client_list);
 static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
 
+/*
+ * RPC cruft for NFS
+ */
+static struct rpc_version *nfs_version[5] = {
+	[2]			= &nfs_version2,
+#ifdef CONFIG_NFS_V3
+	[3]			= &nfs_version3,
+#endif
+#ifdef CONFIG_NFS_V4
+	[4]			= &nfs_version4,
+#endif
+};
+
+struct rpc_program nfs_program = {
+	.name			= "nfs",
+	.number			= NFS_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfs_version),
+	.version		= nfs_version,
+	.stats			= &nfs_rpcstat,
+	.pipe_dir_name		= "/nfs",
+};
+
+struct rpc_stat nfs_rpcstat = {
+	.program		= &nfs_program
+};
+
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
+static struct rpc_version *	nfsacl_version[] = {
+	[3]			= &nfsacl_version3,
+};
+
+struct rpc_program		nfsacl_program = {
+	.name			= "nfsacl",
+	.number			= NFS_ACL_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfsacl_version),
+	.version		= nfsacl_version,
+	.stats			= &nfsacl_rpcstat,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
+
 /*
  * Allocate a shared client record
  *
@@ -310,3 +352,80 @@ void nfs_mark_client_ready(struct nfs_client *clp, int state)
 	clp->cl_cons_state = state;
 	wake_up_all(&nfs_client_active_wq);
 }
+
+/*
+ * Initialise the timeout values for a connection
+ */
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
+				    unsigned int timeo, unsigned int retrans)
+{
+	to->to_initval = timeo * HZ / 10;
+	to->to_retries = retrans;
+	if (!to->to_retries)
+		to->to_retries = 2;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (!to->to_initval)
+			to->to_initval = 60 * HZ;
+		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+			to->to_initval = NFS_MAX_TCP_TIMEOUT;
+		to->to_increment = to->to_initval;
+		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+		to->to_exponential = 0;
+		break;
+	case IPPROTO_UDP:
+	default:
+		if (!to->to_initval)
+			to->to_initval = 11 * HZ / 10;
+		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+			to->to_initval = NFS_MAX_UDP_TIMEOUT;
+		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+		to->to_exponential = 1;
+		break;
+	}
+}
+
+/*
+ * Create an RPC client handle
+ */
+int nfs_create_rpc_client(struct nfs_client *clp, int proto,
+			  unsigned int timeo,
+			  unsigned int retrans,
+			  rpc_authflavor_t flavor)
+{
+	struct rpc_timeout	timeparms;
+	struct rpc_xprt		*xprt = NULL;
+	struct rpc_clnt		*clnt = NULL;
+
+	if (!IS_ERR(clp->cl_rpcclient))
+		return 0;
+
+	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+	clp->retrans_timeo = timeparms.to_initval;
+	clp->retrans_count = timeparms.to_retries;
+
+	/* create transport and client */
+	xprt = xprt_create_proto(proto, &clp->cl_addr, &timeparms);
+	if (IS_ERR(xprt)) {
+		dprintk("%s: cannot create RPC transport. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(xprt));
+		return PTR_ERR(xprt);
+	}
+
+	/* Bind to a reserved port! */
+	xprt->resvport = 1;
+	/* Create the client RPC handle */
+	clnt = rpc_create_client(xprt, clp->cl_hostname, &nfs_program,
+				 clp->rpc_ops->version, RPC_AUTH_UNIX);
+	if (IS_ERR(clnt)) {
+		dprintk("%s: cannot create RPC client. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(clnt));
+		return PTR_ERR(clnt);
+	}
+
+	clnt->cl_intr     = 1;
+	clnt->cl_softrtry = 1;
+	clp->cl_rpcclient = clnt;
+	return 0;
+}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ac370d5d449..2f3aa52fbef 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -20,6 +20,8 @@ extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
 extern struct nfs_client *nfs_get_client(const char *, const struct sockaddr_in *, int);
 extern void nfs_mark_client_ready(struct nfs_client *, int);
+extern int nfs_create_rpc_client(struct nfs_client *, int, unsigned int,
+				 unsigned int, rpc_authflavor_t);
 
 /* nfs4namespace.c */
 #ifdef CONFIG_NFS_V4
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3e5371241ce..0622af0122b 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -90,8 +90,8 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	int	status;
 
 	status = do_proc_get_root(server->client, fhandle, info);
-	if (status && server->client_sys != server->client)
-		status = do_proc_get_root(server->client_sys, fhandle, info);
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info);
 	return status;
 }
 
@@ -785,7 +785,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("NFS call  fsinfo\n");
 	nfs_fattr_init(info->fattr);
-	status = rpc_call_sync(server->client_sys, &msg, 0);
+	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
 	dprintk("NFS reply fsinfo: %d\n", status);
 	return status;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77676903e0f..5a8b9407ee9 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -66,14 +66,14 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("%s: call getattr\n", __FUNCTION__);
 	nfs_fattr_init(fattr);
-	status = rpc_call_sync(server->client_sys, &msg, 0);
+	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
 	dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
 	if (status)
 		return status;
 	dprintk("%s: call statfs\n", __FUNCTION__);
 	msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
 	msg.rpc_resp = &fsinfo;
-	status = rpc_call_sync(server->client_sys, &msg, 0);
+	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
 	dprintk("%s: reply statfs: %d\n", __FUNCTION__, status);
 	if (status)
 		return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e1e5eab0259..85583414a3c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -60,52 +60,6 @@
  */
 #define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
 
-/*
- * RPC cruft for NFS
- */
-static struct rpc_version * nfs_version[] = {
-	NULL,
-	NULL,
-	&nfs_version2,
-#if defined(CONFIG_NFS_V3)
-	&nfs_version3,
-#elif defined(CONFIG_NFS_V4)
-	NULL,
-#endif
-#if defined(CONFIG_NFS_V4)
-	&nfs_version4,
-#endif
-};
-
-static struct rpc_program nfs_program = {
-	.name			= "nfs",
-	.number			= NFS_PROGRAM,
-	.nrvers			= ARRAY_SIZE(nfs_version),
-	.version		= nfs_version,
-	.stats			= &nfs_rpcstat,
-	.pipe_dir_name		= "/nfs",
-};
-
-struct rpc_stat nfs_rpcstat = {
-	.program		= &nfs_program
-};
-
-
-#ifdef CONFIG_NFS_V3_ACL
-static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
-static struct rpc_version *	nfsacl_version[] = {
-	[3]			= &nfsacl_version3,
-};
-
-struct rpc_program		nfsacl_program = {
-	.name =			"nfsacl",
-	.number =		NFS_ACL_PROGRAM,
-	.nrvers =		ARRAY_SIZE(nfsacl_version),
-	.version =		nfsacl_version,
-	.stats =		&nfsacl_rpcstat,
-};
-#endif  /* CONFIG_NFS_V3_ACL */
-
 static void nfs_umount_begin(struct vfsmount *, int);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
@@ -376,8 +330,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 			proto = buf;
 	}
 	seq_printf(m, ",proto=%s", proto);
-	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
-	seq_printf(m, ",retrans=%u", nfss->retrans_count);
+	seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ);
+	seq_printf(m, ",retrans=%u", clp->retrans_count);
 	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
 }
 
@@ -621,38 +575,6 @@ out_no_root:
 	return no_root_error;
 }
 
-/*
- * Initialise the timeout values for a connection
- */
-static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
-{
-	to->to_initval = timeo * HZ / 10;
-	to->to_retries = retrans;
-	if (!to->to_retries)
-		to->to_retries = 2;
-
-	switch (proto) {
-	case IPPROTO_TCP:
-		if (!to->to_initval)
-			to->to_initval = 60 * HZ;
-		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
-			to->to_initval = NFS_MAX_TCP_TIMEOUT;
-		to->to_increment = to->to_initval;
-		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
-		to->to_exponential = 0;
-		break;
-	case IPPROTO_UDP:
-	default:
-		if (!to->to_initval)
-			to->to_initval = 11 * HZ / 10;
-		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
-			to->to_initval = NFS_MAX_UDP_TIMEOUT;
-		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
-		to->to_exponential = 1;
-		break;
-	}
-}
-
 /*
  * Create an RPC client handle.
  */
@@ -660,11 +582,10 @@ static struct rpc_clnt *
 nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 {
 	struct nfs_client	*clp;
-	struct rpc_timeout	timeparms;
-	struct rpc_xprt		*xprt = NULL;
-	struct rpc_clnt		*clnt = NULL;
+	struct rpc_clnt		*clnt;
 	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
 	int			nfsversion = 2;
+	int			err;
 
 #ifdef CONFIG_NFS_V3
 	if (server->flags & NFS_MOUNT_VER3)
@@ -677,52 +598,54 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
 		return ERR_PTR(PTR_ERR(clp));
 	}
 
-	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
-	/* Check NFS protocol revision and initialize RPC op vector
-	 * and file handle pool. */
+	if (clp->cl_cons_state == NFS_CS_INITING) {
+		/* Check NFS protocol revision and initialize RPC op
+		 * vector and file handle pool. */
 #ifdef CONFIG_NFS_V3
-	if (nfsversion == 3) {
-		clp->rpc_ops = &nfs_v3_clientops;
-		server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		clp->rpc_ops = &nfs_v2_clientops;
-	}
+		if (nfsversion == 3) {
+			clp->rpc_ops = &nfs_v3_clientops;
+			server->caps |= NFS_CAP_READDIRPLUS;
+		} else {
+			clp->rpc_ops = &nfs_v2_clientops;
+		}
 #else
-	clp->rpc_ops = &nfs_v2_clientops;
+		clp->rpc_ops = &nfs_v2_clientops;
 #endif
 
-	/* create transport and client */
-	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
-	if (IS_ERR(xprt)) {
-		dprintk("%s: cannot create RPC transport. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		nfs_mark_client_ready(clp, PTR_ERR(xprt));
-		nfs_put_client(clp);
-		return (struct rpc_clnt *)xprt;
+		/* create transport and client */
+		err = nfs_create_rpc_client(clp, proto, data->timeo,
+					    data->retrans, RPC_AUTH_UNIX);
+		if (err < 0)
+			goto client_init_error;
+
+		nfs_mark_client_ready(clp, 0);
 	}
-	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				 clp->cl_nfsversion, data->pseudoflavor);
+
+	/* create an nfs_server-specific client */
+	clnt = rpc_clone_client(clp->cl_rpcclient);
 	if (IS_ERR(clnt)) {
-		dprintk("%s: cannot create RPC client. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		goto out_fail;
+		dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__);
+		nfs_put_client(clp);
+		return ERR_PTR(PTR_ERR(clnt));
 	}
 
-	clnt->cl_intr     = 1;
-	clnt->cl_softrtry = 1;
+	if (data->pseudoflavor != clp->cl_rpcclient->cl_auth->au_flavor) {
+		struct rpc_auth *auth;
+
+		auth = rpcauth_create(data->pseudoflavor, server->client);
+		if (IS_ERR(auth)) {
+			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+			return ERR_PTR(PTR_ERR(auth));
+		}
+	}
 
-	nfs_mark_client_ready(clp, 0);
 	server->nfs_client = clp;
 	return clnt;
 
-out_fail:
-	nfs_mark_client_ready(clp, PTR_ERR(xprt));
+client_init_error:
+	nfs_mark_client_ready(clp, err);
 	nfs_put_client(clp);
-	return clnt;
+	return ERR_PTR(err);
 }
 
 /*
@@ -741,7 +664,7 @@ static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_cl
 	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
 	sb->s_maxbytes = data->sb->s_maxbytes;
 
-	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->client_acl = ERR_PTR(-EINVAL);
 	server->io_stats = nfs_alloc_iostats();
 	if (server->io_stats == NULL)
 		goto out;
@@ -750,11 +673,6 @@ static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_cl
 	if (IS_ERR((err = server->client)))
 		goto out;
 
-	if (!IS_ERR(parent->client_sys)) {
-		server->client_sys = rpc_clone_client(parent->client_sys);
-		if (IS_ERR((err = server->client_sys)))
-			goto out;
-	}
 	if (!IS_ERR(parent->client_acl)) {
 		server->client_acl = rpc_clone_client(parent->client_acl);
 		if (IS_ERR((err = server->client_acl)))
@@ -813,7 +731,7 @@ static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
 		error = PTR_ERR(sb);
 		goto kill_rpciod;
 	}
-		
+
 	if (sb->s_root)
 		goto out_rpciod_down;
 
@@ -896,19 +814,6 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 		return PTR_ERR(server->client);
 
 	/* RFC 2623, sec 2.3.2 */
-	if (authflavor != RPC_AUTH_UNIX) {
-		struct rpc_auth *auth;
-
-		server->client_sys = rpc_clone_client(server->client);
-		if (IS_ERR(server->client_sys))
-			return PTR_ERR(server->client_sys);
-		auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
-		if (IS_ERR(auth))
-			return PTR_ERR(auth);
-	} else {
-		atomic_inc(&server->client->cl_count);
-		server->client_sys = server->client;
-	}
 	if (server->flags & NFS_MOUNT_VER3) {
 #ifdef CONFIG_NFS_V3_ACL
 		if (!(server->flags & NFS_MOUNT_NOACL)) {
@@ -1012,7 +917,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 		goto out_err_noserver;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->client = server->client_acl = ERR_PTR(-EINVAL);
 
 	root = &server->fh;
 	if (data->flags & NFS_MOUNT_VER3)
@@ -1083,8 +988,6 @@ static void nfs_kill_super(struct super_block *s)
 
 	if (!IS_ERR(server->client))
 		rpc_shutdown_client(server->client);
-	if (!IS_ERR(server->client_sys))
-		rpc_shutdown_client(server->client_sys);
 	if (!IS_ERR(server->client_acl))
 		rpc_shutdown_client(server->client_acl);
 
@@ -1121,10 +1024,9 @@ static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
 
 #ifdef CONFIG_NFS_V4
 static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
-	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
+	int timeo, int retrans, int proto, rpc_authflavor_t flavor)
 {
 	struct nfs_client *clp;
-	struct rpc_xprt *xprt = NULL;
 	struct rpc_clnt *clnt = NULL;
 	int err = -EIO;
 
@@ -1138,26 +1040,10 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 	if (clp->cl_cons_state == NFS_CS_INITING) {
 		clp->rpc_ops = &nfs_v4_clientops;
 
-		xprt = xprt_create_proto(proto, &server->addr, timeparms);
-		if (IS_ERR(xprt)) {
-			err = PTR_ERR(xprt);
-			dprintk("%s: cannot create RPC transport. Error = %d\n",
-					__FUNCTION__, err);
+		err = nfs_create_rpc_client(clp, proto, timeo, retrans, flavor);
+		if (err < 0)
 			goto client_init_error;
-		}
-		/* Bind to a reserved port! */
-		xprt->resvport = 1;
-		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				clp->cl_nfsversion, flavor);
-		if (IS_ERR(clnt)) {
-			err = PTR_ERR(clnt);
-			dprintk("%s: cannot create RPC client. Error = %d\n",
-					__FUNCTION__, err);
-			goto client_init_error;
-		}
-		clnt->cl_intr     = 1;
-		clnt->cl_softrtry = 1;
-		clp->cl_rpcclient = clnt;
+
 		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
 		err = nfs_idmap_new(clp);
 		if (err < 0) {
@@ -1205,7 +1091,6 @@ client_init_error:
 static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
 {
 	struct nfs_server *server;
-	struct rpc_timeout timeparms;
 	rpc_authflavor_t authflavour;
 	int err = -EIO;
 
@@ -1224,11 +1109,6 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 	server->acdirmin = data->acdirmin*HZ;
 	server->acdirmax = data->acdirmax*HZ;
 
-	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
 	/* Now create transport and client */
 	authflavour = RPC_AUTH_UNIX;
 	if (data->auth_flavourlen != 0) {
@@ -1244,7 +1124,8 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 		}
 	}
 
-	server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
+	server->client = nfs4_create_client(server, data->timeo, data->retrans,
+					    data->proto, authflavour);
 	if (IS_ERR(server->client)) {
 		err = PTR_ERR(server->client);
 			dprintk("%s: cannot create RPC client. Error = %d\n",
@@ -1318,7 +1199,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		return -ENOMEM;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->client = server->client_acl = ERR_PTR(-EINVAL);
 
 	p = nfs_copy_user_string(NULL, &data->hostname, 256);
 	if (IS_ERR(p))
@@ -1489,7 +1370,6 @@ err:
 static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
 {
 	struct nfs_server *server = NFS_SB(sb);
-	struct rpc_timeout timeparms;
 	int proto, timeo, retrans;
 	void *err;
 
@@ -1498,11 +1378,11 @@ static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nf
 	   set the timeouts and retries to low values */
 	timeo = 2;
 	retrans = 1;
-	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
 
 	nfs_put_client(server->nfs_client);
 	server->nfs_client = NULL;
-	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+	server->client = nfs4_create_client(server, timeo, retrans, proto,
+					    data->authflavor);
 	if (IS_ERR((err = server->client)))
 		goto out_err;
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index aae7c117597..d404ceca916 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -26,6 +26,8 @@ struct nfs_client {
 
 	struct rpc_clnt *	cl_rpcclient;
 	const struct nfs_rpc_ops *rpc_ops;	/* NFS protocol vector */
+	unsigned long		retrans_timeo;	/* retransmit timeout */
+	unsigned int		retrans_count;	/* number of retransmit tries */
 
 #ifdef CONFIG_NFS_V4
 	u64			cl_clientid;	/* constant */
@@ -73,7 +75,6 @@ struct nfs_client {
 struct nfs_server {
 	struct nfs_client *	nfs_client;	/* shared client and NFS4 state */
 	struct rpc_clnt *	client;		/* RPC client handle */
-	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
@@ -90,8 +91,6 @@ struct nfs_server {
 	unsigned int		acregmax;
 	unsigned int		acdirmin;
 	unsigned int		acdirmax;
-	unsigned long		retrans_timeo;	/* retransmit timeout */
-	unsigned int		retrans_count;	/* number of retransmit tries */
 	unsigned int		namelen;
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
-- 
cgit v1.2.3-70-g09d2


From 54ceac4515986030c2502960be620198dd8fe25b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 22 Aug 2006 20:06:13 -0400
Subject: NFS: Share NFS superblocks per-protocol per-server per-FSID

The attached patch makes NFS share superblocks between mounts from the same
server and FSID over the same protocol.

It does this by creating each superblock with a false root and returning the
real root dentry in the vfsmount presented by get_sb(). The root dentry set
starts off as an anonymous dentry if we don't already have the dentry for its
inode, otherwise it simply returns the dentry we already have.

We may thus end up with several trees of dentries in the superblock, and if at
some later point one of anonymous tree roots is discovered by normal filesystem
activity to be located in another tree within the superblock, the anonymous
root is named and materialises attached to the second tree at the appropriate
point.

Why do it this way? Why not pass an extra argument to the mount() syscall to
indicate the subpath and then pathwalk from the server root to the desired
directory? You can't guarantee this will work for two reasons:

 (1) The root and intervening nodes may not be accessible to the client.

     With NFS2 and NFS3, for instance, mountd is called on the server to get
     the filehandle for the tip of a path. mountd won't give us handles for
     anything we don't have permission to access, and so we can't set up NFS
     inodes for such nodes, and so can't easily set up dentries (we'd have to
     have ghost inodes or something).

     With this patch we don't actually create dentries until we get handles
     from the server that we can use to set up their inodes, and we don't
     actually bind them into the tree until we know for sure where they go.

 (2) Inaccessible symbolic links.

     If we're asked to mount two exports from the server, eg:

	mount warthog:/warthog/aaa/xxx /mmm
	mount warthog:/warthog/bbb/yyy /nnn

     We may not be able to access anything nearer the root than xxx and yyy,
     but we may find out later that /mmm/www/yyy, say, is actually the same
     directory as the one mounted on /nnn. What we might then find out, for
     example, is that /warthog/bbb was actually a symbolic link to
     /warthog/aaa/xxx/www, but we can't actually determine that by talking to
     the server until /warthog is made available by NFS.

     This would lead to having constructed an errneous dentry tree which we
     can't easily fix. We can end up with a dentry marked as a directory when
     it should actually be a symlink, or we could end up with an apparently
     hardlinked directory.

     With this patch we need not make assumptions about the type of a dentry
     for which we can't retrieve information, nor need we assume we know its
     place in the grand scheme of things until we actually see that place.

This patch reduces the possibility of aliasing in the inode and page caches for
inodes that may be accessed by more than one NFS export. It also reduces the
number of superblocks required for NFS where there are many NFS exports being
used from a server (home directory server + autofs for example).

This in turn makes it simpler to do local caching of network filesystems, as it
can then be guaranteed that there won't be links from multiple inodes in
separate superblocks to the same cache file.

Obviously, cache aliasing between different levels of NFS protocol could still
be a problem, but at least that gives us another key to use when indexing the
cache.

This patch makes the following changes:

 (1) The server record construction/destruction has been abstracted out into
     its own set of functions to make things easier to get right.  These have
     been moved into fs/nfs/client.c.

     All the code in fs/nfs/client.c has to do with the management of
     connections to servers, and doesn't touch superblocks in any way; the
     remaining code in fs/nfs/super.c has to do with VFS superblock management.

 (2) The sequence of events undertaken by NFS mount is now reordered:

     (a) A volume representation (struct nfs_server) is allocated.

     (b) A server representation (struct nfs_client) is acquired.  This may be
     	 allocated or shared, and is keyed on server address, port and NFS
     	 version.

     (c) If allocated, the client representation is initialised.  The state
     	 member variable of nfs_client is used to prevent a race during
     	 initialisation from two mounts.

     (d) For NFS4 a simple pathwalk is performed, walking from FH to FH to find
     	 the root filehandle for the mount (fs/nfs/getroot.c).  For NFS2/3 we
     	 are given the root FH in advance.

     (e) The volume FSID is probed for on the root FH.

     (f) The volume representation is initialised from the FSINFO record
     	 retrieved on the root FH.

     (g) sget() is called to acquire a superblock.  This may be allocated or
     	 shared, keyed on client pointer and FSID.

     (h) If allocated, the superblock is initialised.

     (i) If the superblock is shared, then the new nfs_server record is
     	 discarded.

     (j) The root dentry for this mount is looked up from the root FH.

     (k) The root dentry for this mount is assigned to the vfsmount.

 (3) nfs_readdir_lookup() creates dentries for each of the entries readdir()
     returns; this function now attaches disconnected trees from alternate
     roots that happen to be discovered attached to a directory being read (in
     the same way nfs_lookup() is made to do for lookup ops).

     The new d_materialise_unique() function is now used to do this, thus
     permitting the whole thing to be done under one set of locks, and thus
     avoiding any race between mount and lookup operations on the same
     directory.

 (4) The client management code uses a new debug facility: NFSDBG_CLIENT which
     is set by echoing 1024 to /proc/net/sunrpc/nfs_debug.

 (5) Clone mounts are now called xdev mounts.

 (6) Use the dentry passed to the statfs() op as the handle for retrieving fs
     statistics rather than the root dentry of the superblock (which is now a
     dummy).

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile           |    2 +-
 fs/nfs/client.c           |  735 ++++++++++++++++++++++++++-
 fs/nfs/dir.c              |   16 +-
 fs/nfs/getroot.c          |  306 ++++++++++++
 fs/nfs/idmap.c            |    3 +-
 fs/nfs/inode.c            |    2 +-
 fs/nfs/internal.h         |   82 +--
 fs/nfs/namespace.c        |   25 +-
 fs/nfs/nfs3proc.c         |    2 +-
 fs/nfs/nfs4_fs.h          |    6 -
 fs/nfs/nfs4namespace.c    |  110 ++++-
 fs/nfs/nfs4proc.c         |   59 +--
 fs/nfs/nfs4renewd.c       |   13 -
 fs/nfs/nfs4state.c        |   18 -
 fs/nfs/read.c             |    2 +-
 fs/nfs/super.c            | 1207 +++++++++++++++++----------------------------
 fs/nfs/write.c            |    2 +-
 include/linux/nfs_fs_sb.h |   21 +-
 18 files changed, 1655 insertions(+), 956 deletions(-)
 create mode 100644 fs/nfs/getroot.c

(limited to 'include')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 3b993a6f816..f4580b44eef 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= client.o dir.o file.o inode.o super.o nfs2xdr.o \
+nfs-y 			:= client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
 			   pagelist.o proc.o read.o symlink.o unlink.o \
 			   write.o namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c08cab935ad..dafba608c0a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -48,6 +48,7 @@
 
 static DEFINE_SPINLOCK(nfs_client_lock);
 static LIST_HEAD(nfs_client_list);
+static LIST_HEAD(nfs_volume_list);
 static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq);
 
 /*
@@ -268,9 +269,9 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  */
-struct nfs_client *nfs_get_client(const char *hostname,
-				  const struct sockaddr_in *addr,
-				  int nfsversion)
+static struct nfs_client *nfs_get_client(const char *hostname,
+					 const struct sockaddr_in *addr,
+					 int nfsversion)
 {
 	struct nfs_client *clp, *new = NULL;
 	int error;
@@ -340,6 +341,8 @@ found_client:
 		return ERR_PTR(error);
 	}
 
+	BUG_ON(clp->cl_cons_state != NFS_CS_READY);
+
 	dprintk("--> nfs_get_client() = %p [share]\n", clp);
 	return clp;
 }
@@ -347,7 +350,7 @@ found_client:
 /*
  * Mark a server as ready or failed
  */
-void nfs_mark_client_ready(struct nfs_client *clp, int state)
+static void nfs_mark_client_ready(struct nfs_client *clp, int state)
 {
 	clp->cl_cons_state = state;
 	wake_up_all(&nfs_client_active_wq);
@@ -389,10 +392,10 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 /*
  * Create an RPC client handle
  */
-int nfs_create_rpc_client(struct nfs_client *clp, int proto,
-			  unsigned int timeo,
-			  unsigned int retrans,
-			  rpc_authflavor_t flavor)
+static int nfs_create_rpc_client(struct nfs_client *clp, int proto,
+						unsigned int timeo,
+						unsigned int retrans,
+						rpc_authflavor_t flavor)
 {
 	struct rpc_timeout	timeparms;
 	struct rpc_xprt		*xprt = NULL;
@@ -429,3 +432,719 @@ int nfs_create_rpc_client(struct nfs_client *clp, int proto,
 	clp->cl_rpcclient = clnt;
 	return 0;
 }
+
+/*
+ * Version 2 or 3 client destruction
+ */
+static void nfs_destroy_server(struct nfs_server *server)
+{
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
+
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_down();	/* release rpc.lockd */
+}
+
+/*
+ * Version 2 or 3 lockd setup
+ */
+static int nfs_start_lockd(struct nfs_server *server)
+{
+	int error = 0;
+
+	if (server->nfs_client->cl_nfsversion > 3)
+		goto out;
+	if (server->flags & NFS_MOUNT_NONLM)
+		goto out;
+	error = lockd_up();
+	if (error < 0)
+		server->flags |= NFS_MOUNT_NONLM;
+	else
+		server->destroy = nfs_destroy_server;
+out:
+	return error;
+}
+
+/*
+ * Initialise an NFSv3 ACL client connection
+ */
+#ifdef CONFIG_NFS_V3_ACL
+static void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	if (server->nfs_client->cl_nfsversion != 3)
+		goto out_noacl;
+	if (server->flags & NFS_MOUNT_NOACL)
+		goto out_noacl;
+
+	server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+	if (IS_ERR(server->client_acl))
+		goto out_noacl;
+
+	/* No errors! Assume that Sun nfsacls are supported */
+	server->caps |= NFS_CAP_ACLS;
+	return;
+
+out_noacl:
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#else
+static inline void nfs_init_server_aclclient(struct nfs_server *server)
+{
+	server->flags &= ~NFS_MOUNT_NOACL;
+	server->caps &= ~NFS_CAP_ACLS;
+}
+#endif
+
+/*
+ * Create a general RPC client
+ */
+static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	server->client = rpc_clone_client(clp->cl_rpcclient);
+	if (IS_ERR(server->client)) {
+		dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__);
+		return PTR_ERR(server->client);
+	}
+
+	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
+		struct rpc_auth *auth;
+
+		auth = rpcauth_create(pseudoflavour, server->client);
+		if (IS_ERR(auth)) {
+			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+			return PTR_ERR(auth);
+		}
+	}
+	server->client->cl_softrtry = 0;
+	if (server->flags & NFS_MOUNT_SOFT)
+		server->client->cl_softrtry = 1;
+
+	server->client->cl_intr = 0;
+	if (server->flags & NFS4_MOUNT_INTR)
+		server->client->cl_intr = 1;
+
+	return 0;
+}
+
+/*
+ * Initialise an NFS2 or NFS3 client
+ */
+static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data)
+{
+	int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
+	int error;
+
+	if (clp->cl_cons_state == NFS_CS_READY) {
+		/* the client is already initialised */
+		dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp);
+		return 0;
+	}
+
+	/* Check NFS protocol revision and initialize RPC op vector */
+	clp->rpc_ops = &nfs_v2_clientops;
+#ifdef CONFIG_NFS_V3
+	if (clp->cl_nfsversion == 3)
+		clp->rpc_ops = &nfs_v3_clientops;
+#endif
+	/*
+	 * Create a client RPC handle for doing FSSTAT with UNIX auth only
+	 * - RFC 2623, sec 2.3.2
+	 */
+	error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans,
+			RPC_AUTH_UNIX);
+	if (error < 0)
+		goto error;
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+	return 0;
+
+error:
+	nfs_mark_client_ready(clp, error);
+	dprintk("<-- nfs_init_client() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Create a version 2 or 3 client
+ */
+static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data)
+{
+	struct nfs_client *clp;
+	int error, nfsvers = 2;
+
+	dprintk("--> nfs_init_server()\n");
+
+#ifdef CONFIG_NFS_V3
+	if (data->flags & NFS_MOUNT_VER3)
+		nfsvers = 3;
+#endif
+
+	/* Allocate or find a client reference we can use */
+	clp = nfs_get_client(data->hostname, &data->addr, nfsvers);
+	if (IS_ERR(clp)) {
+		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
+		return PTR_ERR(clp);
+	}
+
+	error = nfs_init_client(clp, data);
+	if (error < 0)
+		goto error;
+
+	server->nfs_client = clp;
+
+	/* Initialise the client representation from the mount data */
+	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+
+	server->acregmin = data->acregmin * HZ;
+	server->acregmax = data->acregmax * HZ;
+	server->acdirmin = data->acdirmin * HZ;
+	server->acdirmax = data->acdirmax * HZ;
+
+	/* Start lockd here, before we might error out */
+	error = nfs_start_lockd(server);
+	if (error < 0)
+		goto error;
+
+	error = nfs_init_server_rpcclient(server, data->pseudoflavor);
+	if (error < 0)
+		goto error;
+
+	server->namelen  = data->namlen;
+	/* Create a client RPC handle for the NFSv3 ACL management interface */
+	nfs_init_server_aclclient(server);
+	if (clp->cl_nfsversion == 3) {
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
+	dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp);
+	return 0;
+
+error:
+	server->nfs_client = NULL;
+	nfs_put_client(clp);
+	dprintk("<-- nfs_init_server() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Load up the server record from information gained in an fsinfo record
+ */
+static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
+{
+	unsigned long max_rpc_payload;
+
+	/* Work out a lot of parameters */
+	if (server->rsize == 0)
+		server->rsize = nfs_block_size(fsinfo->rtpref, NULL);
+	if (server->wsize == 0)
+		server->wsize = nfs_block_size(fsinfo->wtpref, NULL);
+
+	if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax)
+		server->rsize = nfs_block_size(fsinfo->rtmax, NULL);
+	if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax)
+		server->wsize = nfs_block_size(fsinfo->wtmax, NULL);
+
+	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+	if (server->rsize > max_rpc_payload)
+		server->rsize = max_rpc_payload;
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
+	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
+	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
+
+	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
+	if (server->dtsize > PAGE_CACHE_SIZE)
+		server->dtsize = PAGE_CACHE_SIZE;
+	if (server->dtsize > server->rsize)
+		server->dtsize = server->rsize;
+
+	if (server->flags & NFS_MOUNT_NOAC) {
+		server->acregmin = server->acregmax = 0;
+		server->acdirmin = server->acdirmax = 0;
+	}
+
+	server->maxfilesize = fsinfo->maxfilesize;
+
+	/* We're airborne Set socket buffersize */
+	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+}
+
+/*
+ * Probe filesystem information, including the FSID on v2/v3
+ */
+static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr)
+{
+	struct nfs_fsinfo fsinfo;
+	struct nfs_client *clp = server->nfs_client;
+	int error;
+
+	dprintk("--> nfs_probe_fsinfo()\n");
+
+	if (clp->rpc_ops->set_capabilities != NULL) {
+		error = clp->rpc_ops->set_capabilities(server, mntfh);
+		if (error < 0)
+			goto out_error;
+	}
+
+	fsinfo.fattr = fattr;
+	nfs_fattr_init(fattr);
+	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
+	if (error < 0)
+		goto out_error;
+
+	nfs_server_set_fsinfo(server, &fsinfo);
+
+	/* Get some general file system info */
+	if (server->namelen == 0) {
+		struct nfs_pathconf pathinfo;
+
+		pathinfo.fattr = fattr;
+		nfs_fattr_init(fattr);
+
+		if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0)
+			server->namelen = pathinfo.max_namelen;
+	}
+
+	dprintk("<-- nfs_probe_fsinfo() = 0\n");
+	return 0;
+
+out_error:
+	dprintk("nfs_probe_fsinfo: error = %d\n", -error);
+	return error;
+}
+
+/*
+ * Copy useful information when duplicating a server record
+ */
+static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
+{
+	target->flags = source->flags;
+	target->acregmin = source->acregmin;
+	target->acregmax = source->acregmax;
+	target->acdirmin = source->acdirmin;
+	target->acdirmax = source->acdirmax;
+	target->caps = source->caps;
+}
+
+/*
+ * Allocate and initialise a server record
+ */
+static struct nfs_server *nfs_alloc_server(void)
+{
+	struct nfs_server *server;
+
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		return NULL;
+
+	server->client = server->client_acl = ERR_PTR(-EINVAL);
+
+	/* Zero out the NFS state stuff */
+	INIT_LIST_HEAD(&server->client_link);
+	INIT_LIST_HEAD(&server->master_link);
+
+	server->io_stats = nfs_alloc_iostats();
+	if (!server->io_stats) {
+		kfree(server);
+		return NULL;
+	}
+
+	return server;
+}
+
+/*
+ * Free up a server record
+ */
+void nfs_free_server(struct nfs_server *server)
+{
+	dprintk("--> nfs_free_server()\n");
+
+	spin_lock(&nfs_client_lock);
+	list_del(&server->client_link);
+	list_del(&server->master_link);
+	spin_unlock(&nfs_client_lock);
+
+	if (server->destroy != NULL)
+		server->destroy(server);
+	if (!IS_ERR(server->client))
+		rpc_shutdown_client(server->client);
+
+	nfs_put_client(server->nfs_client);
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server);
+	nfs_release_automount_timer();
+	dprintk("<-- nfs_free_server()\n");
+}
+
+/*
+ * Create a version 2 or 3 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs_create_server(const struct nfs_mount_data *data,
+				     struct nfs_fh *mntfh)
+{
+	struct nfs_server *server;
+	struct nfs_fattr fattr;
+	int error;
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	/* Get a client representation */
+	error = nfs_init_server(server, data);
+	if (error < 0)
+		goto error;
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	/* Probe the root fh to retrieve its FSID */
+	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	if (error < 0)
+		goto error;
+	if (!(fattr.valid & NFS_ATTR_FATTR)) {
+		error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+		if (error < 0) {
+			dprintk("nfs_create_server: getattr error = %d\n", -error);
+			goto error;
+		}
+	}
+	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+
+	dprintk("Server FSID: %llx:%llx\n", server->fsid.major, server->fsid.minor);
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	spin_lock(&nfs_client_lock);
+	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+	list_add_tail(&server->master_link, &nfs_volume_list);
+	spin_unlock(&nfs_client_lock);
+
+	server->mount_time = jiffies;
+	return server;
+
+error:
+	nfs_free_server(server);
+	return ERR_PTR(error);
+}
+
+#ifdef CONFIG_NFS_V4
+/*
+ * Initialise an NFS4 client record
+ */
+static int nfs4_init_client(struct nfs_client *clp,
+		int proto, int timeo, int retrans,
+		rpc_authflavor_t authflavour)
+{
+	int error;
+
+	if (clp->cl_cons_state == NFS_CS_READY) {
+		/* the client is initialised already */
+		dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp);
+		return 0;
+	}
+
+	/* Check NFS protocol revision and initialize RPC op vector */
+	clp->rpc_ops = &nfs_v4_clientops;
+
+	error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
+	if (error < 0)
+		goto error;
+
+	error = nfs_idmap_new(clp);
+	if (error < 0) {
+		dprintk("%s: failed to create idmapper. Error = %d\n",
+			__FUNCTION__, error);
+		__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
+		goto error;
+	}
+
+	nfs_mark_client_ready(clp, NFS_CS_READY);
+	return 0;
+
+error:
+	nfs_mark_client_ready(clp, error);
+	dprintk("<-- nfs4_init_client() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Set up an NFS4 client
+ */
+static int nfs4_set_client(struct nfs_server *server,
+		const char *hostname, const struct sockaddr_in *addr,
+		rpc_authflavor_t authflavour,
+		int proto, int timeo, int retrans)
+{
+	struct nfs_client *clp;
+	int error;
+
+	dprintk("--> nfs4_set_client()\n");
+
+	/* Allocate or find a client reference we can use */
+	clp = nfs_get_client(hostname, addr, 4);
+	if (IS_ERR(clp)) {
+		error = PTR_ERR(clp);
+		goto error;
+	}
+	error = nfs4_init_client(clp, proto, timeo, retrans, authflavour);
+	if (error < 0)
+		goto error_put;
+
+	server->nfs_client = clp;
+	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
+	return 0;
+
+error_put:
+	nfs_put_client(clp);
+error:
+	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
+	return error;
+}
+
+/*
+ * Create a version 4 volume record
+ */
+static int nfs4_init_server(struct nfs_server *server,
+		const struct nfs4_mount_data *data, rpc_authflavor_t authflavour)
+{
+	int error;
+
+	dprintk("--> nfs4_init_server()\n");
+
+	/* Initialise the client representation from the mount data */
+	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->caps |= NFS_CAP_ATOMIC_OPEN;
+
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+
+	server->acregmin = data->acregmin * HZ;
+	server->acregmax = data->acregmax * HZ;
+	server->acdirmin = data->acdirmin * HZ;
+	server->acdirmax = data->acdirmax * HZ;
+
+	error = nfs_init_server_rpcclient(server, authflavour);
+
+	/* Done */
+	dprintk("<-- nfs4_init_server() = %d\n", error);
+	return error;
+}
+
+/*
+ * Create a version 4 volume record
+ * - keyed on server and FSID
+ */
+struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
+				      const char *hostname,
+				      const struct sockaddr_in *addr,
+				      const char *mntpath,
+				      const char *ip_addr,
+				      rpc_authflavor_t authflavour,
+				      struct nfs_fh *mntfh)
+{
+	struct nfs_fattr fattr;
+	struct nfs_server *server;
+	int error;
+
+	dprintk("--> nfs4_create_server()\n");
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	/* Get a client record */
+	error = nfs4_set_client(server, hostname, addr, authflavour,
+			data->proto, data->timeo, data->retrans);
+	if (error < 0)
+		goto error;
+
+	/* set up the general RPC client */
+	error = nfs4_init_server(server, data, authflavour);
+	if (error < 0)
+		goto error;
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	/* Probe the root fh to retrieve its FSID */
+	error = nfs4_path_walk(server, mntfh, mntpath);
+	if (error < 0)
+		goto error;
+
+	dprintk("Server FSID: %llx:%llx\n", server->fsid.major, server->fsid.minor);
+	dprintk("Mount FH: %d\n", mntfh->size);
+
+	error = nfs_probe_fsinfo(server, mntfh, &fattr);
+	if (error < 0)
+		goto error;
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	spin_lock(&nfs_client_lock);
+	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+	list_add_tail(&server->master_link, &nfs_volume_list);
+	spin_unlock(&nfs_client_lock);
+
+	server->mount_time = jiffies;
+	dprintk("<-- nfs4_create_server() = %p\n", server);
+	return server;
+
+error:
+	nfs_free_server(server);
+	dprintk("<-- nfs4_create_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
+
+/*
+ * Create an NFS4 referral server record
+ */
+struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
+					       struct nfs_fh *fh)
+{
+	struct nfs_client *parent_client;
+	struct nfs_server *server, *parent_server;
+	struct nfs_fattr fattr;
+	int error;
+
+	dprintk("--> nfs4_create_referral_server()\n");
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	parent_server = NFS_SB(data->sb);
+	parent_client = parent_server->nfs_client;
+
+	/* Get a client representation.
+	 * Note: NFSv4 always uses TCP, */
+	error = nfs4_set_client(server, data->hostname, data->addr,
+			data->authflavor,
+			parent_server->client->cl_xprt->prot,
+			parent_client->retrans_timeo,
+			parent_client->retrans_count);
+
+	/* Initialise the client representation from the parent server */
+	nfs_server_copy_userdata(server, parent_server);
+	server->caps |= NFS_CAP_ATOMIC_OPEN;
+
+	error = nfs_init_server_rpcclient(server, data->authflavor);
+	if (error < 0)
+		goto error;
+
+	BUG_ON(!server->nfs_client);
+	BUG_ON(!server->nfs_client->rpc_ops);
+	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
+
+	/* probe the filesystem info for this server filesystem */
+	error = nfs_probe_fsinfo(server, fh, &fattr);
+	if (error < 0)
+		goto error;
+
+	dprintk("Referral FSID: %llx:%llx\n",
+		server->fsid.major, server->fsid.minor);
+
+	spin_lock(&nfs_client_lock);
+	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+	list_add_tail(&server->master_link, &nfs_volume_list);
+	spin_unlock(&nfs_client_lock);
+
+	server->mount_time = jiffies;
+
+	dprintk("<-- nfs_create_referral_server() = %p\n", server);
+	return server;
+
+error:
+	nfs_free_server(server);
+	dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
+
+#endif /* CONFIG_NFS_V4 */
+
+/*
+ * Clone an NFS2, NFS3 or NFS4 server record
+ */
+struct nfs_server *nfs_clone_server(struct nfs_server *source,
+				    struct nfs_fh *fh,
+				    struct nfs_fattr *fattr)
+{
+	struct nfs_server *server;
+	struct nfs_fattr fattr_fsinfo;
+	int error;
+
+	dprintk("--> nfs_clone_server(,%llx:%llx,)\n",
+		fattr->fsid.major, fattr->fsid.minor);
+
+	server = nfs_alloc_server();
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+
+	/* Copy data from the source */
+	server->nfs_client = source->nfs_client;
+	atomic_inc(&server->nfs_client->cl_count);
+	nfs_server_copy_userdata(server, source);
+
+	server->fsid = fattr->fsid;
+
+	error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor);
+	if (error < 0)
+		goto out_free_server;
+	if (!IS_ERR(source->client_acl))
+		nfs_init_server_aclclient(server);
+
+	/* probe the filesystem info for this server filesystem */
+	error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo);
+	if (error < 0)
+		goto out_free_server;
+
+	dprintk("Cloned FSID: %llx:%llx\n",
+		server->fsid.major, server->fsid.minor);
+
+	error = nfs_start_lockd(server);
+	if (error < 0)
+		goto out_free_server;
+
+	spin_lock(&nfs_client_lock);
+	list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
+	list_add_tail(&server->master_link, &nfs_volume_list);
+	spin_unlock(&nfs_client_lock);
+
+	server->mount_time = jiffies;
+
+	dprintk("<-- nfs_clone_server() = %p\n", server);
+	return server;
+
+out_free_server:
+	nfs_free_server(server);
+	dprintk("<-- nfs_clone_server() = error %d\n", error);
+	return ERR_PTR(error);
+}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 19362712452..9b496ef4abe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -31,6 +31,7 @@
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/namei.h>
+#include <linux/mount.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -870,14 +871,14 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
-static inline int nfs_reval_fsid(struct inode *dir,
-		struct nfs_fh *fh, struct nfs_fattr *fattr)
+static inline int nfs_reval_fsid(struct vfsmount *mnt, struct inode *dir,
+				 struct nfs_fh *fh, struct nfs_fattr *fattr)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 
 	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
 		/* Revalidate fsid on root dir */
-		return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+		return __nfs_revalidate_inode(server, mnt->mnt_root->d_inode);
 	return 0;
 }
 
@@ -913,7 +914,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
-	error = nfs_reval_fsid(dir, &fhandle, &fattr);
+	error = nfs_reval_fsid(nd->mnt, dir, &fhandle, &fattr);
 	if (error < 0) {
 		res = ERR_PTR(error);
 		goto out_unlock;
@@ -922,8 +923,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
 		goto out_unlock;
+
 no_entry:
-	res = d_add_unique(dentry, inode);
+	res = d_materialise_unique(dentry, inode);
 	if (res != NULL)
 		dentry = res;
 	nfs_renew_times(dentry);
@@ -1117,11 +1119,13 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 		dput(dentry);
 		return NULL;
 	}
-	alias = d_add_unique(dentry, inode);
+
+	alias = d_materialise_unique(dentry, inode);
 	if (alias != NULL) {
 		dput(dentry);
 		dentry = alias;
 	}
+
 	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 	return dentry;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
new file mode 100644
index 00000000000..977e59088ee
--- /dev/null
+++ b/fs/nfs/getroot.c
@@ -0,0 +1,306 @@
+/* getroot.c: get the root dentry for an NFS mount
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/namei.h>
+#include <linux/namespace.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "delegation.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_CLIENT
+#define NFS_PARANOIA 1
+
+/*
+ * get an NFS2/NFS3 root dentry from the root filehandle
+ */
+struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_fsinfo fsinfo;
+	struct nfs_fattr fattr;
+	struct dentry *mntroot;
+	struct inode *inode;
+	int error;
+
+	/* create a dummy root dentry with dummy inode for this superblock */
+	if (!sb->s_root) {
+		struct nfs_fh dummyfh;
+		struct dentry *root;
+		struct inode *iroot;
+
+		memset(&dummyfh, 0, sizeof(dummyfh));
+		memset(&fattr, 0, sizeof(fattr));
+		nfs_fattr_init(&fattr);
+		fattr.valid = NFS_ATTR_FATTR;
+		fattr.type = NFDIR;
+		fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
+		fattr.nlink = 2;
+
+		iroot = nfs_fhget(sb, &dummyfh, &fattr);
+		if (IS_ERR(iroot))
+			return ERR_PTR(PTR_ERR(iroot));
+
+		root = d_alloc_root(iroot);
+		if (!root) {
+			iput(iroot);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		sb->s_root = root;
+	}
+
+	/* get the actual root for this mount */
+	fsinfo.fattr = &fattr;
+
+	error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+	if (error < 0) {
+		dprintk("nfs_get_root: getattr error = %d\n", -error);
+		return ERR_PTR(error);
+	}
+
+	inode = nfs_fhget(sb, mntfh, fsinfo.fattr);
+	if (IS_ERR(inode)) {
+		dprintk("nfs_get_root: get root inode failed\n");
+		return ERR_PTR(PTR_ERR(inode));
+	}
+
+	/* root dentries normally start off anonymous and get spliced in later
+	 * if the dentry tree reaches them; however if the dentry already
+	 * exists, we'll pick it up at this point and use it as the root
+	 */
+	mntroot = d_alloc_anon(inode);
+	if (!mntroot) {
+		iput(inode);
+		dprintk("nfs_get_root: get root dentry failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (!mntroot->d_op)
+		mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+
+	return mntroot;
+}
+
+#ifdef CONFIG_NFS_V4
+
+/*
+ * Do a simple pathwalk from the root FH of the server to the nominated target
+ * of the mountpoint
+ * - give error on symlinks
+ * - give error on ".." occurring in the path
+ * - follow traversals
+ */
+int nfs4_path_walk(struct nfs_server *server,
+		   struct nfs_fh *mntfh,
+		   const char *path)
+{
+	struct nfs_fsinfo fsinfo;
+	struct nfs_fattr fattr;
+	struct nfs_fh lastfh;
+	struct qstr name;
+	int ret;
+	//int referral_count = 0;
+
+	dprintk("--> nfs4_path_walk(,,%s)\n", path);
+
+	fsinfo.fattr = &fattr;
+	nfs_fattr_init(&fattr);
+
+	if (*path++ != '/') {
+		dprintk("nfs4_get_root: Path does not begin with a slash\n");
+		return -EINVAL;
+	}
+
+	/* Start by getting the root filehandle from the server */
+	ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
+	if (ret < 0) {
+		dprintk("nfs4_get_root: getroot error = %d\n", -ret);
+		return ret;
+	}
+
+	if (fattr.type != NFDIR) {
+		printk(KERN_ERR "nfs4_get_root:"
+		       " getroot encountered non-directory\n");
+		return -ENOTDIR;
+	}
+
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+		printk(KERN_ERR "nfs4_get_root:"
+		       " getroot obtained referral\n");
+		return -EREMOTE;
+	}
+
+next_component:
+	dprintk("Next: %s\n", path);
+
+	/* extract the next bit of the path */
+	if (!*path)
+		goto path_walk_complete;
+
+	name.name = path;
+	while (*path && *path != '/')
+		path++;
+	name.len = path - (const char *) name.name;
+
+eat_dot_dir:
+	while (*path == '/')
+		path++;
+
+	if (path[0] == '.' && (path[1] == '/' || !path[1])) {
+		path += 2;
+		goto eat_dot_dir;
+	}
+
+	if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2])
+	    ) {
+		printk(KERN_ERR "nfs4_get_root:"
+		       " Mount path contains reference to \"..\"\n");
+		return -EINVAL;
+	}
+
+	/* lookup the next FH in the sequence */
+	memcpy(&lastfh, mntfh, sizeof(lastfh));
+
+	dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path);
+
+	ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name,
+						    mntfh, &fattr);
+	if (ret < 0) {
+		dprintk("nfs4_get_root: getroot error = %d\n", -ret);
+		return ret;
+	}
+
+	if (fattr.type != NFDIR) {
+		printk(KERN_ERR "nfs4_get_root:"
+		       " lookupfh encountered non-directory\n");
+		return -ENOTDIR;
+	}
+
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+		printk(KERN_ERR "nfs4_get_root:"
+		       " lookupfh obtained referral\n");
+		return -EREMOTE;
+	}
+
+	goto next_component;
+
+path_walk_complete:
+	memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid));
+	dprintk("<-- nfs4_path_walk() = 0\n");
+	return 0;
+}
+
+/*
+ * get an NFS4 root dentry from the root filehandle
+ */
+struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_fattr fattr;
+	struct dentry *mntroot;
+	struct inode *inode;
+	int error;
+
+	dprintk("--> nfs4_get_root()\n");
+
+	/* create a dummy root dentry with dummy inode for this superblock */
+	if (!sb->s_root) {
+		struct nfs_fh dummyfh;
+		struct dentry *root;
+		struct inode *iroot;
+
+		memset(&dummyfh, 0, sizeof(dummyfh));
+		memset(&fattr, 0, sizeof(fattr));
+		nfs_fattr_init(&fattr);
+		fattr.valid = NFS_ATTR_FATTR;
+		fattr.type = NFDIR;
+		fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR;
+		fattr.nlink = 2;
+
+		iroot = nfs_fhget(sb, &dummyfh, &fattr);
+		if (IS_ERR(iroot))
+			return ERR_PTR(PTR_ERR(iroot));
+
+		root = d_alloc_root(iroot);
+		if (!root) {
+			iput(iroot);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		sb->s_root = root;
+	}
+
+	/* get the info about the server and filesystem */
+	error = nfs4_server_capabilities(server, mntfh);
+	if (error < 0) {
+		dprintk("nfs_get_root: getcaps error = %d\n",
+			-error);
+		return ERR_PTR(error);
+	}
+
+	/* get the actual root for this mount */
+	error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr);
+	if (error < 0) {
+		dprintk("nfs_get_root: getattr error = %d\n", -error);
+		return ERR_PTR(error);
+	}
+
+	inode = nfs_fhget(sb, mntfh, &fattr);
+	if (IS_ERR(inode)) {
+		dprintk("nfs_get_root: get root inode failed\n");
+		return ERR_PTR(PTR_ERR(inode));
+	}
+
+	/* root dentries normally start off anonymous and get spliced in later
+	 * if the dentry tree reaches them; however if the dentry already
+	 * exists, we'll pick it up at this point and use it as the root
+	 */
+	mntroot = d_alloc_anon(inode);
+	if (!mntroot) {
+		iput(inode);
+		dprintk("nfs_get_root: get root dentry failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (!mntroot->d_op)
+		mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops;
+
+	dprintk("<-- nfs4_get_root()\n");
+	return mntroot;
+}
+
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 231c20ffc0f..f96dfac7dc9 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -114,8 +114,7 @@ nfs_idmap_new(struct nfs_client *clp)
 	struct idmap *idmap;
 	int error;
 
-	if (clp->cl_idmap != NULL)
-		return 0;
+	BUG_ON(clp->cl_idmap != NULL);
 
         if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
                 return -ENOMEM;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 771c3b83375..a547c58a83e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1020,7 +1020,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  out_fileid:
 	printk(KERN_ERR "NFS: server %s error: fileid changed\n"
 		"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
-		NFS_SERVER(inode)->hostname, inode->i_sb->s_id,
+		NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id,
 		(long long)nfsi->fileid, (long long)fattr->fileid);
 	goto out_err;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2f3aa52fbef..e73ba4f1052 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -4,6 +4,18 @@
 
 #include <linux/mount.h>
 
+struct nfs_string;
+struct nfs_mount_data;
+struct nfs4_mount_data;
+
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ *        their needs. People that do NFS over a slow network, might for
+ *        instance want to reduce it to something closer to 1 for improved
+ *        interactive response.
+ */
+#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
+
 struct nfs_clone_mount {
 	const struct super_block *sb;
 	const struct dentry *dentry;
@@ -16,12 +28,25 @@ struct nfs_clone_mount {
 };
 
 /* client.c */
+extern struct rpc_program nfs_program;
+
 extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int);
-extern struct nfs_client *nfs_get_client(const char *, const struct sockaddr_in *, int);
-extern void nfs_mark_client_ready(struct nfs_client *, int);
-extern int nfs_create_rpc_client(struct nfs_client *, int, unsigned int,
-				 unsigned int, rpc_authflavor_t);
+extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *,
+					    struct nfs_fh *);
+extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *,
+					     const char *,
+					     const struct sockaddr_in *,
+					     const char *,
+					     const char *,
+					     rpc_authflavor_t,
+					     struct nfs_fh *);
+extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
+						      struct nfs_fh *);
+extern void nfs_free_server(struct nfs_server *server);
+extern struct nfs_server *nfs_clone_server(struct nfs_server *,
+					   struct nfs_fh *,
+					   struct nfs_fattr *);
 
 /* nfs4namespace.c */
 #ifdef CONFIG_NFS_V4
@@ -89,10 +114,10 @@ extern void nfs4_clear_inode(struct inode *);
 #endif
 
 /* super.c */
-extern struct file_system_type nfs_referral_nfs4_fs_type;
-extern struct file_system_type clone_nfs_fs_type;
+extern struct file_system_type nfs_xdev_fs_type;
 #ifdef CONFIG_NFS_V4
-extern struct file_system_type clone_nfs4_fs_type;
+extern struct file_system_type nfs4_xdev_fs_type;
+extern struct file_system_type nfs4_referral_fs_type;
 #endif
 
 extern struct rpc_stat nfs_rpcstat;
@@ -101,28 +126,30 @@ extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 
 /* namespace.c */
-extern char *nfs_path(const char *base, const struct dentry *dentry,
+extern char *nfs_path(const char *base,
+		      const struct dentry *droot,
+		      const struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
 
-/*
- * Determine the mount path as a string
- */
+/* getroot.c */
+extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
 #ifdef CONFIG_NFS_V4
-static inline char *
-nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
-{
-	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
-}
+extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
+
+extern int nfs4_path_walk(struct nfs_server *server,
+			  struct nfs_fh *mntfh,
+			  const char *path);
 #endif
 
 /*
  * Determine the device name as a string
  */
 static inline char *nfs_devname(const struct vfsmount *mnt_parent,
-			 const struct dentry *dentry,
-			 char *buffer, ssize_t buflen)
+				const struct dentry *dentry,
+				char *buffer, ssize_t buflen)
 {
-	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+	return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root,
+			dentry, buffer, buflen);
 }
 
 /*
@@ -178,20 +205,3 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
 	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
 		sb->s_maxbytes = MAX_LFS_FILESIZE;
 }
-
-/*
- * Check if the string represents a "valid" IPv4 address
- */
-static inline int valid_ipaddr4(const char *buf)
-{
-	int rc, count, in[4];
-
-	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
-	if (rc != 4)
-		return -EINVAL;
-	for (count = 0; count < 4; count++) {
-		if (in[count] > 255)
-			return -EINVAL;
-	}
-	return 0;
-}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index d8b8d56266c..77b00684894 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -2,6 +2,7 @@
  * linux/fs/nfs/namespace.c
  *
  * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
  *
  * NFS namespace
  */
@@ -28,6 +29,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
 /*
  * nfs_path - reconstruct the path given an arbitrary dentry
  * @base - arbitrary string to prepend to the path
+ * @droot - pointer to root dentry for mountpoint
  * @dentry - pointer to dentry
  * @buffer - result buffer
  * @buflen - length of buffer
@@ -38,7 +40,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
  * This is mainly for use in figuring out the path on the
  * server side when automounting on top of an existing partition.
  */
-char *nfs_path(const char *base, const struct dentry *dentry,
+char *nfs_path(const char *base,
+	       const struct dentry *droot,
+	       const struct dentry *dentry,
 	       char *buffer, ssize_t buflen)
 {
 	char *end = buffer+buflen;
@@ -47,7 +51,7 @@ char *nfs_path(const char *base, const struct dentry *dentry,
 	*--end = '\0';
 	buflen--;
 	spin_lock(&dcache_lock);
-	while (!IS_ROOT(dentry)) {
+	while (!IS_ROOT(dentry) && dentry != droot) {
 		namelen = dentry->d_name.len;
 		buflen -= namelen + 1;
 		if (buflen < 0)
@@ -96,12 +100,13 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	struct nfs_fattr fattr;
 	int err;
 
+	dprintk("--> nfs_follow_mountpoint()\n");
+
 	BUG_ON(IS_ROOT(dentry));
 	dprintk("%s: enter\n", __FUNCTION__);
 	dput(nd->dentry);
 	nd->dentry = dget(dentry);
-	if (d_mountpoint(nd->dentry))
-		goto out_follow;
+
 	/* Look it up again */
 	parent = dget_parent(nd->dentry);
 	err = server->nfs_client->rpc_ops->lookup(parent->d_inode,
@@ -134,6 +139,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
 	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+
+	dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
 	return ERR_PTR(err);
 out_err:
 	path_release(nd);
@@ -183,14 +190,14 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 	switch (server->nfs_client->cl_nfsversion) {
 		case 2:
 		case 3:
-			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+			mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
 			break;
 		case 4:
-			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
+			mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata);
 	}
 	return mnt;
 #else
-	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
 #endif
 }
 
@@ -216,6 +223,8 @@ struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 	char *page = (char *) __get_free_page(GFP_USER);
 	char *devname;
 
+	dprintk("--> nfs_do_submount()\n");
+
 	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
 			dentry->d_parent->d_name.name,
 			dentry->d_name.name);
@@ -230,5 +239,7 @@ free_page:
 	free_page((unsigned long)page);
 out:
 	dprintk("%s: done\n", __FUNCTION__);
+
+	dprintk("<-- nfs_do_submount() = %p\n", mnt);
 	return mnt;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0622af0122b..9e8258ece6f 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -81,7 +81,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
 }
 
 /*
- * Bare-bones access to getattr: this is for nfs_read_super.
+ * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb
  */
 static int
 nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e7879245361..61095fe4b5c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -188,8 +188,6 @@ extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(void *);
 
 /* nfs4state.c */
-extern void init_nfsv4_state(struct nfs_server *);
-extern void destroy_nfsv4_state(struct nfs_server *);
 struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
 extern u32 nfs4_alloc_lockowner_id(struct nfs_client *);
 
@@ -224,10 +222,6 @@ extern struct svc_version nfs4_callback_version1;
 
 #else
 
-#define init_nfsv4_state(server)  do { } while (0)
-#define destroy_nfsv4_state(server)       do { } while (0)
-#define nfs4_put_state_owner(inode, owner) do { } while (0)
-#define nfs4_put_open_state(state) do { } while (0)
 #define nfs4_close_state(a, b) do { } while (0)
 
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index faed9bcba50..24e47f3bbd1 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -2,6 +2,7 @@
  * linux/fs/nfs/nfs4namespace.c
  *
  * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ * - Modified by David Howells <dhowells@redhat.com>
  *
  * NFSv4 namespace
  */
@@ -47,6 +48,68 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
+/*
+ * Determine the mount path as a string
+ */
+static char *nfs4_path(const struct vfsmount *mnt_parent,
+		       const struct dentry *dentry,
+		       char *buffer, ssize_t buflen)
+{
+	const char *srvpath;
+
+	srvpath = strchr(mnt_parent->mnt_devname, ':');
+	if (srvpath)
+		srvpath++;
+	else
+		srvpath = mnt_parent->mnt_devname;
+
+	return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
+}
+
+/*
+ * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
+ * believe to be the server path to this dentry
+ */
+static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
+				const struct dentry *dentry,
+				const struct nfs4_fs_locations *locations,
+				char *page, char *page2)
+{
+	const char *path, *fs_path;
+
+	path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		return PTR_ERR(fs_path);
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n",
+			__FUNCTION__, path, fs_path);
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+/*
+ * Check if the string represents a "valid" IPv4 address
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] > 255)
+			return -EINVAL;
+	}
+	return 0;
+}
 
 /**
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
@@ -68,10 +131,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		.dentry = dentry,
 		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
 	};
-	char *page, *page2;
-	char *path, *fs_path;
+	char *page = NULL, *page2 = NULL;
 	char *devname;
-	int loc, s;
+	int loc, s, error;
 
 	if (locations == NULL || locations->nlocations <= 0)
 		goto out;
@@ -79,31 +141,25 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	/* Ensure fs path is a prefix of current dentry path */
 	page = (char *) __get_free_page(GFP_USER);
-	if (page == NULL)
+	if (!page)
 		goto out;
+
 	page2 = (char *) __get_free_page(GFP_USER);
-	if (page2 == NULL)
+	if (!page2)
 		goto out;
 
-	path = nfs4_path(dentry, page, PAGE_SIZE);
-	if (IS_ERR(path))
-		goto out_free;
-
-	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
-	if (IS_ERR(fs_path))
-		goto out_free;
-
-	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
-		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
-		goto out_free;
+	/* Ensure fs path is a prefix of current dentry path */
+	error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2);
+	if (error < 0) {
+		mnt = ERR_PTR(error);
+		goto out;
 	}
 
 	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
 	if (IS_ERR(devname)) {
 		mnt = (struct vfsmount *)devname;
-		goto out_free;
+		goto out;
 	}
 
 	loc = 0;
@@ -140,7 +196,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 			addr.sin_port = htons(NFS_PORT);
 			mountdata.addr = &addr;
 
-			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, devname, &mountdata);
 			if (!IS_ERR(mnt)) {
 				break;
 			}
@@ -149,10 +205,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		loc++;
 	}
 
-out_free:
-	free_page((unsigned long)page);
-	free_page((unsigned long)page2);
 out:
+	free_page((unsigned long) page);
+	free_page((unsigned long) page2);
 	dprintk("%s: done\n", __FUNCTION__);
 	return mnt;
 }
@@ -165,7 +220,7 @@ out:
  */
 struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
 {
-	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
 	struct dentry *parent;
 	struct nfs4_fs_locations *fs_locations = NULL;
 	struct page *page;
@@ -183,11 +238,16 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
 		goto out_free;
 
 	/* Get locations */
+	mnt = ERR_PTR(-ENOENT);
+
 	parent = dget_parent(dentry);
-	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	dprintk("%s: getting locations for %s/%s\n",
+		__FUNCTION__, parent->d_name.name, dentry->d_name.name);
+
 	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
 	dput(parent);
-	if (err != 0 || fs_locations->nlocations <= 0 ||
+	if (err != 0 ||
+	    fs_locations->nlocations <= 0 ||
 	    fs_locations->fs_path.ncomponents <= 0)
 		goto out_free;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1573eeb07ce..a825547e821 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1393,70 +1393,19 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
+/*
+ * get the file handle for the "/" directory on the server
+ */
 static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
-		struct nfs_fsinfo *info)
+			      struct nfs_fsinfo *info)
 {
-	struct nfs_fattr *	fattr = info->fattr;
-	unsigned char *		p;
-	struct qstr		q;
-	struct nfs4_lookup_arg args = {
-		.dir_fh = fhandle,
-		.name = &q,
-		.bitmask = nfs4_fattr_bitmap,
-	};
-	struct nfs4_lookup_res res = {
-		.server = server,
-		.fattr = fattr,
-		.fh = fhandle,
-	};
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-	};
 	int status;
 
-	/*
-	 * Now we do a separate LOOKUP for each component of the mount path.
-	 * The LOOKUPs are done separately so that we can conveniently
-	 * catch an ERR_WRONGSEC if it occurs along the way...
-	 */
 	status = nfs4_lookup_root(server, fhandle, info);
-	if (status)
-		goto out;
-
-	p = server->mnt_path;
-	for (;;) {
-		struct nfs4_exception exception = { };
-
-		while (*p == '/')
-			p++;
-		if (!*p)
-			break;
-		q.name = p;
-		while (*p && (*p != '/'))
-			p++;
-		q.len = p - q.name;
-
-		do {
-			nfs_fattr_init(fattr);
-			status = nfs4_handle_exception(server,
-					rpc_call_sync(server->client, &msg, 0),
-					&exception);
-		} while (exception.retry);
-		if (status == 0)
-			continue;
-		if (status == -ENOENT) {
-			printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path);
-			printk(KERN_NOTICE "NFS: suggestion: try mounting '/' instead.\n");
-		}
-		break;
-	}
 	if (status == 0)
 		status = nfs4_server_capabilities(server, fhandle);
 	if (status == 0)
 		status = nfs4_do_fsinfo(server, fhandle, info);
-out:
 	return nfs4_map_errors(status);
 }
 
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index ff947ecb8b8..f2c893690ac 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,26 +127,13 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
 void
 nfs4_renewd_prepare_shutdown(struct nfs_server *server)
 {
-	struct nfs_client *clp = server->nfs_client;
-
-	if (!clp)
-		return;
 	flush_scheduled_work();
-	down_write(&clp->cl_sem);
-	if (!list_empty(&server->nfs4_siblings))
-		list_del_init(&server->nfs4_siblings);
-	up_write(&clp->cl_sem);
 }
 
-/* Must be called with clp->cl_sem locked for writes */
 void
 nfs4_kill_renewd(struct nfs_client *clp)
 {
 	down_read(&clp->cl_sem);
-	if (!list_empty(&clp->cl_superblocks)) {
-		up_read(&clp->cl_sem);
-		return;
-	}
 	cancel_delayed_work(&clp->cl_renewd);
 	up_read(&clp->cl_sem);
 	flush_scheduled_work();
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 058811e3955..5fffbdfa971 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -58,24 +58,6 @@ const nfs4_stateid zero_stateid;
 
 static LIST_HEAD(nfs4_clientid_list);
 
-void
-init_nfsv4_state(struct nfs_server *server)
-{
-	server->nfs_client = NULL;
-	INIT_LIST_HEAD(&server->nfs4_siblings);
-}
-
-void
-destroy_nfsv4_state(struct nfs_server *server)
-{
-	kfree(server->mnt_path);
-	server->mnt_path = NULL;
-	if (server->nfs_client) {
-		nfs_put_client(server->nfs_client);
-		server->nfs_client = NULL;
-	}
-}
-
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f0aff824a29..dae33c1e8a7 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -171,7 +171,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 		rdata->args.offset = page_offset(page) + rdata->args.pgbase;
 
 		dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n",
-			NFS_SERVER(inode)->hostname,
+			NFS_SERVER(inode)->nfs_client->cl_hostname,
 			inode->i_sb->s_id,
 			(long long)NFS_FILEID(inode),
 			(unsigned long long)rdata->args.pgbase,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5842d510d73..867b5dcd3a4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -13,6 +13,11 @@
  *
  *  Split from inode.c by David Howells <dhowells@redhat.com>
  *
+ * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a
+ *   particular server are held in the same superblock
+ * - NFS superblocks can have several effective roots to the dentry tree
+ * - directory type roots are spliced into the tree when a path from one root reaches the root
+ *   of another (see nfs_lookup())
  */
 
 #include <linux/config.h>
@@ -52,20 +57,12 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static void nfs_umount_begin(struct vfsmount *, int);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
-static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
+static int nfs_xdev_get_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
 
@@ -77,10 +74,10 @@ static struct file_system_type nfs_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
-struct file_system_type clone_nfs_fs_type = {
+struct file_system_type nfs_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
-	.get_sb		= nfs_clone_nfs_sb,
+	.get_sb		= nfs_xdev_get_sb,
 	.kill_sb	= nfs_kill_super,
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -99,10 +96,10 @@ static struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
-static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_xdev_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static int nfs4_referral_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs4_kill_super(struct super_block *sb);
 
 static struct file_system_type nfs4_fs_type = {
@@ -113,18 +110,18 @@ static struct file_system_type nfs4_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
-struct file_system_type clone_nfs4_fs_type = {
+struct file_system_type nfs4_xdev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs_clone_nfs4_sb,
+	.get_sb		= nfs4_xdev_get_sb,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
-struct file_system_type nfs_referral_nfs4_fs_type = {
+struct file_system_type nfs4_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs_referral_nfs4_sb,
+	.get_sb		= nfs4_referral_get_sb,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -345,7 +342,7 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	nfs_show_mount_options(m, nfss, 0);
 
 	seq_puts(m, ",addr=");
-	seq_escape(m, nfss->hostname, " \t\n\\");
+	seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\");
 
 	return 0;
 }
@@ -429,714 +426,351 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 
 /*
  * Begin unmount by attempting to remove all automounted mountpoints we added
- * in response to traversals
+ * in response to xdev traversals and referrals
  */
 static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct nfs_server *server;
-	struct rpc_clnt	*rpc;
-
 	shrink_submounts(vfsmnt, &nfs_automount_list);
-	if (!(flags & MNT_FORCE))
-		return;
-	/* -EIO all pending I/O */
-	server = NFS_SB(vfsmnt->mnt_sb);
-	rpc = server->client;
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
-	rpc = server->client_acl;
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
 }
 
 /*
- * Obtain the root inode of the file system.
+ * Validate the NFS2/NFS3 mount data
+ * - fills in the mount root filehandle
  */
-static struct inode *
-nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
+static int nfs_validate_mount_data(struct nfs_mount_data *data,
+				   struct nfs_fh *mntfh)
 {
-	struct nfs_server	*server = NFS_SB(sb);
-	int			error;
-
-	error = server->nfs_client->rpc_ops->getroot(server, rootfh, fsinfo);
-	if (error < 0) {
-		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		return ERR_PTR(error);
+	if (data == NULL) {
+		dprintk("%s: missing data argument\n", __FUNCTION__);
+		return -EINVAL;
 	}
 
-	server->fsid = fsinfo->fattr->fsid;
-	return nfs_fhget(sb, rootfh, fsinfo->fattr);
-}
-
-/*
- * Do NFS version-independent mount processing, and sanity checking
- */
-static int
-nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
-{
-	struct nfs_server	*server;
-	struct inode		*root_inode;
-	struct nfs_fattr	fattr;
-	struct nfs_fsinfo	fsinfo = {
-					.fattr = &fattr,
-				};
-	struct nfs_pathconf pathinfo = {
-			.fattr = &fattr,
-	};
-	int no_root_error = 0;
-	unsigned long max_rpc_payload;
-
-	/* We probably want something more informative here */
-	snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
-
-	server = NFS_SB(sb);
-
-	sb->s_magic      = NFS_SUPER_MAGIC;
-
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
-		return -ENOMEM;
+	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
+		dprintk("%s: bad mount version\n", __FUNCTION__);
+		return -EINVAL;
+	}
 
-	root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
-	/* Did getting the root inode fail? */
-	if (IS_ERR(root_inode)) {
-		no_root_error = PTR_ERR(root_inode);
-		goto out_no_root;
+	switch (data->version) {
+		case 1:
+			data->namlen = 0;
+		case 2:
+			data->bsize  = 0;
+		case 3:
+			if (data->flags & NFS_MOUNT_VER3) {
+				dprintk("%s: mount structure version %d does not support NFSv3\n",
+						__FUNCTION__,
+						data->version);
+				return -EINVAL;
+			}
+			data->root.size = NFS2_FHSIZE;
+			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+		case 4:
+			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
+				dprintk("%s: mount structure version %d does not support strong security\n",
+						__FUNCTION__,
+						data->version);
+				return -EINVAL;
+			}
+			/* Fill in pseudoflavor for mount version < 5 */
+			data->pseudoflavor = RPC_AUTH_UNIX;
+		case 5:
+			memset(data->context, 0, sizeof(data->context));
 	}
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root) {
-		no_root_error = -ENOMEM;
-		goto out_no_root;
+
+#ifndef CONFIG_NFS_V3
+	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
+	if (data->flags & NFS_MOUNT_VER3) {
+		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
+		return -EPROTONOSUPPORT;
 	}
-	sb->s_root->d_op = server->nfs_client->rpc_ops->dentry_ops;
-
-	/* mount time stamp, in seconds */
-	server->mount_time = jiffies;
-
-	/* Get some general file system info */
-	if (server->namelen == 0 &&
-	    server->nfs_client->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
-		server->namelen = pathinfo.max_namelen;
-	/* Work out a lot of parameters */
-	if (server->rsize == 0)
-		server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
-	if (server->wsize == 0)
-		server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
-
-	if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
-		server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
-	if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
-		server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
-
-	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
-	if (server->rsize > max_rpc_payload)
-		server->rsize = max_rpc_payload;
-	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
-		server->rsize = NFS_MAX_FILE_IO_SIZE;
-	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	if (server->wsize > max_rpc_payload)
-		server->wsize = max_rpc_payload;
-	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
-		server->wsize = NFS_MAX_FILE_IO_SIZE;
-	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+#endif /* CONFIG_NFS_V3 */
 
-	if (sb->s_blocksize == 0)
-		sb->s_blocksize = nfs_block_bits(server->wsize,
-							 &sb->s_blocksize_bits);
-	server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
-
-	server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE)
-		server->dtsize = PAGE_CACHE_SIZE;
-	if (server->dtsize > server->rsize)
-		server->dtsize = server->rsize;
-
-	if (server->flags & NFS_MOUNT_NOAC) {
-		server->acregmin = server->acregmax = 0;
-		server->acdirmin = server->acdirmax = 0;
-		sb->s_flags |= MS_SYNCHRONOUS;
+	/* We now require that the mount process passes the remote address */
+	if (data->addr.sin_addr.s_addr == INADDR_ANY) {
+		dprintk("%s: mount program didn't pass remote address!\n",
+			__FUNCTION__);
+		return -EINVAL;
 	}
-	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
-	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	/* Prepare the root filehandle */
+	if (data->flags & NFS_MOUNT_VER3)
+		mntfh->size = data->root.size;
+	else
+		mntfh->size = NFS2_FHSIZE;
 
-	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
-	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
+	if (mntfh->size > sizeof(mntfh->data)) {
+		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
+		return -EINVAL;
+	}
+
+	memcpy(mntfh->data, data->root.data, mntfh->size);
+	if (mntfh->size < sizeof(mntfh->data))
+		memset(mntfh->data + mntfh->size, 0,
+		       sizeof(mntfh->data) - mntfh->size);
 
-	/* We're airborne Set socket buffersize */
-	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 	return 0;
-	/* Yargs. It didn't work out. */
-out_no_root:
-	dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
-	if (!IS_ERR(root_inode))
-		iput(root_inode);
-	return no_root_error;
 }
 
 /*
- * Create an RPC client handle.
+ * Initialise the common bits of the superblock
  */
-static struct rpc_clnt *
-nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
+static inline void nfs_initialise_sb(struct super_block *sb)
 {
-	struct nfs_client	*clp;
-	struct rpc_clnt		*clnt;
-	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-	int			nfsversion = 2;
-	int			err;
-
-#ifdef CONFIG_NFS_V3
-	if (server->flags & NFS_MOUNT_VER3)
-		nfsversion = 3;
-#endif
-
-	clp = nfs_get_client(server->hostname, &server->addr, nfsversion);
-	if (!clp) {
-		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-		return ERR_PTR(PTR_ERR(clp));
-	}
-
-	if (clp->cl_cons_state == NFS_CS_INITING) {
-		/* Check NFS protocol revision and initialize RPC op
-		 * vector and file handle pool. */
-#ifdef CONFIG_NFS_V3
-		if (nfsversion == 3) {
-			clp->rpc_ops = &nfs_v3_clientops;
-			server->caps |= NFS_CAP_READDIRPLUS;
-		} else {
-			clp->rpc_ops = &nfs_v2_clientops;
-		}
-#else
-		clp->rpc_ops = &nfs_v2_clientops;
-#endif
-
-		/* create transport and client */
-		err = nfs_create_rpc_client(clp, proto, data->timeo,
-					    data->retrans, RPC_AUTH_UNIX);
-		if (err < 0)
-			goto client_init_error;
-
-		nfs_mark_client_ready(clp, 0);
-	}
+	struct nfs_server *server = NFS_SB(sb);
 
-	/* create an nfs_server-specific client */
-	clnt = rpc_clone_client(clp->cl_rpcclient);
-	if (IS_ERR(clnt)) {
-		dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__);
-		nfs_put_client(clp);
-		return ERR_PTR(PTR_ERR(clnt));
-	}
+	sb->s_magic = NFS_SUPER_MAGIC;
 
-	if (data->pseudoflavor != clp->cl_rpcclient->cl_auth->au_flavor) {
-		struct rpc_auth *auth;
+	/* We probably want something more informative here */
+	snprintf(sb->s_id, sizeof(sb->s_id),
+		 "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
 
-		auth = rpcauth_create(data->pseudoflavor, server->client);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-			return ERR_PTR(PTR_ERR(auth));
-		}
-	}
+	if (sb->s_blocksize == 0)
+		sb->s_blocksize = nfs_block_bits(server->wsize,
+						 &sb->s_blocksize_bits);
 
-	server->nfs_client = clp;
-	return clnt;
+	if (server->flags & NFS_MOUNT_NOAC)
+		sb->s_flags |= MS_SYNCHRONOUS;
 
-client_init_error:
-	nfs_mark_client_ready(clp, err);
-	nfs_put_client(clp);
-	return ERR_PTR(err);
+	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
 /*
- * Clone a server record
+ * Finish setting up an NFS2/3 superblock
  */
-static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
+static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data)
 {
 	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_server *parent = NFS_SB(data->sb);
-	struct inode *root_inode;
-	struct nfs_fsinfo fsinfo;
-	void *err = ERR_PTR(-ENOMEM);
-
-	sb->s_op = data->sb->s_op;
-	sb->s_blocksize = data->sb->s_blocksize;
-	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
-	sb->s_maxbytes = data->sb->s_maxbytes;
-
-	server->client_acl = ERR_PTR(-EINVAL);
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
-		goto out;
-
-	server->client = rpc_clone_client(parent->client);
-	if (IS_ERR((err = server->client)))
-		goto out;
-
-	if (!IS_ERR(parent->client_acl)) {
-		server->client_acl = rpc_clone_client(parent->client_acl);
-		if (IS_ERR((err = server->client_acl)))
-			goto out;
-	}
-	root_inode = nfs_fhget(sb, data->fh, data->fattr);
-	if (!root_inode)
-		goto out;
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_put_root;
-	fsinfo.fattr = data->fattr;
-	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
-		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
-	sb->s_root->d_op = server->nfs_client->rpc_ops->dentry_ops;
-	sb->s_flags |= MS_ACTIVE;
-	return server;
-out_put_root:
-	iput(root_inode);
-out:
-	return err;
-}
-
-/*
- * Copy an existing superblock and attach revised data
- */
-static int nfs_clone_generic_sb(struct nfs_clone_mount *data,
-		struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
-		struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *),
-		struct vfsmount *mnt)
-{
-	struct nfs_server *server;
-	struct nfs_server *parent = NFS_SB(data->sb);
-	struct super_block *sb = ERR_PTR(-EINVAL);
-	char *hostname;
-	int error = -ENOMEM;
-	int len;
-
-	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (server == NULL)
-		goto out_err;
-	memcpy(server, parent, sizeof(*server));
-	atomic_inc(&server->nfs_client->cl_count);
-	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
-	len = strlen(hostname) + 1;
-	server->hostname = kmalloc(len, GFP_KERNEL);
-	if (server->hostname == NULL)
-		goto free_server;
-	memcpy(server->hostname, hostname, len);
-
-	sb = fill_sb(server, data);
-	if (IS_ERR(sb)) {
-		error = PTR_ERR(sb);
-		goto free_hostname;
-	}
 
-	if (sb->s_root)
-		goto out_share;
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	if (data->bsize)
+		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
 
-	server = fill_server(sb, data);
-	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
-		goto out_deactivate;
+	if (server->flags & NFS_MOUNT_VER3) {
+		/* The VFS shouldn't apply the umask to mode bits. We will do
+		 * so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
+		sb->s_time_gran = 1;
 	}
-	return simple_set_mnt(mnt, sb);
-out_deactivate:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	return error;
-out_share:
-	kfree(server->hostname);
-	nfs_put_client(server->nfs_client);
-	kfree(server);
-	return simple_set_mnt(mnt, sb);
-free_hostname:
-	kfree(server->hostname);
-free_server:
-	nfs_put_client(server->nfs_client);
-	kfree(server);
-out_err:
-	return error;
+
+	sb->s_op = &nfs_sops;
+ 	nfs_initialise_sb(sb);
 }
 
 /*
- * Set up an NFS2/3 superblock
- *
- * The way this works is that the mount process passes a structure
- * in the data argument which contains the server's IP address
- * and the root file handle obtained from the server's mount
- * daemon. We stash these away in the private superblock fields.
+ * Finish setting up a cloned NFS2/3 superblock
  */
-static int
-nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
+static void nfs_clone_super(struct super_block *sb,
+			    const struct super_block *old_sb)
 {
-	struct nfs_server	*server;
-	rpc_authflavor_t	authflavor;
+	struct nfs_server *server = NFS_SB(sb);
+
+	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
+	sb->s_blocksize = old_sb->s_blocksize;
+	sb->s_maxbytes = old_sb->s_maxbytes;
 
-	server           = NFS_SB(sb);
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	if (data->bsize)
-		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
-	if (data->rsize)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	/* Start lockd here, before we might error out */
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_up();
-
-	server->namelen  = data->namlen;
-	server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
-	if (!server->hostname)
-		return -ENOMEM;
-	strcpy(server->hostname, data->hostname);
-
-	/* Fill in pseudoflavor for mount version < 5 */
-	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-		data->pseudoflavor = RPC_AUTH_UNIX;
-	authflavor = data->pseudoflavor;	/* save for sb_init() */
-	/* XXX maybe we want to add a server->pseudoflavor field */
-
-	/* Create RPC client handles */
-	server->client = nfs_create_client(server, data);
-	if (IS_ERR(server->client))
-		return PTR_ERR(server->client);
-
-	/* RFC 2623, sec 2.3.2 */
 	if (server->flags & NFS_MOUNT_VER3) {
-#ifdef CONFIG_NFS_V3_ACL
-		if (!(server->flags & NFS_MOUNT_NOACL)) {
-			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
-			/* No errors! Assume that Sun nfsacls are supported */
-			if (!IS_ERR(server->client_acl))
-				server->caps |= NFS_CAP_ACLS;
-		}
-#else
-		server->flags &= ~NFS_MOUNT_NOACL;
-#endif /* CONFIG_NFS_V3_ACL */
-		/*
-		 * The VFS shouldn't apply the umask to mode bits. We will
-		 * do so ourselves when necessary.
+		/* The VFS shouldn't apply the umask to mode bits. We will do
+		 * so ourselves when necessary.
 		 */
 		sb->s_flags |= MS_POSIXACL;
-		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-			server->namelen = NFS3_MAXNAMLEN;
 		sb->s_time_gran = 1;
-	} else {
-		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-			server->namelen = NFS2_MAXNAMLEN;
 	}
 
-	sb->s_op = &nfs_sops;
-	return nfs_sb_init(sb, authflavor);
+	sb->s_op = old_sb->s_op;
+ 	nfs_initialise_sb(sb);
 }
 
-static int nfs_set_super(struct super_block *s, void *data)
+static int nfs_set_super(struct super_block *s, void *_server)
 {
-	s->s_fs_info = data;
-	return set_anon_super(s, data);
+	struct nfs_server *server = _server;
+	int ret;
+
+	s->s_fs_info = server;
+	ret = set_anon_super(s, server);
+	if (ret == 0)
+		server->s_dev = s->s_dev;
+	return ret;
 }
 
 static int nfs_compare_super(struct super_block *sb, void *data)
 {
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
+	struct nfs_server *server = data, *old = NFS_SB(sb);
 
-	if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
+	if (old->nfs_client != server->nfs_client)
 		return 0;
-	if (old->addr.sin_port != server->addr.sin_port)
+	if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0)
 		return 0;
-	return !nfs_compare_fh(&old->fh, &server->fh);
+	return 1;
 }
 
 static int nfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-	int error;
 	struct nfs_server *server = NULL;
 	struct super_block *s;
-	struct nfs_fh *root;
+	struct nfs_fh mntfh;
 	struct nfs_mount_data *data = raw_data;
+	struct dentry *mntroot;
+	int error;
 
-	error = -EINVAL;
-	if (data == NULL) {
-		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err_noserver;
-	}
-	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
-		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err_noserver;
-	}
-	switch (data->version) {
-		case 1:
-			data->namlen = 0;
-		case 2:
-			data->bsize  = 0;
-		case 3:
-			if (data->flags & NFS_MOUNT_VER3) {
-				dprintk("%s: mount structure version %d does not support NFSv3\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err_noserver;
-			}
-			data->root.size = NFS2_FHSIZE;
-			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
-		case 4:
-			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
-				dprintk("%s: mount structure version %d does not support strong security\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err_noserver;
-			}
-		case 5:
-			memset(data->context, 0, sizeof(data->context));
-	}
-#ifndef CONFIG_NFS_V3
-	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	error = -EPROTONOSUPPORT;
-	if (data->flags & NFS_MOUNT_VER3) {
-		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err_noserver;
-	}
-#endif /* CONFIG_NFS_V3 */
+	/* Validate the mount data */
+	error = nfs_validate_mount_data(data, &mntfh);
+	if (error < 0)
+		return error;
 
-	error = -ENOMEM;
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
+	/* Get a volume representation */
+	server = nfs_create_server(data, &mntfh);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
 		goto out_err_noserver;
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_acl = ERR_PTR(-EINVAL);
-
-	root = &server->fh;
-	if (data->flags & NFS_MOUNT_VER3)
-		root->size = data->root.size;
-	else
-		root->size = NFS2_FHSIZE;
-	error = -EINVAL;
-	if (root->size > sizeof(root->data)) {
-		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
-		goto out_err;
-	}
-	memcpy(root->data, data->root.data, root->size);
-
-	/* We now require that the mount process passes the remote address */
-	memcpy(&server->addr, &data->addr, sizeof(server->addr));
-	if (server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote address!\n",
-				__FUNCTION__);
-		goto out_err;
 	}
 
+	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
-		goto out_err;
+		goto out_err_nosb;
 	}
 
-	if (s->s_root)
-		goto out_share;
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
+	}
 
-	s->s_flags = flags;
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		s->s_flags = flags;
+		nfs_fill_super(s, data);
+	}
 
-	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return error;
+	mntroot = nfs_get_root(s, &mntfh);
+	if (IS_ERR(mntroot)) {
+		error = PTR_ERR(mntroot);
+		goto error_splat_super;
 	}
-	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
 
-out_share:
-	kfree(server);
-	return simple_set_mnt(mnt, s);
+	s->s_flags |= MS_ACTIVE;
+	mnt->mnt_sb = s;
+	mnt->mnt_root = mntroot;
+	return 0;
 
-out_err:
-	kfree(server);
+out_err_nosb:
+	nfs_free_server(server);
 out_err_noserver:
 	return error;
+
+error_splat_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return error;
 }
 
+/*
+ * Destroy an NFS2/3 superblock
+ */
 static void nfs_kill_super(struct super_block *s)
 {
 	struct nfs_server *server = NFS_SB(s);
 
 	kill_anon_super(s);
-
-	if (!IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-	if (!IS_ERR(server->client_acl))
-		rpc_shutdown_client(server->client_acl);
-
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_down();	/* release rpc.lockd */
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	nfs_put_client(server->nfs_client);
-	kfree(server);
-	nfs_release_automount_timer();
+	nfs_free_server(server);
 }
 
-static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	struct super_block *sb;
-
-	server->fsid = data->fattr->fsid;
-	nfs_copy_fh(&server->fh, data->fh);
-	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
-	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
-		lockd_up();
-	return sb;
-}
-
-static int nfs_clone_nfs_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+/*
+ * Clone an NFS2/3 server record on xdev traversal (FSID-change)
+ */
+static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
+			   const char *dev_name, void *raw_data,
+			   struct vfsmount *mnt)
 {
 	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt);
-}
+	struct super_block *s;
+	struct nfs_server *server;
+	struct dentry *mntroot;
+	int error;
 
-#ifdef CONFIG_NFS_V4
-static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
-	int timeo, int retrans, int proto, rpc_authflavor_t flavor)
-{
-	struct nfs_client *clp;
-	struct rpc_clnt *clnt = NULL;
-	int err = -EIO;
-
-	clp = nfs_get_client(server->hostname, &server->addr, 4);
-	if (!clp) {
-		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-		return ERR_PTR(err);
-	}
+	dprintk("--> nfs_xdev_get_sb()\n");
 
-	/* Now create transport and client */
-	if (clp->cl_cons_state == NFS_CS_INITING) {
-		clp->rpc_ops = &nfs_v4_clientops;
+	/* create a new volume representation */
+	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
+		goto out_err_noserver;
+	}
 
-		err = nfs_create_rpc_client(clp, proto, timeo, retrans, flavor);
-		if (err < 0)
-			goto client_init_error;
+	/* Get a superblock - note that we may end up sharing one that already exists */
+	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_nosb;
+	}
 
-		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-		err = nfs_idmap_new(clp);
-		if (err < 0) {
-			dprintk("%s: failed to create idmapper.\n",
-				__FUNCTION__);
-			goto client_init_error;
-		}
-		__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
-		nfs_mark_client_ready(clp, 0);
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
 	}
 
-	clnt = rpc_clone_client(clp->cl_rpcclient);
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		s->s_flags = flags;
+		nfs_clone_super(s, data->sb);
+	}
 
-	if (IS_ERR(clnt)) {
-		dprintk("%s: cannot create RPC client. Error = %d\n",
-				__FUNCTION__, err);
-		return clnt;
+	mntroot = nfs_get_root(s, data->fh);
+	if (IS_ERR(mntroot)) {
+		error = PTR_ERR(mntroot);
+		goto error_splat_super;
 	}
 
-	if (clnt->cl_auth->au_flavor != flavor) {
-		struct rpc_auth *auth;
+	s->s_flags |= MS_ACTIVE;
+	mnt->mnt_sb = s;
+	mnt->mnt_root = mntroot;
 
-		auth = rpcauth_create(flavor, clnt);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-			return (struct rpc_clnt *)auth;
-		}
-	}
+	dprintk("<-- nfs_xdev_get_sb() = 0\n");
+	return 0;
 
-	server->nfs_client = clp;
-	down_write(&clp->cl_sem);
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-	up_write(&clp->cl_sem);
-	return clnt;
+out_err_nosb:
+	nfs_free_server(server);
+out_err_noserver:
+	dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error);
+	return error;
 
-client_init_error:
-	nfs_mark_client_ready(clp, err);
-	nfs_put_client(clp);
-	return ERR_PTR(err);
+error_splat_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
+	return error;
 }
 
+#ifdef CONFIG_NFS_V4
+
 /*
- * Set up an NFS4 superblock
+ * Finish setting up a cloned NFS4 superblock
  */
-static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+static void nfs4_clone_super(struct super_block *sb,
+			    const struct super_block *old_sb)
 {
-	struct nfs_server *server;
-	rpc_authflavor_t authflavour;
-	int err = -EIO;
-
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	server = NFS_SB(sb);
-	if (data->rsize != 0)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize != 0)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
-	server->caps = NFS_CAP_ATOMIC_OPEN;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	/* Now create transport and client */
-	authflavour = RPC_AUTH_UNIX;
-	if (data->auth_flavourlen != 0) {
-		if (data->auth_flavourlen != 1) {
-			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
-					__FUNCTION__, data->auth_flavourlen);
-			err = -EINVAL;
-			goto out_fail;
-		}
-		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
-			err = -EFAULT;
-			goto out_fail;
-		}
-	}
-
-	server->client = nfs4_create_client(server, data->timeo, data->retrans,
-					    data->proto, authflavour);
-	if (IS_ERR(server->client)) {
-		err = PTR_ERR(server->client);
-			dprintk("%s: cannot create RPC client. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-	}
-
+	sb->s_blocksize_bits = old_sb->s_blocksize_bits;
+	sb->s_blocksize = old_sb->s_blocksize;
+	sb->s_maxbytes = old_sb->s_maxbytes;
 	sb->s_time_gran = 1;
-
-	sb->s_op = &nfs4_sops;
-	err = nfs_sb_init(sb, authflavour);
-
- out_fail:
-	return err;
+	sb->s_op = old_sb->s_op;
+ 	nfs_initialise_sb(sb);
 }
 
-static int nfs4_compare_super(struct super_block *sb, void *data)
+/*
+ * Set up an NFS4 superblock
+ */
+static void nfs4_fill_super(struct super_block *sb)
 {
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
-
-	if (strcmp(server->hostname, old->hostname) != 0)
-		return 0;
-	if (strcmp(server->mnt_path, old->mnt_path) != 0)
-		return 0;
-	return 1;
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	nfs_initialise_sb(sb);
 }
 
-static void *
-nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
+static void *nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
 {
 	void *p = NULL;
 
@@ -1157,14 +791,22 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
 	return dst;
 }
 
+/*
+ * Get the superblock for an NFS4 mountpoint
+ */
 static int nfs4_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
-	int error;
-	struct nfs_server *server;
-	struct super_block *s;
 	struct nfs4_mount_data *data = raw_data;
+	struct super_block *s;
+	struct nfs_server *server;
+	struct sockaddr_in addr;
+	rpc_authflavor_t authflavour;
+	struct nfs_fh mntfh;
+	struct dentry *mntroot;
+	char *mntpath = NULL, *hostname = NULL, ip_addr[16];
 	void *p;
+	int error;
 
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
@@ -1175,75 +817,107 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		return -EINVAL;
 	}
 
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
-		return -ENOMEM;
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_acl = ERR_PTR(-EINVAL);
+	/* We now require that the mount process passes the remote address */
+	if (data->host_addrlen != sizeof(addr))
+		return -EINVAL;
+
+	if (copy_from_user(&addr, data->host_addr, sizeof(addr)))
+		return -EFAULT;
+
+	if (addr.sin_family != AF_INET ||
+	    addr.sin_addr.s_addr == INADDR_ANY
+	    ) {
+		dprintk("%s: mount program didn't pass remote IP address!\n",
+				__FUNCTION__);
+		return -EINVAL;
+	}
+
+	/* Grab the authentication type */
+	authflavour = RPC_AUTH_UNIX;
+	if (data->auth_flavourlen != 0) {
+		if (data->auth_flavourlen != 1) {
+			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
+					__FUNCTION__, data->auth_flavourlen);
+			error = -EINVAL;
+			goto out_err_noserver;
+		}
+
+		if (copy_from_user(&authflavour, data->auth_flavours,
+				   sizeof(authflavour))) {
+			error = -EFAULT;
+			goto out_err_noserver;
+		}
+	}
 
 	p = nfs_copy_user_string(NULL, &data->hostname, 256);
 	if (IS_ERR(p))
 		goto out_err;
-	server->hostname = p;
+	hostname = p;
 
 	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
 	if (IS_ERR(p))
 		goto out_err;
-	server->mnt_path = p;
+	mntpath = p;
 
-	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
-			sizeof(server->ip_addr) - 1);
+	dprintk("MNTPATH: %s\n", mntpath);
+
+	p = nfs_copy_user_string(ip_addr, &data->client_addr,
+				 sizeof(ip_addr) - 1);
 	if (IS_ERR(p))
 		goto out_err;
 
-	/* We now require that the mount process passes the remote address */
-	if (data->host_addrlen != sizeof(server->addr)) {
-		error = -EINVAL;
-		goto out_free;
-	}
-	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		error = -EFAULT;
-		goto out_free;
-	}
-	if (server->addr.sin_family != AF_INET ||
-	    server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote IP address!\n",
-				__FUNCTION__);
-		error = -EINVAL;
-		goto out_free;
+	/* Get a volume representation */
+	server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr,
+				    authflavour, &mntfh);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
+		goto out_err_noserver;
 	}
 
-	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
+	/* Get a superblock - note that we may end up sharing one that already exists */
+	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_free;
 	}
 
-	if (s->s_root) {
-		kfree(server->mnt_path);
-		kfree(server->hostname);
-		kfree(server);
-		return simple_set_mnt(mnt, s);
-	}
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		s->s_flags = flags;
 
-	s->s_flags = flags;
+		nfs4_fill_super(s);
+	} else {
+		nfs_free_server(server);
+	}
 
-	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return error;
+	mntroot = nfs4_get_root(s, &mntfh);
+	if (IS_ERR(mntroot)) {
+		error = PTR_ERR(mntroot);
+		goto error_splat_super;
 	}
+
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	mnt->mnt_sb = s;
+	mnt->mnt_root = mntroot;
+	kfree(mntpath);
+	kfree(hostname);
+	return 0;
+
 out_err:
 	error = PTR_ERR(p);
+	goto out_err_noserver;
+
 out_free:
-	kfree(server->mnt_path);
-	kfree(server->hostname);
-	kfree(server);
+	nfs_free_server(server);
+out_err_noserver:
+	kfree(mntpath);
+	kfree(hostname);
 	return error;
+
+error_splat_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	goto out_err_noserver;
 }
 
 static void nfs4_kill_super(struct super_block *sb)
@@ -1254,133 +928,140 @@ static void nfs4_kill_super(struct super_block *sb)
 	kill_anon_super(sb);
 
 	nfs4_renewd_prepare_shutdown(server);
-
-	if (server->client != NULL && !IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-
-	destroy_nfsv4_state(server);
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	kfree(server);
-	nfs_release_automount_timer();
+	nfs_free_server(server);
 }
 
 /*
- * Constructs the SERVER-side path
+ * Clone an NFS4 server record on xdev traversal (FSID-change)
  */
-static inline char *nfs4_dup_path(const struct dentry *dentry)
+static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
+			    const char *dev_name, void *raw_data,
+			    struct vfsmount *mnt)
 {
-	char *page = (char *) __get_free_page(GFP_USER);
-	char *path;
+	struct nfs_clone_mount *data = raw_data;
+	struct super_block *s;
+	struct nfs_server *server;
+	struct dentry *mntroot;
+	int error;
 
-	path = nfs4_path(dentry, page, PAGE_SIZE);
-	if (!IS_ERR(path)) {
-		int len = PAGE_SIZE + page - path;
-		char *tmp = path;
+	dprintk("--> nfs4_xdev_get_sb()\n");
 
-		path = kmalloc(len, GFP_KERNEL);
-		if (path)
-			memcpy(path, tmp, len);
-		else
-			path = ERR_PTR(-ENOMEM);
+	/* create a new volume representation */
+	server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
+		goto out_err_noserver;
 	}
-	free_page((unsigned long)page);
-	return path;
-}
 
-static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	const struct dentry *dentry = data->dentry;
-	struct nfs_client *clp = server->nfs_client;
-	struct super_block *sb;
-
-	server->fsid = data->fattr->fsid;
-	nfs_copy_fh(&server->fh, data->fh);
-	server->mnt_path = nfs4_dup_path(dentry);
-	if (IS_ERR(server->mnt_path)) {
-		sb = (struct super_block *)server->mnt_path;
-		goto err;
+	/* Get a superblock - note that we may end up sharing one that already exists */
+	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_nosb;
 	}
-	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
-	if (IS_ERR(sb) || sb->s_root)
-		goto free_path;
-	nfs4_server_capabilities(server, &server->fh);
-
-	down_write(&clp->cl_sem);
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-	up_write(&clp->cl_sem);
-	return sb;
-free_path:
-	kfree(server->mnt_path);
-err:
-	server->mnt_path = NULL;
-	return sb;
-}
 
-static int nfs_clone_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
-{
-	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt);
-}
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
+	}
 
-static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	struct super_block *sb = ERR_PTR(-ENOMEM);
-	int len;
-
-	len = strlen(data->mnt_path) + 1;
-	server->mnt_path = kmalloc(len, GFP_KERNEL);
-	if (server->mnt_path == NULL)
-		goto err;
-	memcpy(server->mnt_path, data->mnt_path, len);
-	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
-
-	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
-	if (IS_ERR(sb) || sb->s_root)
-		goto free_path;
-	return sb;
-free_path:
-	kfree(server->mnt_path);
-err:
-	server->mnt_path = NULL;
-	return sb;
-}
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		s->s_flags = flags;
+		nfs4_clone_super(s, data->sb);
+	}
 
-static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	int proto, timeo, retrans;
-	void *err;
-
-	proto = IPPROTO_TCP;
-	/* Since we are following a referral and there may be alternatives,
-	   set the timeouts and retries to low values */
-	timeo = 2;
-	retrans = 1;
-
-	nfs_put_client(server->nfs_client);
-	server->nfs_client = NULL;
-	server->client = nfs4_create_client(server, timeo, retrans, proto,
-					    data->authflavor);
-	if (IS_ERR((err = server->client)))
-		goto out_err;
+	mntroot = nfs4_get_root(s, data->fh);
+	if (IS_ERR(mntroot)) {
+		error = PTR_ERR(mntroot);
+		goto error_splat_super;
+	}
 
-	sb->s_time_gran = 1;
-	sb->s_op = &nfs4_sops;
-	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
-	if (!IS_ERR(err))
-		return server;
-out_err:
-	return (struct nfs_server *)err;
+	s->s_flags |= MS_ACTIVE;
+	mnt->mnt_sb = s;
+	mnt->mnt_root = mntroot;
+
+	dprintk("<-- nfs4_xdev_get_sb() = 0\n");
+	return 0;
+
+out_err_nosb:
+	nfs_free_server(server);
+out_err_noserver:
+	dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error);
+	return error;
+
+error_splat_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
+	return error;
 }
 
-static int nfs_referral_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+/*
+ * Create an NFS4 server record on referral traversal
+ */
+static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *raw_data,
+				struct vfsmount *mnt)
 {
 	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt);
+	struct super_block *s;
+	struct nfs_server *server;
+	struct dentry *mntroot;
+	struct nfs_fh mntfh;
+	int error;
+
+	dprintk("--> nfs4_referral_get_sb()\n");
+
+	/* create a new volume representation */
+	server = nfs4_create_referral_server(data, &mntfh);
+	if (IS_ERR(server)) {
+		error = PTR_ERR(server);
+		goto out_err_noserver;
+	}
+
+	/* Get a superblock - note that we may end up sharing one that already exists */
+	s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_nosb;
+	}
+
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
+	}
+
+	if (!s->s_root) {
+		/* initial superblock/root creation */
+		s->s_flags = flags;
+		nfs4_fill_super(s);
+	}
+
+	mntroot = nfs4_get_root(s, data->fh);
+	if (IS_ERR(mntroot)) {
+		error = PTR_ERR(mntroot);
+		goto error_splat_super;
+	}
+
+	s->s_flags |= MS_ACTIVE;
+	mnt->mnt_sb = s;
+	mnt->mnt_root = mntroot;
+
+	dprintk("<-- nfs4_referral_get_sb() = 0\n");
+	return 0;
+
+out_err_nosb:
+	nfs_free_server(server);
+out_err_noserver:
+	dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error);
+	return error;
+
+error_splat_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
+	return error;
 }
 
-#endif
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7084ac9a645..453d44666ea 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1273,7 +1273,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS: faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
-				NFS_SERVER(data->inode)->hostname,
+				NFS_SERVER(data->inode)->nfs_client->cl_hostname,
 				resp->verf->committed, argp->stable);
 			complain = jiffies + 300 * HZ;
 		}
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d404ceca916..6d0be0efd1b 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -51,7 +51,6 @@ struct nfs_client {
 	unsigned long		cl_lease_time;
 	unsigned long		cl_last_renewal;
 	struct work_struct	cl_renewd;
-	struct work_struct	cl_recoverd;
 
 	struct rpc_wait_queue	cl_rpcwaitq;
 
@@ -74,6 +73,10 @@ struct nfs_client {
  */
 struct nfs_server {
 	struct nfs_client *	nfs_client;	/* shared client and NFS4 state */
+	struct list_head	client_link;	/* List of other nfs_server structs
+						 * that share the same client
+						 */
+	struct list_head	master_link;	/* link in master servers list */
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
@@ -92,20 +95,13 @@ struct nfs_server {
 	unsigned int		acdirmin;
 	unsigned int		acdirmax;
 	unsigned int		namelen;
-	char *			hostname;	/* remote hostname */
-	struct nfs_fh		fh;
-	struct sockaddr_in	addr;
+
 	struct nfs_fsid		fsid;
+	__u64			maxfilesize;	/* maximum file size */
 	unsigned long		mount_time;	/* when this fs was mounted */
+	dev_t			s_dev;		/* superblock dev numbers */
+
 #ifdef CONFIG_NFS_V4
-	/* Our own IP address, as a null-terminated string.
-	 * This is used to generate the clientid, and the callback address.
-	 */
-	char			ip_addr[16];
-	char *			mnt_path;
-	struct list_head	nfs4_siblings;	/* List of other nfs_server structs
-						 * that share the same clientid
-						 */
 	u32			attr_bitmask[2];/* V4 bitmask representing the set
 						   of attributes supported on this
 						   filesystem */
@@ -113,6 +109,7 @@ struct nfs_server {
 						   that are supported on this
 						   filesystem */
 #endif
+	void (*destroy)(struct nfs_server *);
 };
 
 /* Server capabilities */
-- 
cgit v1.2.3-70-g09d2


From ec739ef03dc926d05051c8c5838971445504470a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:15 -0400
Subject: SUNRPC: Create a helper to tell whether a transport is bound

Hide the contents and format of xprt->addr by eliminating direct uses
of the xprt->addr.sin_port field.  This change is required to support
alternate RPC host address formats (eg IPv6).

Test-plan:
Destructive testing (unplugging the network temporarily).  Repeated runs of
Connectathon locking suite with UDP and TCP.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 16 ++++++++++++++++
 net/sunrpc/clnt.c           | 10 +++++-----
 net/sunrpc/pmap_clnt.c      |  5 ++++-
 net/sunrpc/xprt.c           |  2 +-
 net/sunrpc/xprtsock.c       | 14 ++++++++++----
 5 files changed, 36 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 3a0cca255b7..a71106723d7 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -269,6 +269,7 @@ int			xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to);
 #define XPRT_CONNECTED		(1)
 #define XPRT_CONNECTING		(2)
 #define XPRT_CLOSE_WAIT		(3)
+#define XPRT_BOUND		(4)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
@@ -312,6 +313,21 @@ static inline int xprt_test_and_set_connecting(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_CONNECTING, &xprt->state);
 }
 
+static inline void xprt_set_bound(struct rpc_xprt *xprt)
+{
+	test_and_set_bit(XPRT_BOUND, &xprt->state);
+}
+
+static inline int xprt_bound(struct rpc_xprt *xprt)
+{
+	return test_bit(XPRT_BOUND, &xprt->state);
+}
+
+static inline void xprt_clear_bound(struct rpc_xprt *xprt)
+{
+	clear_bit(XPRT_BOUND, &xprt->state);
+}
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3e19d321067..0b8d03d0856 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -148,7 +148,6 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	clnt->cl_maxproc  = version->nrprocs;
 	clnt->cl_protname = program->name;
 	clnt->cl_pmap	  = &clnt->cl_pmap_default;
-	clnt->cl_port     = xprt->addr.sin_port;
 	clnt->cl_prog     = program->number;
 	clnt->cl_vers     = version->number;
 	clnt->cl_prot     = xprt->prot;
@@ -156,7 +155,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	clnt->cl_metrics  = rpc_alloc_iostats(clnt);
 	rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait");
 
-	if (!clnt->cl_port)
+	if (!xprt_bound(clnt->cl_xprt))
 		clnt->cl_autobind = 1;
 
 	clnt->cl_rtt = &clnt->cl_rtt_default;
@@ -570,7 +569,7 @@ EXPORT_SYMBOL(rpc_max_payload);
 void rpc_force_rebind(struct rpc_clnt *clnt)
 {
 	if (clnt->cl_autobind)
-		clnt->cl_port = 0;
+		xprt_clear_bound(clnt->cl_xprt);
 }
 EXPORT_SYMBOL(rpc_force_rebind);
 
@@ -782,14 +781,15 @@ static void
 call_bind(struct rpc_task *task)
 {
 	struct rpc_clnt	*clnt = task->tk_client;
+	struct rpc_xprt *xprt = task->tk_xprt;
 
 	dprintk("RPC: %4d call_bind (status %d)\n",
 				task->tk_pid, task->tk_status);
 
 	task->tk_action = call_connect;
-	if (!clnt->cl_port) {
+	if (!xprt_bound(xprt)) {
 		task->tk_action = call_bind_status;
-		task->tk_timeout = task->tk_xprt->bind_timeout;
+		task->tk_timeout = xprt->bind_timeout;
 		rpc_getport(task, clnt);
 	}
 }
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 623180f224c..209ffdfee10 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -142,15 +142,17 @@ pmap_getport_done(struct rpc_task *task)
 	dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n",
 			task->tk_pid, task->tk_status, clnt->cl_port);
 
-	xprt->ops->set_port(xprt, 0);
 	if (task->tk_status < 0) {
 		/* Make the calling task exit with an error */
+		xprt->ops->set_port(xprt, 0);
 		task->tk_action = rpc_exit_task;
 	} else if (clnt->cl_port == 0) {
 		/* Program not registered */
+		xprt->ops->set_port(xprt, 0);
 		rpc_exit(task, -EACCES);
 	} else {
 		xprt->ops->set_port(xprt, clnt->cl_port);
+		xprt_set_bound(xprt);
 		clnt->cl_port = htons(clnt->cl_port);
 	}
 	spin_lock(&pmap_lock);
@@ -218,6 +220,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg
 	if (IS_ERR(xprt))
 		return (struct rpc_clnt *)xprt;
 	xprt->ops->set_port(xprt, RPC_PMAP_PORT);
+	xprt_set_bound(xprt);
 	if (!privileged)
 		xprt->resvport = 0;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e8c2bc4977f..e239ef985ef 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -534,7 +534,7 @@ void xprt_connect(struct rpc_task *task)
 	dprintk("RPC: %4d xprt_connect xprt %p %s connected\n", task->tk_pid,
 			xprt, (xprt_connected(xprt) ? "is" : "is not"));
 
-	if (!xprt->addr.sin_port) {
+	if (!xprt_bound(xprt)) {
 		task->tk_status = -EIO;
 		return;
 	}
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 441bd53f5ec..123ac1e5ba1 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1016,7 +1016,7 @@ static void xs_udp_connect_worker(void *args)
 	struct socket *sock = xprt->sock;
 	int err, status = -EIO;
 
-	if (xprt->shutdown || xprt->addr.sin_port == 0)
+	if (xprt->shutdown || !xprt_bound(xprt))
 		goto out;
 
 	dprintk("RPC:      xs_udp_connect_worker for xprt %p\n", xprt);
@@ -1099,7 +1099,7 @@ static void xs_tcp_connect_worker(void *args)
 	struct socket *sock = xprt->sock;
 	int err, status = -EIO;
 
-	if (xprt->shutdown || xprt->addr.sin_port == 0)
+	if (xprt->shutdown || !xprt_bound(xprt))
 		goto out;
 
 	dprintk("RPC:      xs_tcp_connect_worker for xprt %p\n", xprt);
@@ -1307,8 +1307,11 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	if (xprt->slot == NULL)
 		return -ENOMEM;
 
-	xprt->prot = IPPROTO_UDP;
+	if (ntohs(xprt->addr.sin_port) != 0)
+		xprt_set_bound(xprt);
 	xprt->port = xs_get_random_port();
+
+	xprt->prot = IPPROTO_UDP;
 	xprt->tsh_size = 0;
 	xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
@@ -1348,8 +1351,11 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	if (xprt->slot == NULL)
 		return -ENOMEM;
 
-	xprt->prot = IPPROTO_TCP;
+	if (ntohs(xprt->addr.sin_port) != 0)
+		xprt_set_bound(xprt);
 	xprt->port = xs_get_random_port();
+
+	xprt->prot = IPPROTO_TCP;
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
 	xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
-- 
cgit v1.2.3-70-g09d2


From 4a68179d38874c37be2802442a71b847f5d1a2a9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:15 -0400
Subject: SUNRPC: Make RPC portmapper use per-transport storage

Move connection and bind state that was maintained in the rpc_clnt
structure to the rpc_xprt structure.  This will allow the creation of
a clean API for plugging in different types of bind mechanisms.

This brings improvements such as the elimination of a single spin lock to
control serialization for all in-kernel RPC binding.  A set of per-xprt
bitops is used to serialize tasks during RPC binding, just like it now
works for making RPC transport connections.

Test-plan:
Destructive testing (unplugging the network temporarily).  Connectathon
with UDP and TCP.  NFSv2/3 and NFSv4 mounting should be carefully checked.
Probably need to rig a server where certain services aren't running, or
that returns an error for some typical operation.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  23 +------
 include/linux/sunrpc/xprt.h |  14 ++++
 net/sunrpc/clnt.c           |   8 +--
 net/sunrpc/pmap_clnt.c      | 158 +++++++++++++++++++++++++++++---------------
 net/sunrpc/xprt.c           |   1 +
 5 files changed, 123 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 8fe9f35eba3..00e9dbaec9c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -18,18 +18,6 @@
 #include <linux/sunrpc/timer.h>
 #include <asm/signal.h>
 
-/*
- * This defines an RPC port mapping
- */
-struct rpc_portmap {
-	__u32			pm_prog;
-	__u32			pm_vers;
-	__u32			pm_prot;
-	__u16			pm_port;
-	unsigned char		pm_binding : 1;	/* doing a getport() */
-	struct rpc_wait_queue	pm_bindwait;	/* waiting on getport() */
-};
-
 struct rpc_inode;
 
 /*
@@ -40,7 +28,9 @@ struct rpc_clnt {
 	atomic_t		cl_users;	/* number of references */
 	struct rpc_xprt *	cl_xprt;	/* transport */
 	struct rpc_procinfo *	cl_procinfo;	/* procedure info */
-	u32			cl_maxproc;	/* max procedure number */
+	u32			cl_prog,	/* RPC program number */
+				cl_vers,	/* RPC version number */
+				cl_maxproc;	/* max procedure number */
 
 	char *			cl_server;	/* server machine name */
 	char *			cl_protname;	/* protocol name */
@@ -55,7 +45,6 @@ struct rpc_clnt {
 				cl_dead     : 1;/* abandoned */
 
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
-	struct rpc_portmap *	cl_pmap;	/* port mapping */
 
 	int			cl_nodelen;	/* nodename length */
 	char 			cl_nodename[UNX_MAXNODENAME];
@@ -64,14 +53,8 @@ struct rpc_clnt {
 	struct dentry *		cl_dentry;	/* inode */
 	struct rpc_clnt *	cl_parent;	/* Points to parent of clones */
 	struct rpc_rtt		cl_rtt_default;
-	struct rpc_portmap	cl_pmap_default;
 	char			cl_inline_name[32];
 };
-#define cl_timeout		cl_xprt->timeout
-#define cl_prog			cl_pmap->pm_prog
-#define cl_vers			cl_pmap->pm_vers
-#define cl_port			cl_pmap->pm_port
-#define cl_prot			cl_pmap->pm_prot
 
 /*
  * General RPC program info
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a71106723d7..4ce82616873 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -138,6 +138,7 @@ struct rpc_xprt {
 	unsigned int		tsh_size;	/* size of transport specific
 						   header */
 
+	struct rpc_wait_queue	binding;	/* requests waiting on rpcbind */
 	struct rpc_wait_queue	sending;	/* requests waiting to send */
 	struct rpc_wait_queue	resend;		/* requests waiting to resend */
 	struct rpc_wait_queue	pending;	/* requests in flight */
@@ -270,6 +271,7 @@ int			xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to);
 #define XPRT_CONNECTING		(2)
 #define XPRT_CLOSE_WAIT		(3)
 #define XPRT_BOUND		(4)
+#define XPRT_BINDING		(5)
 
 static inline void xprt_set_connected(struct rpc_xprt *xprt)
 {
@@ -328,6 +330,18 @@ static inline void xprt_clear_bound(struct rpc_xprt *xprt)
 	clear_bit(XPRT_BOUND, &xprt->state);
 }
 
+static inline void xprt_clear_binding(struct rpc_xprt *xprt)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(XPRT_BINDING, &xprt->state);
+	smp_mb__after_clear_bit();
+}
+
+static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
+{
+	return test_and_set_bit(XPRT_BINDING, &xprt->state);
+}
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b8d03d0856..cee504162a3 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -147,13 +147,10 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	clnt->cl_procinfo = version->procs;
 	clnt->cl_maxproc  = version->nrprocs;
 	clnt->cl_protname = program->name;
-	clnt->cl_pmap	  = &clnt->cl_pmap_default;
 	clnt->cl_prog     = program->number;
 	clnt->cl_vers     = version->number;
-	clnt->cl_prot     = xprt->prot;
 	clnt->cl_stats    = program->stats;
 	clnt->cl_metrics  = rpc_alloc_iostats(clnt);
-	rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait");
 
 	if (!xprt_bound(clnt->cl_xprt))
 		clnt->cl_autobind = 1;
@@ -243,8 +240,6 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	atomic_set(&new->cl_users, 0);
 	new->cl_parent = clnt;
 	atomic_inc(&clnt->cl_count);
-	/* Duplicate portmapper */
-	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	/* Turn off autobind on clones */
 	new->cl_autobind = 0;
 	new->cl_oneshot = 0;
@@ -254,8 +249,7 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
-	new->cl_pmap		= &new->cl_pmap_default;
-	new->cl_metrics         = rpc_alloc_iostats(clnt);
+	new->cl_metrics = rpc_alloc_iostats(clnt);
 	return new;
 out_no_clnt:
 	printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 209ffdfee10..59d542436ca 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -24,11 +24,57 @@
 #define PMAP_UNSET		2
 #define PMAP_GETPORT		3
 
+struct portmap_args {
+	u32			pm_prog;
+	u32			pm_vers;
+	u32			pm_prot;
+	unsigned short		pm_port;
+	struct rpc_task *	pm_task;
+};
+
 static struct rpc_procinfo	pmap_procedures[];
 static struct rpc_clnt *	pmap_create(char *, struct sockaddr_in *, int, int);
-static void			pmap_getport_done(struct rpc_task *);
+static void			pmap_getport_done(struct rpc_task *, void *);
 static struct rpc_program	pmap_program;
-static DEFINE_SPINLOCK(pmap_lock);
+
+static void pmap_getport_prepare(struct rpc_task *task, void *calldata)
+{
+	struct portmap_args *map = calldata;
+	struct rpc_message msg = {
+		.rpc_proc	= &pmap_procedures[PMAP_GETPORT],
+		.rpc_argp	= map,
+		.rpc_resp	= &map->pm_port,
+	};
+
+	rpc_call_setup(task, &msg, 0);
+}
+
+static inline struct portmap_args *pmap_map_alloc(void)
+{
+	return kmalloc(sizeof(struct portmap_args), GFP_NOFS);
+}
+
+static inline void pmap_map_free(struct portmap_args *map)
+{
+	kfree(map);
+}
+
+static void pmap_map_release(void *data)
+{
+	pmap_map_free(data);
+}
+
+static const struct rpc_call_ops pmap_getport_ops = {
+	.rpc_call_prepare	= pmap_getport_prepare,
+	.rpc_call_done		= pmap_getport_done,
+	.rpc_release		= pmap_map_release,
+};
+
+static inline void pmap_wake_portmap_waiters(struct rpc_xprt *xprt)
+{
+	xprt_clear_binding(xprt);
+	rpc_wake_up(&xprt->binding);
+}
 
 /*
  * Obtain the port for a given RPC service on a given host. This one can
@@ -37,67 +83,71 @@ static DEFINE_SPINLOCK(pmap_lock);
 void
 rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
-	struct rpc_portmap *map = clnt->cl_pmap;
-	struct sockaddr_in *sap = &clnt->cl_xprt->addr;
-	struct rpc_message msg = {
-		.rpc_proc	= &pmap_procedures[PMAP_GETPORT],
-		.rpc_argp	= map,
-		.rpc_resp	= &clnt->cl_port,
-		.rpc_cred	= NULL
-	};
+	struct rpc_xprt *xprt = task->tk_xprt;
+	struct sockaddr_in *sap = &xprt->addr;
+	struct portmap_args *map;
 	struct rpc_clnt	*pmap_clnt;
-	struct rpc_task	*child;
+	struct rpc_task *child;
 
-	dprintk("RPC: %4d rpc_getport(%s, %d, %d, %d)\n",
+	dprintk("RPC: %4d rpc_getport(%s, %u, %u, %d)\n",
 			task->tk_pid, clnt->cl_server,
-			map->pm_prog, map->pm_vers, map->pm_prot);
+			clnt->cl_prog, clnt->cl_vers, xprt->prot);
 
 	/* Autobind on cloned rpc clients is discouraged */
 	BUG_ON(clnt->cl_parent != clnt);
 
-	spin_lock(&pmap_lock);
-	if (map->pm_binding) {
-		rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
-		spin_unlock(&pmap_lock);
+	if (xprt_test_and_set_binding(xprt)) {
+		task->tk_status = -EACCES;	/* tell caller to check again */
+		rpc_sleep_on(&xprt->binding, task, NULL, NULL);
 		return;
 	}
-	map->pm_binding = 1;
-	spin_unlock(&pmap_lock);
+
+	/* Someone else may have bound if we slept */
+	if (xprt_bound(xprt)) {
+		task->tk_status = 0;
+		goto bailout_nofree;
+	}
+
+	map = pmap_map_alloc();
+	if (!map) {
+		task->tk_status = -ENOMEM;
+		goto bailout_nofree;
+	}
+	map->pm_prog = clnt->cl_prog;
+	map->pm_vers = clnt->cl_vers;
+	map->pm_prot = xprt->prot;
+	map->pm_port = 0;
+	map->pm_task = task;
 
 	pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot, 0);
 	if (IS_ERR(pmap_clnt)) {
 		task->tk_status = PTR_ERR(pmap_clnt);
 		goto bailout;
 	}
-	task->tk_status = 0;
 
-	/*
-	 * Note: rpc_new_child will release client after a failure.
-	 */
-	if (!(child = rpc_new_child(pmap_clnt, task)))
+	child = rpc_run_task(pmap_clnt, RPC_TASK_ASYNC, &pmap_getport_ops, map);
+	if (IS_ERR(child)) {
+		task->tk_status = -EIO;
 		goto bailout;
+	}
+	rpc_release_task(child);
 
-	/* Setup the call info struct */
-	rpc_call_setup(child, &msg, 0);
+	rpc_sleep_on(&xprt->binding, task, NULL, NULL);
 
-	/* ... and run the child task */
 	task->tk_xprt->stat.bind_count++;
-	rpc_run_child(task, child, pmap_getport_done);
 	return;
 
 bailout:
-	spin_lock(&pmap_lock);
-	map->pm_binding = 0;
-	rpc_wake_up(&map->pm_bindwait);
-	spin_unlock(&pmap_lock);
-	rpc_exit(task, -EIO);
+	pmap_map_free(map);
+bailout_nofree:
+	pmap_wake_portmap_waiters(xprt);
 }
 
 #ifdef CONFIG_ROOT_NFS
 int
 rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot)
 {
-	struct rpc_portmap map = {
+	struct portmap_args map = {
 		.pm_prog	= prog,
 		.pm_vers	= vers,
 		.pm_prot	= prot,
@@ -133,32 +183,32 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot)
 #endif
 
 static void
-pmap_getport_done(struct rpc_task *task)
+pmap_getport_done(struct rpc_task *child, void *data)
 {
-	struct rpc_clnt	*clnt = task->tk_client;
+	struct portmap_args *map = data;
+	struct rpc_task *task = map->pm_task;
 	struct rpc_xprt *xprt = task->tk_xprt;
-	struct rpc_portmap *map = clnt->cl_pmap;
-
-	dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n",
-			task->tk_pid, task->tk_status, clnt->cl_port);
+	int status = child->tk_status;
 
-	if (task->tk_status < 0) {
-		/* Make the calling task exit with an error */
+	if (status < 0) {
+		/* Portmapper not available */
 		xprt->ops->set_port(xprt, 0);
-		task->tk_action = rpc_exit_task;
-	} else if (clnt->cl_port == 0) {
-		/* Program not registered */
+		task->tk_status = status;
+	} else if (map->pm_port == 0) {
+		/* Requested RPC service wasn't registered */
 		xprt->ops->set_port(xprt, 0);
-		rpc_exit(task, -EACCES);
+		task->tk_status = -EACCES;
 	} else {
-		xprt->ops->set_port(xprt, clnt->cl_port);
+		/* Succeeded */
+		xprt->ops->set_port(xprt, map->pm_port);
 		xprt_set_bound(xprt);
-		clnt->cl_port = htons(clnt->cl_port);
+		task->tk_status = 0;
 	}
-	spin_lock(&pmap_lock);
-	map->pm_binding = 0;
-	rpc_wake_up(&map->pm_bindwait);
-	spin_unlock(&pmap_lock);
+
+	dprintk("RPC: %4d pmap_getport_done(status %d, port %u)\n",
+			child->tk_pid, child->tk_status, map->pm_port);
+
+	pmap_wake_portmap_waiters(xprt);
 }
 
 /*
@@ -172,7 +222,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay)
 		.sin_family	= AF_INET,
 		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
 	};
-	struct rpc_portmap	map = {
+	struct portmap_args	map = {
 		.pm_prog	= prog,
 		.pm_vers	= vers,
 		.pm_prot	= prot,
@@ -239,7 +289,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileg
  * XDR encode/decode functions for PMAP
  */
 static int
-xdr_encode_mapping(struct rpc_rqst *req, u32 *p, struct rpc_portmap *map)
+xdr_encode_mapping(struct rpc_rqst *req, u32 *p, struct portmap_args *map)
 {
 	dprintk("RPC: xdr_encode_mapping(%d, %d, %d, %d)\n",
 		map->pm_prog, map->pm_vers, map->pm_prot, map->pm_port);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index e239ef985ef..b45abd0743c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -928,6 +928,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc
 	xprt->last_used = jiffies;
 	xprt->cwnd = RPC_INITCWND;
 
+	rpc_init_wait_queue(&xprt->binding, "xprt_binding");
 	rpc_init_wait_queue(&xprt->pending, "xprt_pending");
 	rpc_init_wait_queue(&xprt->sending, "xprt_sending");
 	rpc_init_wait_queue(&xprt->resend, "xprt_resend");
-- 
cgit v1.2.3-70-g09d2


From 5b1eacbcd78930d976eb50a93f1779d311b553d1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:16 -0400
Subject: SUNRPC: Support for RPC child tasks no longer needed

The previous patches removed the last user of RPC child tasks, so we can
remove support for child tasks from net/sunrpc/sched.c now.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  5 ---
 net/sunrpc/sched.c           | 82 --------------------------------------------
 2 files changed, 87 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 82a91bb2236..f399c138f79 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -127,7 +127,6 @@ struct rpc_call_ops {
  */
 #define RPC_TASK_ASYNC		0x0001		/* is an async task */
 #define RPC_TASK_SWAPPER	0x0002		/* is swapping in/out */
-#define RPC_TASK_CHILD		0x0008		/* is child of other task */
 #define RPC_CALL_MAJORSEEN	0x0020		/* major timeout seen */
 #define RPC_TASK_ROOTCREDS	0x0040		/* force root creds */
 #define RPC_TASK_DYNAMIC	0x0080		/* task was kmalloc'ed */
@@ -136,7 +135,6 @@ struct rpc_call_ops {
 #define RPC_TASK_NOINTR		0x0400		/* uninterruptible task */
 
 #define RPC_IS_ASYNC(t)		((t)->tk_flags & RPC_TASK_ASYNC)
-#define RPC_IS_CHILD(t)		((t)->tk_flags & RPC_TASK_CHILD)
 #define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)
 #define RPC_DO_ROOTOVERRIDE(t)	((t)->tk_flags & RPC_TASK_ROOTCREDS)
 #define RPC_ASSASSINATED(t)	((t)->tk_flags & RPC_TASK_KILLED)
@@ -253,7 +251,6 @@ struct rpc_task *rpc_new_task(struct rpc_clnt *, int flags,
 				const struct rpc_call_ops *ops, void *data);
 struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
 				const struct rpc_call_ops *ops, void *data);
-struct rpc_task *rpc_new_child(struct rpc_clnt *, struct rpc_task *parent);
 void		rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt,
 				int flags, const struct rpc_call_ops *ops,
 				void *data);
@@ -261,8 +258,6 @@ void		rpc_release_task(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
 void		rpc_killall_tasks(struct rpc_clnt *);
 int		rpc_execute(struct rpc_task *);
-void		rpc_run_child(struct rpc_task *parent, struct rpc_task *child,
-					rpc_action action);
 void		rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
 void		rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 5c3eee76850..015ffe423a2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -44,12 +44,6 @@ static void			__rpc_default_timer(struct rpc_task *task);
 static void			rpciod_killall(void);
 static void			rpc_async_schedule(void *);
 
-/*
- * RPC tasks that create another task (e.g. for contacting the portmapper)
- * will wait on this queue for their child's completion
- */
-static RPC_WAITQ(childq, "childq");
-
 /*
  * RPC tasks sit here while waiting for conditions to improve.
  */
@@ -323,16 +317,6 @@ static void rpc_make_runnable(struct rpc_task *task)
 		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
 
-/*
- * Place a newly initialized task on the workqueue.
- */
-static inline void
-rpc_schedule_run(struct rpc_task *task)
-{
-	rpc_set_active(task);
-	rpc_make_runnable(task);
-}
-
 /*
  * Prepare for sleeping on a wait queue.
  * By always appending tasks to the list we ensure FIFO behavior.
@@ -933,72 +917,6 @@ struct rpc_task *rpc_run_task(struct rpc_clnt *clnt, int flags,
 }
 EXPORT_SYMBOL(rpc_run_task);
 
-/**
- * rpc_find_parent - find the parent of a child task.
- * @child: child task
- * @parent: parent task
- *
- * Checks that the parent task is still sleeping on the
- * queue 'childq'. If so returns a pointer to the parent.
- * Upon failure returns NULL.
- *
- * Caller must hold childq.lock
- */
-static inline struct rpc_task *rpc_find_parent(struct rpc_task *child, struct rpc_task *parent)
-{
-	struct rpc_task	*task;
-	struct list_head *le;
-
-	task_for_each(task, le, &childq.tasks[0])
-		if (task == parent)
-			return parent;
-
-	return NULL;
-}
-
-static void rpc_child_exit(struct rpc_task *child, void *calldata)
-{
-	struct rpc_task	*parent;
-
-	spin_lock_bh(&childq.lock);
-	if ((parent = rpc_find_parent(child, calldata)) != NULL) {
-		parent->tk_status = child->tk_status;
-		__rpc_wake_up_task(parent);
-	}
-	spin_unlock_bh(&childq.lock);
-}
-
-static const struct rpc_call_ops rpc_child_ops = {
-	.rpc_call_done = rpc_child_exit,
-};
-
-/*
- * Note: rpc_new_task releases the client after a failure.
- */
-struct rpc_task *
-rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent)
-{
-	struct rpc_task	*task;
-
-	task = rpc_new_task(clnt, RPC_TASK_ASYNC | RPC_TASK_CHILD, &rpc_child_ops, parent);
-	if (!task)
-		goto fail;
-	return task;
-
-fail:
-	parent->tk_status = -ENOMEM;
-	return NULL;
-}
-
-void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
-{
-	spin_lock_bh(&childq.lock);
-	/* N.B. Is it possible for the child to have already finished? */
-	__rpc_sleep_on(&childq, task, func, NULL);
-	rpc_schedule_run(child);
-	spin_unlock_bh(&childq.lock);
-}
-
 /*
  * Kill all tasks for the given client.
  * XXX: kill their descendants as well?
-- 
cgit v1.2.3-70-g09d2


From bbf7c1dd2ae2b4040b41b1065ee9b1b6905b1605 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:16 -0400
Subject: SUNRPC: Introduce transport switch callout for pluggable rpcbind

Introduce a clean transport switch API for plugging in different types of
rpcbind mechanisms.  For instance, rpcbind can cleanly replace the
existing portmapper client, or a transport can choose to implement RPC
binding any way it likes.

Test plan:
Destructive testing (unplugging the network temporarily).  Connectathon
with UDP and TCP.  NFSv2/3 and NFSv4 mounting should be carefully checked.
Probably need to rig a server where certain services aren't running, or
that returns an error for some typical operation.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h | 2 +-
 include/linux/sunrpc/xprt.h | 1 +
 net/sunrpc/clnt.c           | 3 +--
 net/sunrpc/pmap_clnt.c      | 4 ++--
 net/sunrpc/xprtsock.c       | 2 ++
 5 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 00e9dbaec9c..2e68ac0aa02 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -106,7 +106,7 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 int		rpc_shutdown_client(struct rpc_clnt *);
 int		rpc_destroy_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
-void		rpc_getport(struct rpc_task *, struct rpc_clnt *);
+void		rpc_getport(struct rpc_task *);
 int		rpc_register(u32, u32, int, unsigned short, int *);
 
 void		rpc_call_setup(struct rpc_task *, struct rpc_message *, int);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 4ce82616873..84122559fa1 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -105,6 +105,7 @@ struct rpc_xprt_ops {
 	void		(*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize);
 	int		(*reserve_xprt)(struct rpc_task *task);
 	void		(*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
+	void		(*rpcbind)(struct rpc_task *task);
 	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_task *task);
 	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cee504162a3..d003c2f5688 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -774,7 +774,6 @@ call_encode(struct rpc_task *task)
 static void
 call_bind(struct rpc_task *task)
 {
-	struct rpc_clnt	*clnt = task->tk_client;
 	struct rpc_xprt *xprt = task->tk_xprt;
 
 	dprintk("RPC: %4d call_bind (status %d)\n",
@@ -784,7 +783,7 @@ call_bind(struct rpc_task *task)
 	if (!xprt_bound(xprt)) {
 		task->tk_action = call_bind_status;
 		task->tk_timeout = xprt->bind_timeout;
-		rpc_getport(task, clnt);
+		xprt->ops->rpcbind(task);
 	}
 }
 
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 0efcbf1302a..f7b279a63ba 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -81,13 +81,13 @@ static inline void pmap_wake_portmap_waiters(struct rpc_xprt *xprt)
 /**
  * rpc_getport - obtain the port for a given RPC service on a given host
  * @task: task that is waiting for portmapper request
- * @clnt: controlling rpc_clnt
  *
  * This one can be called for an ongoing RPC request, and can be used in
  * an async (rpciod) context.
  */
-void rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
+void rpc_getport(struct rpc_task *task)
 {
+	struct rpc_clnt *clnt = task->tk_client;
 	struct rpc_xprt *xprt = task->tk_xprt;
 	struct sockaddr_in *sap = &xprt->addr;
 	struct portmap_args *map;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 123ac1e5ba1..4c98b89a5b4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1262,6 +1262,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong,
+	.rpcbind		= rpc_getport,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
@@ -1278,6 +1279,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 static struct rpc_xprt_ops xs_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xs_tcp_release_xprt,
+	.rpcbind		= rpc_getport,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
 	.buf_alloc		= rpc_malloc,
-- 
cgit v1.2.3-70-g09d2


From ed39440a2573abc926f230267000f21fa5a87822 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:17 -0400
Subject: SUNRPC: create API for getting remote peer address

Provide an API for retrieving the remote peer address without allowing
direct access to the rpc_xprt struct.

Test-plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 2e68ac0aa02..65196b03f0a 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -123,6 +123,7 @@ void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
 int		rpc_ping(struct rpc_clnt *clnt, int flags);
+size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
 
 /*
  * Helper function for NFSroot support
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index d003c2f5688..94768cf5fd5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -533,6 +533,27 @@ rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags)
 		task->tk_action = rpc_exit_task;
 }
 
+/**
+ * rpc_peeraddr - extract remote peer address from clnt's xprt
+ * @clnt: RPC client structure
+ * @buf: target buffer
+ * @size: length of target buffer
+ *
+ * Returns the number of bytes that are actually in the stored address.
+ */
+size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)
+{
+	size_t bytes;
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+
+	bytes = sizeof(xprt->addr);
+	if (bytes > bufsize)
+		bytes = bufsize;
+	memcpy(buf, &clnt->cl_xprt->addr, bytes);
+	return sizeof(xprt->addr);
+}
+EXPORT_SYMBOL(rpc_peeraddr);
+
 void
 rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize)
 {
-- 
cgit v1.2.3-70-g09d2


From 39d7bbcb5ba5e9d8d658b70903dd7939400e57db Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:18 -0400
Subject: SUNRPC: remove extraneous header inclusions

include/linux/sunrpc/clnt.h already includes include/linux/sunrpc/xprt.h.
We can remove xprt.h from source files that already include clnt.h.
Likewise include/linux/sunrpc/timer.h.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/mount_clnt.c     | 1 -
 include/linux/nfs_xdr.h | 1 -
 net/sunrpc/pmap_clnt.c  | 1 -
 net/sunrpc/sched.c      | 1 -
 net/sunrpc/timer.c      | 2 --
 5 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 445abb4d421..41274874b9a 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,7 +14,6 @@
 #include <linux/net.h>
 #include <linux/in.h>
 #include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprt.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/nfs_fs.h>
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2426b11b6cc..0f33e621892 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1,7 +1,6 @@
 #ifndef _LINUX_NFS_XDR_H
 #define _LINUX_NFS_XDR_H
 
-#include <linux/sunrpc/xprt.h>
 #include <linux/nfsacl.h>
 
 /*
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 3eee8e90727..523f0e825de 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -15,7 +15,6 @@
 #include <linux/uio.h>
 #include <linux/in.h>
 #include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprt.h>
 #include <linux/sunrpc/sched.h>
 
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 015ffe423a2..ecf366351bf 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -21,7 +21,6 @@
 #include <linux/mutex.h>
 
 #include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprt.h>
 
 #ifdef RPC_DEBUG
 #define RPCDBG_FACILITY		RPCDBG_SCHED
diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c
index bcbdf6430d5..8142fdb8a93 100644
--- a/net/sunrpc/timer.c
+++ b/net/sunrpc/timer.c
@@ -19,8 +19,6 @@
 #include <linux/unistd.h>
 
 #include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/timer.h>
 
 #define RPC_RTO_MAX (60*HZ)
 #define RPC_RTO_INIT (HZ/5)
-- 
cgit v1.2.3-70-g09d2


From edb267a688fcee5335d596752f117a30c7152e44 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:18 -0400
Subject: SUNRPC: add xprt switch API for printing formatted remote peer
 addresses

Add a new method to the transport switch API to provide a way to convert
the opaque contents of xprt->addr to a human-readable string.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h | 11 +++++++
 net/sunrpc/xprtsock.c       | 79 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 82 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 84122559fa1..8372ab8fc9b 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -51,6 +51,14 @@ struct rpc_timeout {
 	unsigned char		to_exponential;
 };
 
+enum rpc_display_format_t {
+	RPC_DISPLAY_ADDR = 0,
+	RPC_DISPLAY_PORT,
+	RPC_DISPLAY_PROTO,
+	RPC_DISPLAY_ALL,
+	RPC_DISPLAY_MAX,
+};
+
 struct rpc_task;
 struct rpc_xprt;
 struct seq_file;
@@ -103,6 +111,7 @@ struct rpc_rqst {
 
 struct rpc_xprt_ops {
 	void		(*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize);
+	char *		(*print_addr)(struct rpc_xprt *xprt, enum rpc_display_format_t format);
 	int		(*reserve_xprt)(struct rpc_task *task);
 	void		(*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task);
 	void		(*rpcbind)(struct rpc_task *task);
@@ -207,6 +216,8 @@ struct rpc_xprt {
 	void			(*old_data_ready)(struct sock *, int);
 	void			(*old_state_change)(struct sock *);
 	void			(*old_write_space)(struct sock *);
+
+	char *			address_strings[RPC_DISPLAY_MAX];
 };
 
 #define XPRT_LAST_FRAG		(1 << 0)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4c98b89a5b4..cb8e6c34e12 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -125,6 +125,47 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count)
 }
 #endif
 
+static void xs_format_peer_addresses(struct rpc_xprt *xprt)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
+	char *buf;
+
+	buf = kzalloc(20, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 20, "%u.%u.%u.%u",
+				NIPQUAD(addr->sin_addr.s_addr));
+	}
+	xprt->address_strings[RPC_DISPLAY_ADDR] = buf;
+
+	buf = kzalloc(8, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 8, "%u",
+				ntohs(addr->sin_port));
+	}
+	xprt->address_strings[RPC_DISPLAY_PORT] = buf;
+
+	if (xprt->prot == IPPROTO_UDP)
+		xprt->address_strings[RPC_DISPLAY_PROTO] = "udp";
+	else
+		xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp";
+
+	buf = kzalloc(48, GFP_KERNEL);
+	if (buf) {
+		snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s",
+			NIPQUAD(addr->sin_addr.s_addr),
+			ntohs(addr->sin_port),
+			xprt->prot == IPPROTO_UDP ? "udp" : "tcp");
+	}
+	xprt->address_strings[RPC_DISPLAY_ALL] = buf;
+}
+
+static void xs_free_peer_addresses(struct rpc_xprt *xprt)
+{
+	kfree(xprt->address_strings[RPC_DISPLAY_ADDR]);
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+	kfree(xprt->address_strings[RPC_DISPLAY_ALL]);
+}
+
 #define XS_SENDMSG_FLAGS	(MSG_DONTWAIT | MSG_NOSIGNAL)
 
 static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len)
@@ -490,6 +531,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
 
 	xprt_disconnect(xprt);
 	xs_close(xprt);
+	xs_free_peer_addresses(xprt);
 	kfree(xprt->slot);
 }
 
@@ -964,6 +1006,19 @@ static unsigned short xs_get_random_port(void)
 	return rand + xprt_min_resvport;
 }
 
+/**
+ * xs_print_peer_address - format an IPv4 address for printing
+ * @xprt: generic transport
+ * @format: flags field indicating which parts of the address to render
+ */
+static char *xs_print_peer_address(struct rpc_xprt *xprt, enum rpc_display_format_t format)
+{
+	if (xprt->address_strings[format] != NULL)
+		return xprt->address_strings[format];
+	else
+		return "unprintable";
+}
+
 /**
  * xs_set_port - reset the port number in the remote endpoint address
  * @xprt: generic transport
@@ -1019,8 +1074,6 @@ static void xs_udp_connect_worker(void *args)
 	if (xprt->shutdown || !xprt_bound(xprt))
 		goto out;
 
-	dprintk("RPC:      xs_udp_connect_worker for xprt %p\n", xprt);
-
 	/* Start by resetting any existing state */
 	xs_close(xprt);
 
@@ -1034,6 +1087,9 @@ static void xs_udp_connect_worker(void *args)
 		goto out;
 	}
 
+	dprintk("RPC:      worker connecting xprt %p to address: %s\n",
+			xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
+
 	if (!xprt->inet) {
 		struct sock *sk = sock->sk;
 
@@ -1102,8 +1158,6 @@ static void xs_tcp_connect_worker(void *args)
 	if (xprt->shutdown || !xprt_bound(xprt))
 		goto out;
 
-	dprintk("RPC:      xs_tcp_connect_worker for xprt %p\n", xprt);
-
 	if (!xprt->sock) {
 		/* start from scratch */
 		if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
@@ -1119,6 +1173,9 @@ static void xs_tcp_connect_worker(void *args)
 		/* "close" the socket, preserving the local port */
 		xs_tcp_reuse_connection(xprt);
 
+	dprintk("RPC:      worker connecting xprt %p to address: %s\n",
+			xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
+
 	if (!xprt->inet) {
 		struct sock *sk = sock->sk;
 
@@ -1260,6 +1317,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 
 static struct rpc_xprt_ops xs_udp_ops = {
 	.set_buffer_size	= xs_udp_set_buffer_size,
+	.print_addr		= xs_print_peer_address,
 	.reserve_xprt		= xprt_reserve_xprt_cong,
 	.release_xprt		= xprt_release_xprt_cong,
 	.rpcbind		= rpc_getport,
@@ -1277,6 +1335,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
+	.print_addr		= xs_print_peer_address,
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xs_tcp_release_xprt,
 	.rpcbind		= rpc_getport,
@@ -1301,8 +1360,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 {
 	size_t slot_table_size;
 
-	dprintk("RPC:      setting up udp-ipv4 transport...\n");
-
 	xprt->max_reqs = xprt_udp_slot_table_entries;
 	slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
 	xprt->slot = kzalloc(slot_table_size, GFP_KERNEL);
@@ -1332,6 +1389,10 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	else
 		xprt_set_timeout(&xprt->timeout, 5, 5 * HZ);
 
+	xs_format_peer_addresses(xprt);
+	dprintk("RPC:      set up transport to address %s\n",
+			xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
+
 	return 0;
 }
 
@@ -1345,8 +1406,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 {
 	size_t slot_table_size;
 
-	dprintk("RPC:      setting up tcp-ipv4 transport...\n");
-
 	xprt->max_reqs = xprt_tcp_slot_table_entries;
 	slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
 	xprt->slot = kzalloc(slot_table_size, GFP_KERNEL);
@@ -1375,5 +1434,9 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	else
 		xprt_set_timeout(&xprt->timeout, 2, 60 * HZ);
 
+	xs_format_peer_addresses(xprt);
+	dprintk("RPC:      set up transport to address %s\n",
+			xs_print_peer_address(xprt, RPC_DISPLAY_ALL));
+
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From f425eba437f0051bde979ea2eef8bc875a77cd00 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:18 -0400
Subject: SUNRPC: Create API for displaying remote peer address

Provide an API for formatting the remote peer address for printing without
exposing its internal structure.  The address could be dynamic, so we
support a function call to get the address rather than reading it straight
out of a structure.

Test-plan:
Destructive testing (unplugging the network temporarily).  Probably need
to rig a server where certain services aren't running, or that returns an
error for some typical operation.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  1 +
 net/sunrpc/clnt.c           | 13 +++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 65196b03f0a..b7d47f01835 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -124,6 +124,7 @@ size_t		rpc_max_payload(struct rpc_clnt *);
 void		rpc_force_rebind(struct rpc_clnt *);
 int		rpc_ping(struct rpc_clnt *clnt, int flags);
 size_t		rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t);
+char *		rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t);
 
 /*
  * Helper function for NFSroot support
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 94768cf5fd5..e5b19e348d8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -554,6 +554,19 @@ size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)
 }
 EXPORT_SYMBOL(rpc_peeraddr);
 
+/**
+ * rpc_peeraddr2str - return remote peer address in printable format
+ * @clnt: RPC client structure
+ * @format: address format
+ *
+ */
+char *rpc_peeraddr2str(struct rpc_clnt *clnt, enum rpc_display_format_t format)
+{
+	struct rpc_xprt *xprt = clnt->cl_xprt;
+	return xprt->ops->print_addr(xprt, format);
+}
+EXPORT_SYMBOL(rpc_peeraddr2str);
+
 void
 rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize)
 {
-- 
cgit v1.2.3-70-g09d2


From c4efcb1d3e0bc76aeb9ca6301d19a5079893c6c9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:19 -0400
Subject: SUNRPC: Use "sockaddr_storage" for storing RPC client's remote peer
 address

IPv6 addresses are big (128 bytes).  Now that no RPC client consumers treat
the addr field in rpc_xprt structs as an opaque, and access it only via the
API calls, we can safely widen the field in the rpc_xprt struct to
accomodate larger addresses.

Test plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  3 ++-
 net/sunrpc/clnt.c           |  2 +-
 net/sunrpc/xprt.c           |  3 ++-
 net/sunrpc/xprtsock.c       | 15 ++++++++++-----
 4 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8372ab8fc9b..fc05cfbd580 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -134,7 +134,8 @@ struct rpc_xprt {
 	struct sock *		inet;		/* INET layer */
 
 	struct rpc_timeout	timeout;	/* timeout parms */
-	struct sockaddr_in	addr;		/* server address */
+	struct sockaddr_storage	addr;		/* server address */
+	size_t			addrlen;	/* size of server address */
 	int			prot;		/* IP protocol */
 
 	unsigned long		cong;		/* current congestion */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index e5b19e348d8..ff1e90fd81a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -550,7 +550,7 @@ size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize)
 	if (bytes > bufsize)
 		bytes = bufsize;
 	memcpy(buf, &clnt->cl_xprt->addr, bytes);
-	return sizeof(xprt->addr);
+	return xprt->addrlen;
 }
 EXPORT_SYMBOL(rpc_peeraddr);
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index b45abd0743c..4987517cc74 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -896,7 +896,8 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc
 	if ((xprt = kzalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	xprt->addr = *ap;
+	memcpy(&xprt->addr, ap, sizeof(*ap));
+	xprt->addrlen = sizeof(*ap);
 
 	switch (proto) {
 	case IPPROTO_UDP:
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index cb8e6c34e12..17179aa4c20 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -341,7 +341,7 @@ static int xs_udp_send_request(struct rpc_task *task)
 
 	req->rq_xtime = jiffies;
 	status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr,
-				sizeof(xprt->addr), xdr, req->rq_bytes_sent);
+				xprt->addrlen, xdr, req->rq_bytes_sent);
 
 	dprintk("RPC:      xs_udp_send_request(%u) = %d\n",
 			xdr->len - req->rq_bytes_sent, status);
@@ -1027,8 +1027,11 @@ static char *xs_print_peer_address(struct rpc_xprt *xprt, enum rpc_display_forma
  */
 static void xs_set_port(struct rpc_xprt *xprt, unsigned short port)
 {
+	struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr;
+
 	dprintk("RPC:      setting port for xprt %p to %u\n", xprt, port);
-	xprt->addr.sin_port = htons(port);
+
+	sap->sin_port = htons(port);
 }
 
 static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock)
@@ -1209,7 +1212,7 @@ static void xs_tcp_connect_worker(void *args)
 	xprt->stat.connect_count++;
 	xprt->stat.connect_start = jiffies;
 	status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr,
-			sizeof(xprt->addr), O_NONBLOCK);
+			xprt->addrlen, O_NONBLOCK);
 	dprintk("RPC: %p  connect status %d connected %d sock state %d\n",
 			xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
 	if (status < 0) {
@@ -1359,6 +1362,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 {
 	size_t slot_table_size;
+	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
 
 	xprt->max_reqs = xprt_udp_slot_table_entries;
 	slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
@@ -1366,7 +1370,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	if (xprt->slot == NULL)
 		return -ENOMEM;
 
-	if (ntohs(xprt->addr.sin_port) != 0)
+	if (ntohs(addr->sin_port != 0))
 		xprt_set_bound(xprt);
 	xprt->port = xs_get_random_port();
 
@@ -1405,6 +1409,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 {
 	size_t slot_table_size;
+	struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr;
 
 	xprt->max_reqs = xprt_tcp_slot_table_entries;
 	slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]);
@@ -1412,7 +1417,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 	if (xprt->slot == NULL)
 		return -ENOMEM;
 
-	if (ntohs(xprt->addr.sin_port) != 0)
+	if (ntohs(addr->sin_port) != 0)
 		xprt_set_bound(xprt);
 	xprt->port = xs_get_random_port();
 
-- 
cgit v1.2.3-70-g09d2


From 6ca948238724c945bd353f51d54ae7d285f3889f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:19 -0400
Subject: SUNRPC: Clean-up after previous patches.

Remove some unused macros related to accessing an RPC peer address

Test plan:
Compile kernel with CONFIG_NFS option enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             | 1 -
 include/linux/nfs_fs.h      | 1 -
 include/linux/sunrpc/clnt.h | 3 ---
 3 files changed, 5 deletions(-)

(limited to 'include')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 38b0e8a1aec..a516a01561b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -26,7 +26,6 @@
 #define NLM_HOST_REBIND		(60 * HZ)
 #define NLM_HOST_EXPIRE		((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
 #define NLM_HOST_COLLECT	((nrhosts > NLM_HOST_MAX)? 120 * HZ :  60 * HZ)
-#define NLM_HOST_ADDR(sv)	(&(sv)->s_nlmclnt->cl_xprt->addr)
 
 static struct nlm_host *	nlm_hosts[NLM_HOST_NRHASH];
 static unsigned long		next_gc;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 51e9bd90ded..3b5b04193fe 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -216,7 +216,6 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 #define NFS_SERVER(inode)		(NFS_SB(inode->i_sb))
 #define NFS_CLIENT(inode)		(NFS_SERVER(inode)->client)
 #define NFS_PROTO(inode)		(NFS_SERVER(inode)->nfs_client->rpc_ops)
-#define NFS_ADDR(inode)			(RPC_PEERADDR(NFS_CLIENT(inode)))
 #define NFS_COOKIEVERF(inode)		(NFS_I(inode)->cookieverf)
 #define NFS_READTIME(inode)		(NFS_I(inode)->read_cache_jiffies)
 #define NFS_CHANGE_ATTR(inode)		(NFS_I(inode)->change_attr)
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b7d47f01835..a26d69583c7 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -89,9 +89,6 @@ struct rpc_procinfo {
 	char *			p_name;		/* name of procedure */
 };
 
-#define RPC_CONGESTED(clnt)	(RPCXPRT_CONGESTED((clnt)->cl_xprt))
-#define RPC_PEERADDR(clnt)	(&(clnt)->cl_xprt->addr)
-
 #ifdef __KERNEL__
 
 struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
-- 
cgit v1.2.3-70-g09d2


From c2866763b4029411d166040306691773c12d4caf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:20 -0400
Subject: SUNRPC: use sockaddr + size when creating remote transport endpoints

Prepare for more generic transport endpoint handling needed by transports
that might use different forms of addressing, such as IPv6.

Introduce a single function call to replace the two-call
xprt_create_proto/rpc_create_client API.  Define a new rpc_create_args
structure that allows callers to pass in remote endpoint addresses of
varying length.

Test-plan:
Compile kernel with CONFIG_NFS enabled.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h | 22 +++++++++++++
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/clnt.c           | 61 ++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c           | 75 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 159 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index a26d69583c7..7817ba82f1b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -97,6 +97,28 @@ struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
 struct rpc_clnt *rpc_new_client(struct rpc_xprt *xprt, char *servname,
 				struct rpc_program *info,
 				u32 version, rpc_authflavor_t authflavor);
+
+struct rpc_create_args {
+	int			protocol;
+	struct sockaddr		*address;
+	size_t			addrsize;
+	struct rpc_timeout	*timeout;
+	char			*servername;
+	struct rpc_program	*program;
+	u32			version;
+	rpc_authflavor_t	authflavor;
+	unsigned long		flags;
+};
+
+/* Values for "flags" field */
+#define RPC_CLNT_CREATE_HARDRTRY	(1UL << 0)
+#define RPC_CLNT_CREATE_INTR		(1UL << 1)
+#define RPC_CLNT_CREATE_AUTOBIND	(1UL << 2)
+#define RPC_CLNT_CREATE_ONESHOT		(1UL << 3)
+#define RPC_CLNT_CREATE_NONPRIVPORT	(1UL << 4)
+#define RPC_CLNT_CREATE_NOPING		(1UL << 5)
+
+struct rpc_clnt *rpc_create(struct rpc_create_args *args);
 struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 				struct rpc_program *, int);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index fc05cfbd580..bc80fcfdd89 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -237,6 +237,7 @@ void			xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long
 /*
  * Generic internal transport functions
  */
+struct rpc_xprt *	xprt_create_transport(int proto, struct sockaddr *addr, size_t size, struct rpc_timeout *toparms);
 void			xprt_connect(struct rpc_task *task);
 void			xprt_reserve(struct rpc_task *task);
 int			xprt_reserve_xprt(struct rpc_task *task);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index ff1e90fd81a..dbb93bdf6cc 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -192,6 +192,67 @@ out_no_xprt:
 	return ERR_PTR(err);
 }
 
+/*
+ * rpc_create - create an RPC client and transport with one call
+ * @args: rpc_clnt create argument structure
+ *
+ * Creates and initializes an RPC transport and an RPC client.
+ *
+ * It can ping the server in order to determine if it is up, and to see if
+ * it supports this program and version.  RPC_CLNT_CREATE_NOPING disables
+ * this behavior so asynchronous tasks can also use rpc_create.
+ */
+struct rpc_clnt *rpc_create(struct rpc_create_args *args)
+{
+	struct rpc_xprt *xprt;
+	struct rpc_clnt *clnt;
+
+	xprt = xprt_create_transport(args->protocol, args->address,
+					args->addrsize, args->timeout);
+	if (IS_ERR(xprt))
+		return (struct rpc_clnt *)xprt;
+
+	/*
+	 * By default, kernel RPC client connects from a reserved port.
+	 * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters,
+	 * but it is always enabled for rpciod, which handles the connect
+	 * operation.
+	 */
+	xprt->resvport = 1;
+	if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
+		xprt->resvport = 0;
+
+	dprintk("RPC:       creating %s client for %s (xprt %p)\n",
+		args->program->name, args->servername, xprt);
+
+	clnt = rpc_new_client(xprt, args->servername, args->program,
+				args->version, args->authflavor);
+	if (IS_ERR(clnt))
+		return clnt;
+
+	if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
+		int err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+		if (err != 0) {
+			rpc_shutdown_client(clnt);
+			return ERR_PTR(err);
+		}
+	}
+
+	clnt->cl_softrtry = 1;
+	if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
+		clnt->cl_softrtry = 0;
+
+	if (args->flags & RPC_CLNT_CREATE_INTR)
+		clnt->cl_intr = 1;
+	if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
+		clnt->cl_autobind = 1;
+	if (args->flags & RPC_CLNT_CREATE_ONESHOT)
+		clnt->cl_oneshot = 1;
+
+	return clnt;
+}
+EXPORT_SYMBOL(rpc_create);
+
 /**
  * Create an RPC client
  * @xprt - pointer to xprt struct
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 4987517cc74..17f56cfe241 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -887,6 +887,81 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long i
 	to->to_exponential = 0;
 }
 
+/**
+ * xprt_create_transport - create an RPC transport
+ * @proto: requested transport protocol
+ * @ap: remote peer address
+ * @size: length of address
+ * @to: timeout parameters
+ *
+ */
+struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t size, struct rpc_timeout *to)
+{
+	int result;
+	struct rpc_xprt	*xprt;
+	struct rpc_rqst	*req;
+
+	if ((xprt = kzalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) {
+		dprintk("RPC:      xprt_create_transport: no memory\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	if (size <= sizeof(xprt->addr)) {
+		memcpy(&xprt->addr, ap, size);
+		xprt->addrlen = size;
+	} else {
+		kfree(xprt);
+		dprintk("RPC:      xprt_create_transport: address too large\n");
+		return ERR_PTR(-EBADF);
+	}
+
+	switch (proto) {
+	case IPPROTO_UDP:
+		result = xs_setup_udp(xprt, to);
+		break;
+	case IPPROTO_TCP:
+		result = xs_setup_tcp(xprt, to);
+		break;
+	default:
+		printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
+				proto);
+		return ERR_PTR(-EIO);
+	}
+	if (result) {
+		kfree(xprt);
+		dprintk("RPC:      xprt_create_transport: failed, %d\n", result);
+		return ERR_PTR(result);
+	}
+
+	spin_lock_init(&xprt->transport_lock);
+	spin_lock_init(&xprt->reserve_lock);
+
+	INIT_LIST_HEAD(&xprt->free);
+	INIT_LIST_HEAD(&xprt->recv);
+	INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt);
+	init_timer(&xprt->timer);
+	xprt->timer.function = xprt_init_autodisconnect;
+	xprt->timer.data = (unsigned long) xprt;
+	xprt->last_used = jiffies;
+	xprt->cwnd = RPC_INITCWND;
+
+	rpc_init_wait_queue(&xprt->binding, "xprt_binding");
+	rpc_init_wait_queue(&xprt->pending, "xprt_pending");
+	rpc_init_wait_queue(&xprt->sending, "xprt_sending");
+	rpc_init_wait_queue(&xprt->resend, "xprt_resend");
+	rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
+
+	/* initialize free list */
+	for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--)
+		list_add(&req->rq_list, &xprt->free);
+
+	xprt_init_xid(xprt);
+
+	dprintk("RPC:      created transport %p with %u slots\n", xprt,
+			xprt->max_reqs);
+
+	return xprt;
+}
+
 static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to)
 {
 	int result;
-- 
cgit v1.2.3-70-g09d2


From ff9aa5e56df60cc8565a93cc868fe25ae3f20e49 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:21 -0400
Subject: SUNRPC: Eliminate xprt_create_proto and rpc_create_client

The two function call API for creating a new RPC client is now obsolete.
Remove it.

Also, remove an unnecessary check to see whether the caller is capable of
using privileged network services.  The kernel RPC client always uses a
privileged ephemeral port by default; callers are responsible for checking
the authority of users to make use of any RPC service, or for specifying
that a nonprivileged port is acceptable.

Test plan:
Repeated runs of Connectathon locking suite.  Check network trace to ensure
correctness of NLM requests and replies.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  7 ----
 include/linux/sunrpc/xprt.h |  1 -
 net/sunrpc/clnt.c           | 42 +-----------------------
 net/sunrpc/sunrpc_syms.c    |  3 --
 net/sunrpc/xprt.c           | 79 ---------------------------------------------
 net/sunrpc/xprtsock.c       |  2 --
 6 files changed, 1 insertion(+), 133 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 7817ba82f1b..f6d1d646ce0 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -91,13 +91,6 @@ struct rpc_procinfo {
 
 #ifdef __KERNEL__
 
-struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
-				struct rpc_program *info,
-				u32 version, rpc_authflavor_t authflavor);
-struct rpc_clnt *rpc_new_client(struct rpc_xprt *xprt, char *servname,
-				struct rpc_program *info,
-				u32 version, rpc_authflavor_t authflavor);
-
 struct rpc_create_args {
 	int			protocol;
 	struct sockaddr		*address;
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index bc80fcfdd89..de4efea7c85 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -231,7 +231,6 @@ struct rpc_xprt {
 /*
  * Transport operations used by ULPs
  */
-struct rpc_xprt *	xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *to);
 void			xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr);
 
 /*
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index dbb93bdf6cc..428704dd5b3 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,17 +97,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
 	}
 }
 
-/*
- * Create an RPC client
- * FIXME: This should also take a flags argument (as in task->tk_flags).
- * It's called (among others) from pmap_create_client, which may in
- * turn be called by an async task. In this case, rpciod should not be
- * made to sleep too long.
- */
-struct rpc_clnt *
-rpc_new_client(struct rpc_xprt *xprt, char *servname,
-		  struct rpc_program *program, u32 vers,
-		  rpc_authflavor_t flavor)
+static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, struct rpc_program *program, u32 vers, rpc_authflavor_t flavor)
 {
 	struct rpc_version	*version;
 	struct rpc_clnt		*clnt = NULL;
@@ -253,36 +243,6 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
 }
 EXPORT_SYMBOL(rpc_create);
 
-/**
- * Create an RPC client
- * @xprt - pointer to xprt struct
- * @servname - name of server
- * @info - rpc_program
- * @version - rpc_program version
- * @authflavor - rpc_auth flavour to use
- *
- * Creates an RPC client structure, then pings the server in order to
- * determine if it is up, and if it supports this program and version.
- *
- * This function should never be called by asynchronous tasks such as
- * the portmapper.
- */
-struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
-		struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
-{
-	struct rpc_clnt *clnt;
-	int err;
-	
-	clnt = rpc_new_client(xprt, servname, info, version, authflavor);
-	if (IS_ERR(clnt))
-		return clnt;
-	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
-	if (err == 0)
-		return clnt;
-	rpc_shutdown_client(clnt);
-	return ERR_PTR(err);
-}
-
 /*
  * This function clones the RPC client structure. It allows us to share the
  * same transport while varying parameters such as the authentication
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index f38f939ce95..26c0531d7e2 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -36,8 +36,6 @@ EXPORT_SYMBOL(rpc_wake_up_status);
 EXPORT_SYMBOL(rpc_release_task);
 
 /* RPC client functions */
-EXPORT_SYMBOL(rpc_create_client);
-EXPORT_SYMBOL(rpc_new_client);
 EXPORT_SYMBOL(rpc_clone_client);
 EXPORT_SYMBOL(rpc_bind_new_program);
 EXPORT_SYMBOL(rpc_destroy_client);
@@ -57,7 +55,6 @@ EXPORT_SYMBOL(rpc_queue_upcall);
 EXPORT_SYMBOL(rpc_mkpipe);
 
 /* Client transport */
-EXPORT_SYMBOL(xprt_create_proto);
 EXPORT_SYMBOL(xprt_set_timeout);
 
 /* Client credential cache */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 17f56cfe241..e4f64fb58ff 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -962,85 +962,6 @@ struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t si
 	return xprt;
 }
 
-static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to)
-{
-	int result;
-	struct rpc_xprt	*xprt;
-	struct rpc_rqst	*req;
-
-	if ((xprt = kzalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	memcpy(&xprt->addr, ap, sizeof(*ap));
-	xprt->addrlen = sizeof(*ap);
-
-	switch (proto) {
-	case IPPROTO_UDP:
-		result = xs_setup_udp(xprt, to);
-		break;
-	case IPPROTO_TCP:
-		result = xs_setup_tcp(xprt, to);
-		break;
-	default:
-		printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n",
-				proto);
-		result = -EIO;
-		break;
-	}
-	if (result) {
-		kfree(xprt);
-		return ERR_PTR(result);
-	}
-
-	spin_lock_init(&xprt->transport_lock);
-	spin_lock_init(&xprt->reserve_lock);
-
-	INIT_LIST_HEAD(&xprt->free);
-	INIT_LIST_HEAD(&xprt->recv);
-	INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt);
-	init_timer(&xprt->timer);
-	xprt->timer.function = xprt_init_autodisconnect;
-	xprt->timer.data = (unsigned long) xprt;
-	xprt->last_used = jiffies;
-	xprt->cwnd = RPC_INITCWND;
-
-	rpc_init_wait_queue(&xprt->binding, "xprt_binding");
-	rpc_init_wait_queue(&xprt->pending, "xprt_pending");
-	rpc_init_wait_queue(&xprt->sending, "xprt_sending");
-	rpc_init_wait_queue(&xprt->resend, "xprt_resend");
-	rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog");
-
-	/* initialize free list */
-	for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--)
-		list_add(&req->rq_list, &xprt->free);
-
-	xprt_init_xid(xprt);
-
-	dprintk("RPC:      created transport %p with %u slots\n", xprt,
-			xprt->max_reqs);
-	
-	return xprt;
-}
-
-/**
- * xprt_create_proto - create an RPC client transport
- * @proto: requested transport protocol
- * @sap: remote peer's address
- * @to: timeout parameters for new transport
- *
- */
-struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to)
-{
-	struct rpc_xprt	*xprt;
-
-	xprt = xprt_setup(proto, sap, to);
-	if (IS_ERR(xprt))
-		dprintk("RPC:      xprt_create_proto failed\n");
-	else
-		dprintk("RPC:      xprt_create_proto created xprt %p\n", xprt);
-	return xprt;
-}
-
 /**
  * xprt_destroy - destroy an RPC transport, killing off all requests.
  * @xprt: transport to destroy
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 17179aa4c20..0b84fab68d7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1376,7 +1376,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 
 	xprt->prot = IPPROTO_UDP;
 	xprt->tsh_size = 0;
-	xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
 	/* XXX: header size can vary due to auth type, IPv6, etc. */
 	xprt->max_payload = (1U << 16) - (MAX_HEADER << 3);
 
@@ -1423,7 +1422,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to)
 
 	xprt->prot = IPPROTO_TCP;
 	xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32);
-	xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0;
 	xprt->max_payload = RPC_MAX_FRAGMENT_SIZE;
 
 	INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt);
-- 
cgit v1.2.3-70-g09d2


From 4f390c152bc87165da4b1f5b7d870b46fb106d4e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:22 -0400
Subject: NFS: Fix double d_drop in nfs_instantiate() error path

If the LOOKUP or GETATTR in nfs_instantiate fail, nfs_instantiate will do a
d_drop before returning.  But some callers already do a d_drop in the case
of an error return.  Make certain we do only one d_drop in all error paths.

This issue was introduced because over time, the symlink proc API diverged
slightly from the create/mkdir/mknod proc API.  To prevent other coding
mistakes of this type, change the symlink proc API to be more like
create/mkdir/mknod and move the nfs_instantiate call into the symlink proc
routines so it is used in exactly the same way for create, mkdir, mknod,
and symlink.

Test plan:
Connectathon, all versions of NFS.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            | 16 ++++------------
 fs/nfs/nfs3proc.c       | 26 ++++++++++++++++----------
 fs/nfs/nfs4proc.c       | 31 ++++++++++++++++---------------
 fs/nfs/proc.c           | 29 +++++++++++++++++++++--------
 include/linux/nfs_xdr.h |  5 ++---
 5 files changed, 59 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 084e8cb41c8..affd3ae52e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1147,23 +1147,20 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
 		struct inode *dir = dentry->d_parent->d_inode;
 		error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
 		if (error)
-			goto out_err;
+			return error;
 	}
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		struct nfs_server *server = NFS_SB(dentry->d_sb);
 		error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr);
 		if (error < 0)
-			goto out_err;
+			return error;
 	}
 	inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
 	error = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out_err;
+		return error;
 	d_instantiate(dentry, inode);
 	return 0;
-out_err:
-	d_drop(dentry);
-	return error;
 }
 
 /*
@@ -1448,8 +1445,6 @@ static int
 nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
 	struct iattr attr;
-	struct nfs_fattr sym_attr;
-	struct nfs_fh sym_fh;
 	struct qstr qsymname;
 	int error;
 
@@ -1473,12 +1468,9 @@ dentry->d_parent->d_name.name, dentry->d_name.name);
 
 	lock_kernel();
 	nfs_begin_data_update(dir);
-	error = NFS_PROTO(dir)->symlink(dir, &dentry->d_name, &qsymname,
-					  &attr, &sym_fh, &sym_attr);
+	error = NFS_PROTO(dir)->symlink(dir, dentry, &qsymname, &attr);
 	nfs_end_data_update(dir);
 	if (!error)
-		error = nfs_instantiate(dentry, &sym_fh, &sym_attr);
-	else
 		d_drop(dentry);
 	unlock_kernel();
 	return error;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 9e8258ece6f..d85ac427c32 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -544,23 +544,23 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 }
 
 static int
-nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
-		  struct iattr *sattr, struct nfs_fh *fhandle,
-		  struct nfs_fattr *fattr)
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
+		  struct iattr *sattr)
 {
-	struct nfs_fattr	dir_attr;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr, dir_attr;
 	struct nfs3_symlinkargs	arg = {
 		.fromfh		= NFS_FH(dir),
-		.fromname	= name->name,
-		.fromlen	= name->len,
+		.fromname	= dentry->d_name.name,
+		.fromlen	= dentry->d_name.len,
 		.topath		= path->name,
 		.tolen		= path->len,
 		.sattr		= sattr
 	};
 	struct nfs3_diropres	res = {
 		.dir_attr	= &dir_attr,
-		.fh		= fhandle,
-		.fattr		= fattr
+		.fh		= &fhandle,
+		.fattr		= &fattr
 	};
 	struct rpc_message msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_SYMLINK],
@@ -571,11 +571,17 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 
 	if (path->len > NFS3_MAXPATHLEN)
 		return -ENAMETOOLONG;
-	dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
+
+	dprintk("NFS call  symlink %s -> %s\n", dentry->d_name.name,
+			path->name);
 	nfs_fattr_init(&dir_attr);
-	nfs_fattr_init(fattr);
+	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_post_op_update_inode(dir, &dir_attr);
+	if (status != 0)
+		goto out;
+	status = nfs_instantiate(dentry, &fhandle, &fattr);
+out:
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a825547e821..2d18eac6bee 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2084,24 +2084,24 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
 	return err;
 }
 
-static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
-		struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct qstr *path, struct iattr *sattr)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs_fattr dir_fattr;
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr, dir_fattr;
 	struct nfs4_create_arg arg = {
 		.dir_fh = NFS_FH(dir),
 		.server = server,
-		.name = name,
+		.name = &dentry->d_name,
 		.attrs = sattr,
 		.ftype = NF4LNK,
 		.bitmask = server->attr_bitmask,
 	};
 	struct nfs4_create_res res = {
 		.server = server,
-		.fh = fhandle,
-		.fattr = fattr,
+		.fh = &fhandle,
+		.fattr = &fattr,
 		.dir_fattr = &dir_fattr,
 	};
 	struct rpc_message msg = {
@@ -2113,27 +2113,28 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
 
 	if (path->len > NFS4_MAXPATHLEN)
 		return -ENAMETOOLONG;
+
 	arg.u.symlink = path;
-	nfs_fattr_init(fattr);
+	nfs_fattr_init(&fattr);
 	nfs_fattr_init(&dir_fattr);
 	
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
-	if (!status)
+	if (!status) {
 		update_changeattr(dir, &res.dir_cinfo);
-	nfs_post_op_update_inode(dir, res.dir_fattr);
+		nfs_post_op_update_inode(dir, res.dir_fattr);
+		status = nfs_instantiate(dentry, &fhandle, &fattr);
+	}
 	return status;
 }
 
-static int nfs4_proc_symlink(struct inode *dir, struct qstr *name,
-		struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
+		struct qstr *path, struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_symlink(dir, name, path, sattr,
-					fhandle, fattr),
+				_nfs4_proc_symlink(dir, dentry, path, sattr),
 				&exception);
 	} while (exception.retry);
 	return err;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 5a8b9407ee9..0b507bf0f33 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -425,14 +425,15 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 }
 
 static int
-nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
-		 struct iattr *sattr, struct nfs_fh *fhandle,
-		 struct nfs_fattr *fattr)
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
+		 struct iattr *sattr)
 {
+	struct nfs_fh fhandle;
+	struct nfs_fattr fattr;
 	struct nfs_symlinkargs	arg = {
 		.fromfh		= NFS_FH(dir),
-		.fromname	= name->name,
-		.fromlen	= name->len,
+		.fromname	= dentry->d_name.name,
+		.fromlen	= dentry->d_name.len,
 		.topath		= path->name,
 		.tolen		= path->len,
 		.sattr		= sattr
@@ -445,11 +446,23 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path,
 
 	if (path->len > NFS2_MAXPATHLEN)
 		return -ENAMETOOLONG;
-	dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
-	nfs_fattr_init(fattr);
-	fhandle->size = 0;
+
+	dprintk("NFS call  symlink %s -> %s\n", dentry->d_name.name,
+			path->name);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
+
+	/*
+	 * V2 SYMLINK requests don't return any attributes.  Setting the
+	 * filehandle size to zero indicates to nfs_instantiate that it
+	 * should fill in the data with a LOOKUP call on the wire.
+	 */
+	if (status == 0) {
+		nfs_fattr_init(&fattr);
+		fhandle.size = 0;
+		status = nfs_instantiate(dentry, &fhandle, &fattr);
+	}
+
 	dprintk("NFS reply symlink: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 0f33e621892..ddf5d75e97a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -793,9 +793,8 @@ struct nfs_rpc_ops {
 	int	(*rename)  (struct inode *, struct qstr *,
 			    struct inode *, struct qstr *);
 	int	(*link)    (struct inode *, struct inode *, struct qstr *);
-	int	(*symlink) (struct inode *, struct qstr *, struct qstr *,
-			    struct iattr *, struct nfs_fh *,
-			    struct nfs_fattr *);
+	int	(*symlink) (struct inode *, struct dentry *, struct qstr *,
+			    struct iattr *);
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, struct qstr *);
 	int	(*readdir) (struct dentry *, struct rpc_cred *,
-- 
cgit v1.2.3-70-g09d2


From 94a6d75320b3681e6e728b70e18bd186cb55e682 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 22 Aug 2006 20:06:23 -0400
Subject: NFS: Use cached page as buffer for NFS symlink requests

Now that we have a copy of the symlink path in the page cache, we can pass
a struct page down to the XDR routines instead of a string buffer.

Test plan:
Connectathon, all NFS versions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c            |  8 +-------
 fs/nfs/nfs2xdr.c        | 21 ++++++++++++++++++---
 fs/nfs/nfs3proc.c       | 14 +++++++-------
 fs/nfs/nfs3xdr.c        |  7 +++++--
 fs/nfs/nfs4proc.c       | 12 +++++++-----
 fs/nfs/nfs4xdr.c        |  8 ++++----
 fs/nfs/proc.c           | 14 +++++++-------
 include/linux/nfs_xdr.h | 17 ++++++++++-------
 8 files changed, 59 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b483e5d206c..51328ae640d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1464,10 +1464,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	char *kaddr;
 	struct iattr attr;
 	unsigned int pathlen = strlen(symname);
-	struct qstr qsymname = {
-		.name	= symname,
-		.len	= pathlen,
-	};
 	int error;
 
 	dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
@@ -1493,10 +1489,8 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	/* XXX: eventually this will pass in {page, pathlen},
-	 *	instead of qsymname; need XDR changes for that */
 	nfs_begin_data_update(dir);
-	error = NFS_PROTO(dir)->symlink(dir, dentry, &qsymname, &attr);
+	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
 	nfs_end_data_update(dir);
 	if (error != 0) {
 		dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n",
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 67391eef6b9..b49501fc0a7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -51,7 +51,7 @@
 #define NFS_createargs_sz	(NFS_diropargs_sz+NFS_sattr_sz)
 #define NFS_renameargs_sz	(NFS_diropargs_sz+NFS_diropargs_sz)
 #define NFS_linkargs_sz		(NFS_fhandle_sz+NFS_diropargs_sz)
-#define NFS_symlinkargs_sz	(NFS_diropargs_sz+NFS_path_sz+NFS_sattr_sz)
+#define NFS_symlinkargs_sz	(NFS_diropargs_sz+1+NFS_sattr_sz)
 #define NFS_readdirargs_sz	(NFS_fhandle_sz+2)
 
 #define NFS_attrstat_sz		(1+NFS_fattr_sz)
@@ -351,11 +351,26 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
 static int
 nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
 {
+	struct xdr_buf *sndbuf = &req->rq_snd_buf;
+	size_t pad;
+
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_array(p, args->fromname, args->fromlen);
-	p = xdr_encode_array(p, args->topath, args->tolen);
+	*p++ = htonl(args->pathlen);
+	sndbuf->len = xdr_adjust_iovec(sndbuf->head, p);
+
+	xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen);
+
+	/*
+	 * xdr_encode_pages may have added a few bytes to ensure the
+	 * pathname ends on a 4-byte boundary.  Start encoding the
+	 * attributes after the pad bytes.
+	 */
+	pad = sndbuf->tail->iov_len;
+	if (pad > 0)
+		p++;
 	p = xdr_encode_sattr(p, args->sattr);
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad;
 	return 0;
 }
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d85ac427c32..f8688eaa000 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -544,8 +544,8 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 }
 
 static int
-nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
-		  struct iattr *sattr)
+nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		  unsigned int len, struct iattr *sattr)
 {
 	struct nfs_fh fhandle;
 	struct nfs_fattr fattr, dir_attr;
@@ -553,8 +553,8 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
 		.fromfh		= NFS_FH(dir),
 		.fromname	= dentry->d_name.name,
 		.fromlen	= dentry->d_name.len,
-		.topath		= path->name,
-		.tolen		= path->len,
+		.pages		= &page,
+		.pathlen	= len,
 		.sattr		= sattr
 	};
 	struct nfs3_diropres	res = {
@@ -569,11 +569,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
 	};
 	int			status;
 
-	if (path->len > NFS3_MAXPATHLEN)
+	if (len > NFS3_MAXPATHLEN)
 		return -ENAMETOOLONG;
 
-	dprintk("NFS call  symlink %s -> %s\n", dentry->d_name.name,
-			path->name);
+	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
+
 	nfs_fattr_init(&dir_attr);
 	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 0250269e975..16556fa4eff 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -56,7 +56,7 @@
 #define NFS3_writeargs_sz	(NFS3_fh_sz+5)
 #define NFS3_createargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
 #define NFS3_mkdirargs_sz	(NFS3_diropargs_sz+NFS3_sattr_sz)
-#define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+NFS3_path_sz+NFS3_sattr_sz)
+#define NFS3_symlinkargs_sz	(NFS3_diropargs_sz+1+NFS3_sattr_sz)
 #define NFS3_mknodargs_sz	(NFS3_diropargs_sz+2+NFS3_sattr_sz)
 #define NFS3_renameargs_sz	(NFS3_diropargs_sz+NFS3_diropargs_sz)
 #define NFS3_linkargs_sz		(NFS3_fh_sz+NFS3_diropargs_sz)
@@ -398,8 +398,11 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_array(p, args->fromname, args->fromlen);
 	p = xdr_encode_sattr(p, args->sattr);
-	p = xdr_encode_array(p, args->topath, args->tolen);
+	*p++ = htonl(args->pathlen);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+
+	/* Copy the page */
+	xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2d18eac6bee..7f60beb40df 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2085,7 +2085,7 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct qstr *path, struct iattr *sattr)
+		struct page *page, unsigned int len, struct iattr *sattr)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs_fh fhandle;
@@ -2111,10 +2111,11 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	};
 	int			status;
 
-	if (path->len > NFS4_MAXPATHLEN)
+	if (len > NFS4_MAXPATHLEN)
 		return -ENAMETOOLONG;
 
-	arg.u.symlink = path;
+	arg.u.symlink.pages = &page;
+	arg.u.symlink.len = len;
 	nfs_fattr_init(&fattr);
 	nfs_fattr_init(&dir_fattr);
 	
@@ -2128,13 +2129,14 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 }
 
 static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct qstr *path, struct iattr *sattr)
+		struct page *page, unsigned int len, struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_symlink(dir, dentry, path, sattr),
+				_nfs4_proc_symlink(dir, dentry, page,
+							len, sattr),
 				&exception);
 	} while (exception.retry);
 	return err;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 99926067eca..3dd413f52da 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -128,7 +128,7 @@ static int nfs4_stat_to_errno(int);
 #define decode_link_maxsz	(op_decode_hdr_maxsz + 5)
 #define encode_symlink_maxsz	(op_encode_hdr_maxsz + \
 				1 + nfs4_name_maxsz + \
-				nfs4_path_maxsz + \
+				1 + \
 				nfs4_fattr_maxsz)
 #define decode_symlink_maxsz	(op_decode_hdr_maxsz + 8)
 #define encode_create_maxsz	(op_encode_hdr_maxsz + \
@@ -673,9 +673,9 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
 
 	switch (create->ftype) {
 	case NF4LNK:
-		RESERVE_SPACE(4 + create->u.symlink->len);
-		WRITE32(create->u.symlink->len);
-		WRITEMEM(create->u.symlink->name, create->u.symlink->len);
+		RESERVE_SPACE(4);
+		WRITE32(create->u.symlink.len);
+		xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
 		break;
 
 	case NF4BLK: case NF4CHR:
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 0b507bf0f33..630e50647bb 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -425,8 +425,8 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 }
 
 static int
-nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
-		 struct iattr *sattr)
+nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
+		 unsigned int len, struct iattr *sattr)
 {
 	struct nfs_fh fhandle;
 	struct nfs_fattr fattr;
@@ -434,8 +434,8 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
 		.fromfh		= NFS_FH(dir),
 		.fromname	= dentry->d_name.name,
 		.fromlen	= dentry->d_name.len,
-		.topath		= path->name,
-		.tolen		= path->len,
+		.pages		= &page,
+		.pathlen	= len,
 		.sattr		= sattr
 	};
 	struct rpc_message msg = {
@@ -444,11 +444,11 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct qstr *path,
 	};
 	int			status;
 
-	if (path->len > NFS2_MAXPATHLEN)
+	if (len > NFS2_MAXPATHLEN)
 		return -ENAMETOOLONG;
 
-	dprintk("NFS call  symlink %s -> %s\n", dentry->d_name.name,
-			path->name);
+	dprintk("NFS call  symlink %s\n", dentry->d_name.name);
+
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
 	nfs_mark_for_revalidate(dir);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index ddf5d75e97a..dc5397d9d23 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -358,8 +358,8 @@ struct nfs_symlinkargs {
 	struct nfs_fh *		fromfh;
 	const char *		fromname;
 	unsigned int		fromlen;
-	const char *		topath;
-	unsigned int		tolen;
+	struct page **		pages;
+	unsigned int		pathlen;
 	struct iattr *		sattr;
 };
 
@@ -434,8 +434,8 @@ struct nfs3_symlinkargs {
 	struct nfs_fh *		fromfh;
 	const char *		fromname;
 	unsigned int		fromlen;
-	const char *		topath;
-	unsigned int		tolen;
+	struct page **		pages;
+	unsigned int		pathlen;
 	struct iattr *		sattr;
 };
 
@@ -533,7 +533,10 @@ struct nfs4_accessres {
 struct nfs4_create_arg {
 	u32				ftype;
 	union {
-		struct qstr *		symlink;    /* NF4LNK */
+		struct {
+			struct page **	pages;
+			unsigned int	len;
+		} symlink;   /* NF4LNK */
 		struct {
 			u32		specdata1;
 			u32		specdata2;
@@ -793,8 +796,8 @@ struct nfs_rpc_ops {
 	int	(*rename)  (struct inode *, struct qstr *,
 			    struct inode *, struct qstr *);
 	int	(*link)    (struct inode *, struct inode *, struct qstr *);
-	int	(*symlink) (struct inode *, struct dentry *, struct qstr *,
-			    struct iattr *);
+	int	(*symlink) (struct inode *, struct dentry *, struct page *,
+			    unsigned int, struct iattr *);
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, struct qstr *);
 	int	(*readdir) (struct dentry *, struct rpc_cred *,
-- 
cgit v1.2.3-70-g09d2


From 275a082fe9308e710324e26ccb5363c53d8fd45f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Aug 2006 20:06:24 -0400
Subject: Add a real API for dealing with blk_congestion_wait()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 block/ll_rw_blk.c         | 12 ++++++++++++
 fs/nfs/write.c            |  1 +
 include/linux/blkdev.h    |  1 +
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       |  9 +++++++++
 5 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index ddd9253f9d5..dcbd6ff1fa3 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2734,6 +2734,18 @@ long blk_congestion_wait(int rw, long timeout)
 
 EXPORT_SYMBOL(blk_congestion_wait);
 
+/**
+ * blk_congestion_end - wake up sleepers on a congestion queue
+ * @rw: READ or WRITE
+ */
+void blk_congestion_end(int rw)
+{
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+
 /*
  * Has to be called with the request spinlock acquired
  */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 453d44666ea..38ba5c09af0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -396,6 +396,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 out:
 	clear_bit(BDI_write_congested, &bdi->state);
 	wake_up_all(&nfs_write_congestion);
+	writeback_congestion_end();
 	return err;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index aafe82788b4..96c9040c00a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -746,6 +746,7 @@ extern void blk_queue_free_tags(request_queue_t *);
 extern int blk_queue_resize_tags(request_queue_t *, int);
 extern void blk_queue_invalidate_tags(request_queue_t *);
 extern long blk_congestion_wait(int rw, long timeout);
+extern void blk_congestion_end(int rw);
 
 extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *);
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9e38b566d0e..0422036af4e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -85,6 +85,7 @@ int wakeup_pdflush(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(void);
+void writeback_congestion_end(void);
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e630188ccc4..77a0bc4e261 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -802,6 +802,15 @@ int test_set_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL(test_set_page_writeback);
 
+/*
+ * Wakes up tasks that are being throttled due to writeback congestion
+ */
+void writeback_congestion_end(void)
+{
+	blk_congestion_end(WRITE);
+}
+EXPORT_SYMBOL(writeback_congestion_end);
+
 /*
  * Return true if any of the pages in the mapping are marged with the
  * passed tag.
-- 
cgit v1.2.3-70-g09d2


From 5dd3177ae5012c1e2ad7a9ffdbd0e0d0de2f60e4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 24 Aug 2006 01:03:05 -0400
Subject: NFSv4: Fix a use-after-free issue with the nfs server.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           | 36 +++++++++++++++++++++---------------
 fs/nfs/nfs4renewd.c       |  1 +
 fs/nfs/super.c            |  8 +++++---
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 28 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 12941a8a6d7..f1ff2aec2ca 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -164,6 +164,26 @@ error_0:
 	return NULL;
 }
 
+static void nfs4_shutdown_client(struct nfs_client *clp)
+{
+#ifdef CONFIG_NFS_V4
+	if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state))
+		nfs4_kill_renewd(clp);
+	while (!list_empty(&clp->cl_unused)) {
+		struct nfs4_state_owner *sp;
+
+		sp = list_entry(clp->cl_unused.next,
+				struct nfs4_state_owner,
+				so_list);
+		list_del(&sp->so_list);
+		kfree(sp);
+	}
+	BUG_ON(!list_empty(&clp->cl_state_owners));
+	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state))
+		nfs_idmap_delete(clp);
+#endif
+}
+
 /*
  * Destroy a shared client record
  */
@@ -171,21 +191,7 @@ static void nfs_free_client(struct nfs_client *clp)
 {
 	dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion);
 
-#ifdef CONFIG_NFS_V4
-	if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) {
-		while (!list_empty(&clp->cl_unused)) {
-			struct nfs4_state_owner *sp;
-
-			sp = list_entry(clp->cl_unused.next,
-					struct nfs4_state_owner,
-					so_list);
-			list_del(&sp->so_list);
-			kfree(sp);
-		}
-		BUG_ON(!list_empty(&clp->cl_state_owners));
-		nfs_idmap_delete(clp);
-	}
-#endif
+	nfs4_shutdown_client(clp);
 
 	/* -EIO all pending I/O */
 	if (!IS_ERR(clp->cl_rpcclient))
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index f2c893690ac..7b6df1852e7 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -121,6 +121,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
 			__FUNCTION__, (timeout + HZ - 1) / HZ);
 	cancel_delayed_work(&clp->cl_renewd);
 	schedule_delayed_work(&clp->cl_renewd, timeout);
+	set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
 	spin_unlock(&clp->cl_lock);
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 97cfb143e09..665949d2779 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -883,13 +883,15 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 		goto out_free;
 	}
 
+	if (s->s_fs_info != server) {
+		nfs_free_server(server);
+		server = NULL;
+	}
+
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		s->s_flags = flags;
-
 		nfs4_fill_super(s);
-	} else {
-		nfs_free_server(server);
 	}
 
 	mntroot = nfs4_get_root(s, &mntfh);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6d0be0efd1b..7ccfc7ef0a8 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -19,6 +19,7 @@ struct nfs_client {
 #define NFS_CS_RPCIOD		0		/* - rpciod started */
 #define NFS_CS_CALLBACK		1		/* - callback started */
 #define NFS_CS_IDMAP		2		/* - idmap started */
+#define NFS_CS_RENEWD		3		/* - renewd started */
 	struct sockaddr_in	cl_addr;	/* server identifier */
 	char *			cl_hostname;	/* hostname of server */
 	struct list_head	cl_share_link;	/* link in global client list */
-- 
cgit v1.2.3-70-g09d2


From 158998b6fe36f6acef087f574c96d44713499cc9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 24 Aug 2006 01:03:17 -0400
Subject: SUNRPC: Make rpc_mkpipe() take the parent dentry as an argument

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c                     |  6 +-----
 include/linux/sunrpc/rpc_pipe_fs.h |  2 +-
 net/sunrpc/auth_gss/auth_gss.c     |  7 ++-----
 net/sunrpc/rpc_pipe.c              | 38 +++++++++++++++++++++++---------------
 4 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index f96dfac7dc9..82ad7110a1c 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -84,7 +84,6 @@ struct idmap_hashtable {
 };
 
 struct idmap {
-	char                  idmap_path[48];
 	struct dentry        *idmap_dentry;
 	wait_queue_head_t     idmap_wq;
 	struct idmap_msg      idmap_im;
@@ -119,10 +118,7 @@ nfs_idmap_new(struct nfs_client *clp)
         if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL)
                 return -ENOMEM;
 
-	snprintf(idmap->idmap_path, sizeof(idmap->idmap_path),
-	    "%s/idmap", clp->cl_rpcclient->cl_pathname);
-
-        idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path,
+        idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
 	    idmap, &idmap_upcall_ops, 0);
         if (IS_ERR(idmap->idmap_dentry)) {
 		error = PTR_ERR(idmap->idmap_dentry);
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index a481472c948..a2eb9b4a9de 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -43,7 +43,7 @@ extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *);
 
 extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *);
 extern int rpc_rmdir(struct dentry *);
-extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags);
+extern struct dentry *rpc_mkpipe(struct dentry *, const char *, void *, struct rpc_pipe_ops *, int flags);
 extern int rpc_unlink(struct dentry *);
 extern struct vfsmount *rpc_get_mount(void);
 extern void rpc_put_mount(void);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index ef1cf5b476c..6eed3e166ba 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -88,7 +88,6 @@ struct gss_auth {
 	struct list_head upcalls;
 	struct rpc_clnt *client;
 	struct dentry *dentry;
-	char path[48];
 	spinlock_t lock;
 };
 
@@ -690,10 +689,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	if (err)
 		goto err_put_mech;
 
-	snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
-			clnt->cl_pathname,
-			gss_auth->mech->gm_name);
-	gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+	gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name,
+			clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
 	if (IS_ERR(gss_auth->dentry)) {
 		err = PTR_ERR(gss_auth->dentry);
 		goto err_put_mech;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index c21dc07f2a8..11ec12a09d7 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -621,17 +621,13 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry)
 }
 
 static struct dentry *
-rpc_lookup_negative(char *path, struct nameidata *nd)
+rpc_lookup_create(struct dentry *parent, const char *name, int len)
 {
+	struct inode *dir = parent->d_inode;
 	struct dentry *dentry;
-	struct inode *dir;
-	int error;
 
-	if ((error = rpc_lookup_parent(path, nd)) != 0)
-		return ERR_PTR(error);
-	dir = nd->dentry->d_inode;
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	dentry = lookup_one_len(nd->last.name, nd->dentry, nd->last.len);
+	dentry = lookup_one_len(name, parent, len);
 	if (IS_ERR(dentry))
 		goto out_err;
 	if (dentry->d_inode) {
@@ -642,7 +638,20 @@ rpc_lookup_negative(char *path, struct nameidata *nd)
 	return dentry;
 out_err:
 	mutex_unlock(&dir->i_mutex);
-	rpc_release_path(nd);
+	return dentry;
+}
+
+static struct dentry *
+rpc_lookup_negative(char *path, struct nameidata *nd)
+{
+	struct dentry *dentry;
+	int error;
+
+	if ((error = rpc_lookup_parent(path, nd)) != 0)
+		return ERR_PTR(error);
+	dentry = rpc_lookup_create(nd->dentry, nd->last.name, nd->last.len);
+	if (IS_ERR(dentry))
+		rpc_release_path(nd);
 	return dentry;
 }
 
@@ -701,17 +710,16 @@ rpc_rmdir(struct dentry *dentry)
 }
 
 struct dentry *
-rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags)
+rpc_mkpipe(struct dentry *parent, const char *name, void *private, struct rpc_pipe_ops *ops, int flags)
 {
-	struct nameidata nd;
 	struct dentry *dentry;
 	struct inode *dir, *inode;
 	struct rpc_inode *rpci;
 
-	dentry = rpc_lookup_negative(path, &nd);
+	dentry = rpc_lookup_create(parent, name, strlen(name));
 	if (IS_ERR(dentry))
 		return dentry;
-	dir = nd.dentry->d_inode;
+	dir = parent->d_inode;
 	inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR);
 	if (!inode)
 		goto err_dput;
@@ -726,13 +734,13 @@ rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags)
 	dget(dentry);
 out:
 	mutex_unlock(&dir->i_mutex);
-	rpc_release_path(&nd);
 	return dentry;
 err_dput:
 	dput(dentry);
 	dentry = ERR_PTR(-ENOMEM);
-	printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n",
-			__FILE__, __FUNCTION__, path, -ENOMEM);
+	printk(KERN_WARNING "%s: %s() failed to create pipe %s/%s (errno = %d)\n",
+			__FILE__, __FUNCTION__, parent->d_name.name, name,
+			-ENOMEM);
 	goto out;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6b6ca86b77b62b798cf9ca2599036420abce7796 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 5 Sep 2006 12:55:57 -0400
Subject: SUNRPC: Add refcounting to the struct rpc_xprt

In a subsequent patch, this will allow the portmapper to take a reference
to the rpc_xprt for which it is updating the port number, fixing an Oops.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h |  5 ++++-
 net/sunrpc/clnt.c           |  8 +++-----
 net/sunrpc/xprt.c           | 28 +++++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index de4efea7c85..bdeba8538c7 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -12,6 +12,7 @@
 #include <linux/uio.h>
 #include <linux/socket.h>
 #include <linux/in.h>
+#include <linux/kref.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/xdr.h>
 
@@ -129,6 +130,7 @@ struct rpc_xprt_ops {
 };
 
 struct rpc_xprt {
+	struct kref		kref;		/* Reference count */
 	struct rpc_xprt_ops *	ops;		/* transport methods */
 	struct socket *		sock;		/* BSD socket layer */
 	struct sock *		inet;		/* INET layer */
@@ -248,7 +250,8 @@ int			xprt_adjust_timeout(struct rpc_rqst *req);
 void			xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task);
 void			xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task);
 void			xprt_release(struct rpc_task *task);
-int			xprt_destroy(struct rpc_xprt *xprt);
+struct rpc_xprt *	xprt_get(struct rpc_xprt *xprt);
+void			xprt_put(struct rpc_xprt *xprt);
 
 static inline u32 *xprt_skip_transport_header(struct rpc_xprt *xprt, u32 *p)
 {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index ceadb728f0d..084a0ad5c64 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -177,7 +177,7 @@ out_no_path:
 		kfree(clnt->cl_server);
 	kfree(clnt);
 out_err:
-	xprt_destroy(xprt);
+	xprt_put(xprt);
 out_no_xprt:
 	return ERR_PTR(err);
 }
@@ -261,6 +261,7 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	atomic_set(&new->cl_users, 0);
 	new->cl_parent = clnt;
 	atomic_inc(&clnt->cl_count);
+	new->cl_xprt = xprt_get(clnt->cl_xprt);
 	/* Turn off autobind on clones */
 	new->cl_autobind = 0;
 	new->cl_oneshot = 0;
@@ -337,15 +338,12 @@ rpc_destroy_client(struct rpc_clnt *clnt)
 		rpc_rmdir(clnt->cl_dentry);
 		rpc_put_mount();
 	}
-	if (clnt->cl_xprt) {
-		xprt_destroy(clnt->cl_xprt);
-		clnt->cl_xprt = NULL;
-	}
 	if (clnt->cl_server != clnt->cl_inline_name)
 		kfree(clnt->cl_server);
 out_free:
 	rpc_free_iostats(clnt->cl_metrics);
 	clnt->cl_metrics = NULL;
+	xprt_put(clnt->cl_xprt);
 	kfree(clnt);
 	return 0;
 }
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a85f82baefc..1f786f68729 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -926,6 +926,7 @@ struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t si
 		return ERR_PTR(result);
 	}
 
+	kref_init(&xprt->kref);
 	spin_lock_init(&xprt->transport_lock);
 	spin_lock_init(&xprt->reserve_lock);
 
@@ -958,16 +959,37 @@ struct rpc_xprt *xprt_create_transport(int proto, struct sockaddr *ap, size_t si
 
 /**
  * xprt_destroy - destroy an RPC transport, killing off all requests.
- * @xprt: transport to destroy
+ * @kref: kref for the transport to destroy
  *
  */
-int xprt_destroy(struct rpc_xprt *xprt)
+static void xprt_destroy(struct kref *kref)
 {
+	struct rpc_xprt *xprt = container_of(kref, struct rpc_xprt, kref);
+
 	dprintk("RPC:      destroying transport %p\n", xprt);
 	xprt->shutdown = 1;
 	del_timer_sync(&xprt->timer);
 	xprt->ops->destroy(xprt);
 	kfree(xprt);
+}
 
-	return 0;
+/**
+ * xprt_put - release a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ */
+void xprt_put(struct rpc_xprt *xprt)
+{
+	kref_put(&xprt->kref, xprt_destroy);
+}
+
+/**
+ * xprt_get - return a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ */
+struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
+{
+	kref_get(&xprt->kref);
+	return xprt;
 }
-- 
cgit v1.2.3-70-g09d2


From 42750b04c5baa7c5ffdf0a8be2b9b320efdf069f Mon Sep 17 00:00:00 2001
From: Jaroslav Kysela <perex@suse.cz>
Date: Thu, 1 Jun 2006 18:34:01 +0200
Subject: [ALSA] Control API - TLV implementation for additional information
 like dB scale

This patch implements a TLV mechanism to transfer an additional information
like dB scale to the user space. The types might be extended in future.
Acked-by: Takashi Iwai <tiwai@suse.de>

Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/asound.h          |  9 ++++++++-
 include/sound/control.h         |  2 ++
 include/sound/tlv.h             | 43 +++++++++++++++++++++++++++++++++++++++++
 sound/core/control.c            | 41 +++++++++++++++++++++++++++++++++++++--
 sound/pci/ca0106/ca0106_mixer.c |  4 ++++
 5 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 include/sound/tlv.h

(limited to 'include')

diff --git a/include/sound/asound.h b/include/sound/asound.h
index 41885f48ad9..76a20406bd1 100644
--- a/include/sound/asound.h
+++ b/include/sound/asound.h
@@ -688,7 +688,7 @@ struct snd_timer_tread {
  *                                                                          *
  ****************************************************************************/
 
-#define SNDRV_CTL_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 3)
+#define SNDRV_CTL_VERSION		SNDRV_PROTOCOL_VERSION(2, 0, 4)
 
 struct snd_ctl_card_info {
 	int card;			/* card number */
@@ -818,6 +818,12 @@ struct snd_ctl_elem_value {
         unsigned char reserved[128-sizeof(struct timespec)];
 };
 
+struct snd_ctl_tlv {
+        unsigned int numid;	/* control element numeric identification */
+        unsigned int length;	/* in bytes aligned to 4 */
+        unsigned int tlv[0];	/* first TLV */
+};
+
 enum {
 	SNDRV_CTL_IOCTL_PVERSION = _IOR('U', 0x00, int),
 	SNDRV_CTL_IOCTL_CARD_INFO = _IOR('U', 0x01, struct snd_ctl_card_info),
@@ -831,6 +837,7 @@ enum {
 	SNDRV_CTL_IOCTL_ELEM_ADD = _IOWR('U', 0x17, struct snd_ctl_elem_info),
 	SNDRV_CTL_IOCTL_ELEM_REPLACE = _IOWR('U', 0x18, struct snd_ctl_elem_info),
 	SNDRV_CTL_IOCTL_ELEM_REMOVE = _IOWR('U', 0x19, struct snd_ctl_elem_id),
+	SNDRV_CTL_IOCTL_TLV_READ = _IOWR('U', 0x1a, struct snd_ctl_tlv),
 	SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE = _IOWR('U', 0x20, int),
 	SNDRV_CTL_IOCTL_HWDEP_INFO = _IOR('U', 0x21, struct snd_hwdep_info),
 	SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE = _IOR('U', 0x30, int),
diff --git a/include/sound/control.h b/include/sound/control.h
index 2489b1eb011..a93a58d0e68 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -42,6 +42,7 @@ struct snd_kcontrol_new {
 	snd_kcontrol_info_t *info;
 	snd_kcontrol_get_t *get;
 	snd_kcontrol_put_t *put;
+	unsigned int *tlv;
 	unsigned long private_value;
 };
 
@@ -58,6 +59,7 @@ struct snd_kcontrol {
 	snd_kcontrol_info_t *info;
 	snd_kcontrol_get_t *get;
 	snd_kcontrol_put_t *put;
+	unsigned int *tlv;
 	unsigned long private_value;
 	void *private_data;
 	void (*private_free)(struct snd_kcontrol *kcontrol);
diff --git a/include/sound/tlv.h b/include/sound/tlv.h
new file mode 100644
index 00000000000..b826e1df1da
--- /dev/null
+++ b/include/sound/tlv.h
@@ -0,0 +1,43 @@
+#ifndef __SOUND_TLV_H
+#define __SOUND_TLV_H
+
+/*
+ *  Advanced Linux Sound Architecture - ALSA - Driver
+ *  Copyright (c) 2006 by Jaroslav Kysela <perex@suse.cz>
+ *
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ */
+
+/*
+ * TLV structure is right behind the struct snd_ctl_tlv:
+ *   unsigned int type  	- see SNDRV_CTL_TLVT_*
+ *   unsigned int length
+ *   .... data aligned to sizeof(unsigned int), use
+ *        block_length = (length + (sizeof(unsigned int) - 1)) &
+ *                       ~(sizeof(unsigned int) - 1)) ....
+ */
+
+#define SNDRV_CTL_TLVT_CONTAINER 0	/* one level down - group of TLVs */
+#define SNDRV_CTL_TLVT_DB_SCALE	1       /* dB scale */
+
+#define DECLARE_TLV_DB_SCALE(name, min, step, mute) \
+unsigned int name[] = { \
+        SNDRV_CTL_TLVT_DB_SCALE, 2 * sizeof(unsigned int), \
+        (min), ((step) & 0xffff) | ((mute) ? 0x10000 : 0) \
+}
+
+#endif /* __SOUND_TLV_H */
diff --git a/sound/core/control.c b/sound/core/control.c
index bb397eaa718..e9c8854d2f7 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -241,6 +241,7 @@ struct snd_kcontrol *snd_ctl_new1(const struct snd_kcontrol_new *ncontrol,
 	kctl.info = ncontrol->info;
 	kctl.get = ncontrol->get;
 	kctl.put = ncontrol->put;
+	kctl.tlv = ncontrol->tlv;
 	kctl.private_value = ncontrol->private_value;
 	kctl.private_data = private_data;
 	return snd_ctl_new(&kctl, access);
@@ -1067,6 +1068,40 @@ static int snd_ctl_subscribe_events(struct snd_ctl_file *file, int __user *ptr)
 	return 0;
 }
 
+static int snd_ctl_tlv_read(struct snd_card *card,
+                            struct snd_ctl_tlv __user *_tlv)
+{
+	struct snd_ctl_tlv tlv;
+	struct snd_kcontrol *kctl;
+	unsigned int len;
+	int err = 0;
+
+	if (copy_from_user(&tlv, _tlv, sizeof(tlv)))
+		return -EFAULT;
+        if (tlv.length < sizeof(unsigned int) * 3)
+                return -EINVAL;
+        down_read(&card->controls_rwsem);
+        kctl = snd_ctl_find_numid(card, tlv.numid);
+        if (kctl == NULL) {
+                err = -ENOENT;
+                goto __kctl_end;
+        }
+        if (kctl->tlv == NULL) {
+                err = -ENXIO;
+                goto __kctl_end;
+        }
+        len = kctl->tlv[1] + 2 * sizeof(unsigned int);
+        if (tlv.length < len) {
+                err = -ENOMEM;
+                goto __kctl_end;
+        }
+        if (copy_to_user(_tlv->tlv, kctl->tlv, len))
+        	err = -EFAULT;
+      __kctl_end:
+        up_read(&card->controls_rwsem);
+        return err;
+}
+
 static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct snd_ctl_file *ctl;
@@ -1086,11 +1121,11 @@ static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 	case SNDRV_CTL_IOCTL_CARD_INFO:
 		return snd_ctl_card_info(card, ctl, cmd, argp);
 	case SNDRV_CTL_IOCTL_ELEM_LIST:
-		return snd_ctl_elem_list(ctl->card, argp);
+		return snd_ctl_elem_list(card, argp);
 	case SNDRV_CTL_IOCTL_ELEM_INFO:
 		return snd_ctl_elem_info_user(ctl, argp);
 	case SNDRV_CTL_IOCTL_ELEM_READ:
-		return snd_ctl_elem_read_user(ctl->card, argp);
+		return snd_ctl_elem_read_user(card, argp);
 	case SNDRV_CTL_IOCTL_ELEM_WRITE:
 		return snd_ctl_elem_write_user(ctl, argp);
 	case SNDRV_CTL_IOCTL_ELEM_LOCK:
@@ -1105,6 +1140,8 @@ static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		return snd_ctl_elem_remove(ctl, argp);
 	case SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS:
 		return snd_ctl_subscribe_events(ctl, ip);
+        case SNDRV_CTL_IOCTL_TLV_READ:
+                return snd_ctl_tlv_read(card, argp);
 	case SNDRV_CTL_IOCTL_POWER:
 		return -ENOPROTOOPT;
 	case SNDRV_CTL_IOCTL_POWER_STATE:
diff --git a/sound/pci/ca0106/ca0106_mixer.c b/sound/pci/ca0106/ca0106_mixer.c
index 146eed70dce..35309b3ed8c 100644
--- a/sound/pci/ca0106/ca0106_mixer.c
+++ b/sound/pci/ca0106/ca0106_mixer.c
@@ -70,9 +70,12 @@
 #include <sound/pcm.h>
 #include <sound/ac97_codec.h>
 #include <sound/info.h>
+#include <sound/tlv.h>
 
 #include "ca0106.h"
 
+static DECLARE_TLV_DB_SCALE(snd_ca0106_db_scale, -5150, 75, 1);
+
 static int snd_ca0106_shared_spdif_info(struct snd_kcontrol *kcontrol,
 					struct snd_ctl_elem_info *uinfo)
 {
@@ -472,6 +475,7 @@ static int snd_ca0106_i2c_volume_put(struct snd_kcontrol *kcontrol,
 	.info =	 snd_ca0106_volume_info,			\
 	.get =   snd_ca0106_volume_get,				\
 	.put =   snd_ca0106_volume_put,				\
+	.tlv =	 snd_ca0106_db_scale,				\
 	.private_value = ((chid) << 8) | (reg)			\
 }
 
-- 
cgit v1.2.3-70-g09d2


From 746d4a02e68499fc6c1f8d0c43d2271853ade181 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 23 Jun 2006 14:37:59 +0200
Subject: [ALSA] Fix disconnection of proc interface

- Add the linked list to each proc entry to enable a single-shot
  disconnection (unregister)
- Deprecate snd_info_unregister(), use snd_info_free_entry()
- Removed NULL checks of snd_info_free_entry()

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/info.h                |   7 +--
 sound/core/hwdep.c                  |   2 +-
 sound/core/info.c                   | 102 ++++++++++++++++++------------------
 sound/core/info_oss.c               |   6 +--
 sound/core/init.c                   |   7 +--
 sound/core/oss/mixer_oss.c          |   6 +--
 sound/core/oss/pcm_oss.c            |   8 ++-
 sound/core/pcm.c                    |  53 +++++++------------
 sound/core/pcm_memory.c             |   2 +-
 sound/core/rawmidi.c                |   2 +-
 sound/core/seq/oss/seq_oss.c        |   3 +-
 sound/core/seq/seq_device.c         |   2 +-
 sound/core/seq/seq_info.c           |   6 +--
 sound/core/sound.c                  |   3 +-
 sound/core/sound_oss.c              |   3 +-
 sound/core/timer.c                  |   2 +-
 sound/drivers/opl4/opl4_proc.c      |   3 +-
 sound/pci/ac97/ac97_proc.c          |  18 +++----
 sound/pci/cs46xx/dsp_spos.c         |  52 +++++++-----------
 sound/pci/cs46xx/dsp_spos_scb_lib.c |   2 +-
 sound/synth/emux/emux_proc.c        |   6 +--
 21 files changed, 123 insertions(+), 172 deletions(-)

(limited to 'include')

diff --git a/include/sound/info.h b/include/sound/info.h
index 74f6996769c..97ffc4fb996 100644
--- a/include/sound/info.h
+++ b/include/sound/info.h
@@ -71,7 +71,6 @@ struct snd_info_entry {
 	mode_t mode;
 	long size;
 	unsigned short content;
-	unsigned short disconnected: 1;
 	union {
 		struct snd_info_entry_text text;
 		struct snd_info_entry_ops *ops;
@@ -83,6 +82,8 @@ struct snd_info_entry {
 	void (*private_free)(struct snd_info_entry *entry);
 	struct proc_dir_entry *p;
 	struct mutex access;
+	struct list_head children;
+	struct list_head list;
 };
 
 #if defined(CONFIG_SND_OSSEMUL) && defined(CONFIG_PROC_FS)
@@ -122,8 +123,8 @@ int snd_info_restore_text(struct snd_info_entry * entry);
 int snd_info_card_create(struct snd_card * card);
 int snd_info_card_register(struct snd_card * card);
 int snd_info_card_free(struct snd_card * card);
+void snd_info_card_disconnect(struct snd_card * card);
 int snd_info_register(struct snd_info_entry * entry);
-int snd_info_unregister(struct snd_info_entry * entry);
 
 /* for card drivers */
 int snd_card_proc_new(struct snd_card *card, const char *name, struct snd_info_entry **entryp);
@@ -156,8 +157,8 @@ static inline void snd_info_free_entry(struct snd_info_entry * entry) { ; }
 static inline int snd_info_card_create(struct snd_card * card) { return 0; }
 static inline int snd_info_card_register(struct snd_card * card) { return 0; }
 static inline int snd_info_card_free(struct snd_card * card) { return 0; }
+static inline void snd_info_card_disconnect(struct snd_card * card) { }
 static inline int snd_info_register(struct snd_info_entry * entry) { return 0; }
-static inline int snd_info_unregister(struct snd_info_entry * entry) { return 0; }
 
 static inline int snd_card_proc_new(struct snd_card *card, const char *name,
 				    struct snd_info_entry **entryp) { return -EINVAL; }
diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c
index 8bd0dcc93eb..cbd8a63282b 100644
--- a/sound/core/hwdep.c
+++ b/sound/core/hwdep.c
@@ -497,7 +497,7 @@ static void __init snd_hwdep_proc_init(void)
 
 static void __exit snd_hwdep_proc_done(void)
 {
-	snd_info_unregister(snd_hwdep_proc_entry);
+	snd_info_free_entry(snd_hwdep_proc_entry);
 }
 #else /* !CONFIG_PROC_FS */
 #define snd_hwdep_proc_init()
diff --git a/sound/core/info.c b/sound/core/info.c
index 340332c6d97..9663b6be9c3 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -78,6 +78,7 @@ struct snd_info_private_data {
 
 static int snd_info_version_init(void);
 static int snd_info_version_done(void);
+static void snd_info_disconnect(struct snd_info_entry *entry);
 
 
 /* resize the proc r/w buffer */
@@ -304,7 +305,7 @@ static int snd_info_entry_open(struct inode *inode, struct file *file)
 	mutex_lock(&info_mutex);
 	p = PDE(inode);
 	entry = p == NULL ? NULL : (struct snd_info_entry *)p->data;
-	if (entry == NULL || entry->disconnected) {
+	if (entry == NULL || ! entry->p) {
 		mutex_unlock(&info_mutex);
 		return -ENODEV;
 	}
@@ -586,10 +587,10 @@ int __exit snd_info_done(void)
 	snd_info_version_done();
 	if (snd_proc_root) {
 #if defined(CONFIG_SND_SEQUENCER) || defined(CONFIG_SND_SEQUENCER_MODULE)
-		snd_info_unregister(snd_seq_root);
+		snd_info_free_entry(snd_seq_root);
 #endif
 #ifdef CONFIG_SND_OSSEMUL
-		snd_info_unregister(snd_oss_root);
+		snd_info_free_entry(snd_oss_root);
 #endif
 		snd_remove_proc_entry(&proc_root, snd_proc_root);
 	}
@@ -648,17 +649,28 @@ int snd_info_card_register(struct snd_card *card)
  * de-register the card proc file
  * called from init.c
  */
-int snd_info_card_free(struct snd_card *card)
+void snd_info_card_disconnect(struct snd_card *card)
 {
-	snd_assert(card != NULL, return -ENXIO);
+	snd_assert(card != NULL, return);
+	mutex_lock(&info_mutex);
 	if (card->proc_root_link) {
 		snd_remove_proc_entry(snd_proc_root, card->proc_root_link);
 		card->proc_root_link = NULL;
 	}
-	if (card->proc_root) {
-		snd_info_unregister(card->proc_root);
-		card->proc_root = NULL;
-	}
+	if (card->proc_root)
+		snd_info_disconnect(card->proc_root);
+	mutex_unlock(&info_mutex);
+}
+
+/*
+ * release the card proc file resources
+ * called from init.c
+ */
+int snd_info_card_free(struct snd_card *card)
+{
+	snd_assert(card != NULL, return -ENXIO);
+	snd_info_free_entry(card->proc_root);
+	card->proc_root = NULL;
 	return 0;
 }
 
@@ -767,6 +779,8 @@ static struct snd_info_entry *snd_info_create_entry(const char *name)
 	entry->mode = S_IFREG | S_IRUGO;
 	entry->content = SNDRV_INFO_CONTENT_TEXT;
 	mutex_init(&entry->access);
+	INIT_LIST_HEAD(&entry->children);
+	INIT_LIST_HEAD(&entry->list);
 	return entry;
 }
 
@@ -819,30 +833,35 @@ struct snd_info_entry *snd_info_create_card_entry(struct snd_card *card,
 
 EXPORT_SYMBOL(snd_info_create_card_entry);
 
-static int snd_info_dev_free_entry(struct snd_device *device)
+static void snd_info_disconnect(struct snd_info_entry *entry)
 {
-	struct snd_info_entry *entry = device->device_data;
-	snd_info_free_entry(entry);
-	return 0;
-}
+	struct list_head *p, *n;
+	struct proc_dir_entry *root;
 
-static int snd_info_dev_register_entry(struct snd_device *device)
-{
-	struct snd_info_entry *entry = device->device_data;
-	return snd_info_register(entry);
+	list_for_each_safe(p, n, &entry->children) {
+		snd_info_disconnect(list_entry(p, struct snd_info_entry, list));
+	}
+
+	if (! entry->p)
+		return;
+	list_del_init(&entry->list);
+	root = entry->parent == NULL ? snd_proc_root : entry->parent->p;
+	snd_assert(root, return);
+	snd_remove_proc_entry(root, entry->p);
+	entry->p = NULL;
 }
 
-static int snd_info_dev_disconnect_entry(struct snd_device *device)
+static int snd_info_dev_free_entry(struct snd_device *device)
 {
 	struct snd_info_entry *entry = device->device_data;
-	entry->disconnected = 1;
+	snd_info_free_entry(entry);
 	return 0;
 }
 
-static int snd_info_dev_unregister_entry(struct snd_device *device)
+static int snd_info_dev_register_entry(struct snd_device *device)
 {
 	struct snd_info_entry *entry = device->device_data;
-	return snd_info_unregister(entry);
+	return snd_info_register(entry);
 }
 
 /**
@@ -871,8 +890,7 @@ int snd_card_proc_new(struct snd_card *card, const char *name,
 	static struct snd_device_ops ops = {
 		.dev_free = snd_info_dev_free_entry,
 		.dev_register =	snd_info_dev_register_entry,
-		.dev_disconnect = snd_info_dev_disconnect_entry,
-		.dev_unregister = snd_info_dev_unregister_entry
+		/* disconnect is done via snd_info_card_disconnect() */
 	};
 	struct snd_info_entry *entry;
 	int err;
@@ -901,6 +919,11 @@ void snd_info_free_entry(struct snd_info_entry * entry)
 {
 	if (entry == NULL)
 		return;
+	if (entry->p) {
+		mutex_lock(&info_mutex);
+		snd_info_disconnect(entry);
+		mutex_unlock(&info_mutex);
+	}
 	kfree(entry->name);
 	if (entry->private_free)
 		entry->private_free(entry);
@@ -935,38 +958,14 @@ int snd_info_register(struct snd_info_entry * entry)
 	p->size = entry->size;
 	p->data = entry;
 	entry->p = p;
+	if (entry->parent)
+		list_add_tail(&entry->list, &entry->parent->children);
 	mutex_unlock(&info_mutex);
 	return 0;
 }
 
 EXPORT_SYMBOL(snd_info_register);
 
-/**
- * snd_info_unregister - de-register the info entry
- * @entry: the info entry
- *
- * De-registers the info entry and releases the instance.
- *
- * Returns zero if successful, or a negative error code on failure.
- */
-int snd_info_unregister(struct snd_info_entry * entry)
-{
-	struct proc_dir_entry *root;
-
-	if (! entry)
-		return 0;
-	snd_assert(entry->p != NULL, return -ENXIO);
-	root = entry->parent == NULL ? snd_proc_root : entry->parent->p;
-	snd_assert(root, return -ENXIO);
-	mutex_lock(&info_mutex);
-	snd_remove_proc_entry(root, entry->p);
-	mutex_unlock(&info_mutex);
-	snd_info_free_entry(entry);
-	return 0;
-}
-
-EXPORT_SYMBOL(snd_info_unregister);
-
 /*
 
  */
@@ -999,8 +998,7 @@ static int __init snd_info_version_init(void)
 
 static int __exit snd_info_version_done(void)
 {
-	if (snd_info_version_entry)
-		snd_info_unregister(snd_info_version_entry);
+	snd_info_free_entry(snd_info_version_entry);
 	return 0;
 }
 
diff --git a/sound/core/info_oss.c b/sound/core/info_oss.c
index bb2c40d0ab6..3ebc34919c7 100644
--- a/sound/core/info_oss.c
+++ b/sound/core/info_oss.c
@@ -131,10 +131,8 @@ int snd_info_minor_register(void)
 
 int snd_info_minor_unregister(void)
 {
-	if (snd_sndstat_proc_entry) {
-		snd_info_unregister(snd_sndstat_proc_entry);
-		snd_sndstat_proc_entry = NULL;
-	}
+	snd_info_free_entry(snd_sndstat_proc_entry);
+	snd_sndstat_proc_entry = NULL;
 	return 0;
 }
 
diff --git a/sound/core/init.c b/sound/core/init.c
index 4d9258884e4..1ecb029ff4c 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -310,6 +310,7 @@ int snd_card_disconnect(struct snd_card *card)
 	if (err < 0)
 		snd_printk(KERN_ERR "not all devices for card %i can be disconnected\n", card->number);
 
+	snd_info_card_disconnect(card);
 	return 0;	
 }
 
@@ -360,7 +361,7 @@ int snd_card_free(struct snd_card *card)
 	}
 	if (card->private_free)
 		card->private_free(card);
-	snd_info_unregister(card->proc_id);
+	snd_info_free_entry(card->proc_id);
 	if (snd_info_card_free(card) < 0) {
 		snd_printk(KERN_WARNING "unable to free card info\n");
 		/* Not fatal error */
@@ -625,9 +626,9 @@ int __init snd_card_info_init(void)
 
 int __exit snd_card_info_done(void)
 {
-	snd_info_unregister(snd_card_info_entry);
+	snd_info_free_entry(snd_card_info_entry);
 #ifdef MODULE
-	snd_info_unregister(snd_card_module_info_entry);
+	snd_info_free_entry(snd_card_module_info_entry);
 #endif
 	return 0;
 }
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index 75a9505c744..00c95def95a 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -1193,10 +1193,8 @@ static void snd_mixer_oss_proc_init(struct snd_mixer_oss *mixer)
 
 static void snd_mixer_oss_proc_done(struct snd_mixer_oss *mixer)
 {
-	if (mixer->proc_entry) {
-		snd_info_unregister(mixer->proc_entry);
-		mixer->proc_entry = NULL;
-	}
+	snd_info_free_entry(mixer->proc_entry);
+	mixer->proc_entry = NULL;
 }
 #else /* !CONFIG_PROC_FS */
 #define snd_mixer_oss_proc_init(mix)
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 472fce0ee0e..a92b93e5ebd 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2846,11 +2846,9 @@ static void snd_pcm_oss_proc_done(struct snd_pcm *pcm)
 	int stream;
 	for (stream = 0; stream < 2; ++stream) {
 		struct snd_pcm_str *pstr = &pcm->streams[stream];
-		if (pstr->oss.proc_entry) {
-			snd_info_unregister(pstr->oss.proc_entry);
-			pstr->oss.proc_entry = NULL;
-			snd_pcm_oss_proc_free_setup_list(pstr);
-		}
+		snd_info_free_entry(pstr->oss.proc_entry);
+		pstr->oss.proc_entry = NULL;
+		snd_pcm_oss_proc_free_setup_list(pstr);
 	}
 }
 #else /* !CONFIG_SND_VERBOSE_PROCFS */
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index 7581edd7b9f..b8602471f7e 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -494,19 +494,13 @@ static int snd_pcm_stream_proc_init(struct snd_pcm_str *pstr)
 static int snd_pcm_stream_proc_done(struct snd_pcm_str *pstr)
 {
 #ifdef CONFIG_SND_PCM_XRUN_DEBUG
-	if (pstr->proc_xrun_debug_entry) {
-		snd_info_unregister(pstr->proc_xrun_debug_entry);
-		pstr->proc_xrun_debug_entry = NULL;
-	}
+	snd_info_free_entry(pstr->proc_xrun_debug_entry);
+	pstr->proc_xrun_debug_entry = NULL;
 #endif
-	if (pstr->proc_info_entry) {
-		snd_info_unregister(pstr->proc_info_entry);
-		pstr->proc_info_entry = NULL;
-	}
-	if (pstr->proc_root) {
-		snd_info_unregister(pstr->proc_root);
-		pstr->proc_root = NULL;
-	}
+	snd_info_free_entry(pstr->proc_info_entry);
+	pstr->proc_info_entry = NULL;
+	snd_info_free_entry(pstr->proc_root);
+	pstr->proc_root = NULL;
 	return 0;
 }
 
@@ -570,29 +564,19 @@ static int snd_pcm_substream_proc_init(struct snd_pcm_substream *substream)
 
 	return 0;
 }
-		
+
 static int snd_pcm_substream_proc_done(struct snd_pcm_substream *substream)
 {
-	if (substream->proc_info_entry) {
-		snd_info_unregister(substream->proc_info_entry);
-		substream->proc_info_entry = NULL;
-	}
-	if (substream->proc_hw_params_entry) {
-		snd_info_unregister(substream->proc_hw_params_entry);
-		substream->proc_hw_params_entry = NULL;
-	}
-	if (substream->proc_sw_params_entry) {
-		snd_info_unregister(substream->proc_sw_params_entry);
-		substream->proc_sw_params_entry = NULL;
-	}
-	if (substream->proc_status_entry) {
-		snd_info_unregister(substream->proc_status_entry);
-		substream->proc_status_entry = NULL;
-	}
-	if (substream->proc_root) {
-		snd_info_unregister(substream->proc_root);
-		substream->proc_root = NULL;
-	}
+	snd_info_free_entry(substream->proc_info_entry);
+	substream->proc_info_entry = NULL;
+	snd_info_free_entry(substream->proc_hw_params_entry);
+	substream->proc_hw_params_entry = NULL;
+	snd_info_free_entry(substream->proc_sw_params_entry);
+	substream->proc_sw_params_entry = NULL;
+	snd_info_free_entry(substream->proc_status_entry);
+	substream->proc_status_entry = NULL;
+	snd_info_free_entry(substream->proc_root);
+	substream->proc_root = NULL;
 	return 0;
 }
 #else /* !CONFIG_SND_VERBOSE_PROCFS */
@@ -1090,8 +1074,7 @@ static void snd_pcm_proc_init(void)
 
 static void snd_pcm_proc_done(void)
 {
-	if (snd_pcm_proc_entry)
-		snd_info_unregister(snd_pcm_proc_entry);
+	snd_info_free_entry(snd_pcm_proc_entry);
 }
 
 #else /* !CONFIG_PROC_FS */
diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c
index 067d2056db9..be030cb4d37 100644
--- a/sound/core/pcm_memory.c
+++ b/sound/core/pcm_memory.c
@@ -101,7 +101,7 @@ int snd_pcm_lib_preallocate_free(struct snd_pcm_substream *substream)
 {
 	snd_pcm_lib_preallocate_dma_free(substream);
 #ifdef CONFIG_SND_VERBOSE_PROCFS
-	snd_info_unregister(substream->proc_prealloc_entry);
+	snd_info_free_entry(substream->proc_prealloc_entry);
 	substream->proc_prealloc_entry = NULL;
 #endif
 	return 0;
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index 8c15c66eb4a..51577c22f8c 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -1599,7 +1599,7 @@ static int snd_rawmidi_dev_unregister(struct snd_device *device)
 	mutex_lock(&register_mutex);
 	list_del(&rmidi->list);
 	if (rmidi->proc_entry) {
-		snd_info_unregister(rmidi->proc_entry);
+		snd_info_free_entry(rmidi->proc_entry);
 		rmidi->proc_entry = NULL;
 	}
 #ifdef CONFIG_SND_OSSEMUL
diff --git a/sound/core/seq/oss/seq_oss.c b/sound/core/seq/oss/seq_oss.c
index e7234135641..92858cf8b6e 100644
--- a/sound/core/seq/oss/seq_oss.c
+++ b/sound/core/seq/oss/seq_oss.c
@@ -303,8 +303,7 @@ register_proc(void)
 static void
 unregister_proc(void)
 {
-	if (info_entry)
-		snd_info_unregister(info_entry);
+	snd_info_free_entry(info_entry);
 	info_entry = NULL;
 }
 #endif /* CONFIG_PROC_FS */
diff --git a/sound/core/seq/seq_device.c b/sound/core/seq/seq_device.c
index 102ff548ce6..b85954e956d 100644
--- a/sound/core/seq/seq_device.c
+++ b/sound/core/seq/seq_device.c
@@ -573,7 +573,7 @@ static void __exit alsa_seq_device_exit(void)
 {
 	remove_drivers();
 #ifdef CONFIG_PROC_FS
-	snd_info_unregister(info_entry);
+	snd_info_free_entry(info_entry);
 #endif
 	if (num_ops)
 		snd_printk(KERN_ERR "drivers not released (%d)\n", num_ops);
diff --git a/sound/core/seq/seq_info.c b/sound/core/seq/seq_info.c
index 142e9e6882c..8a7fe5cca1c 100644
--- a/sound/core/seq/seq_info.c
+++ b/sound/core/seq/seq_info.c
@@ -64,9 +64,9 @@ int __init snd_seq_info_init(void)
 
 int __exit snd_seq_info_done(void)
 {
-	snd_info_unregister(queues_entry);
-	snd_info_unregister(clients_entry);
-	snd_info_unregister(timer_entry);
+	snd_info_free_entry(queues_entry);
+	snd_info_free_entry(clients_entry);
+	snd_info_free_entry(timer_entry);
 	return 0;
 }
 #endif
diff --git a/sound/core/sound.c b/sound/core/sound.c
index 7edd1fc58b1..b4430db3fa4 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -387,8 +387,7 @@ int __init snd_minor_info_init(void)
 
 int __exit snd_minor_info_done(void)
 {
-	if (snd_minor_info_entry)
-		snd_info_unregister(snd_minor_info_entry);
+	snd_info_free_entry(snd_minor_info_entry);
 	return 0;
 }
 #endif /* CONFIG_PROC_FS */
diff --git a/sound/core/sound_oss.c b/sound/core/sound_oss.c
index 74f0fe5a1ba..b2fc40aa520 100644
--- a/sound/core/sound_oss.c
+++ b/sound/core/sound_oss.c
@@ -270,8 +270,7 @@ int __init snd_minor_info_oss_init(void)
 
 int __exit snd_minor_info_oss_done(void)
 {
-	if (snd_minor_info_oss_entry)
-		snd_info_unregister(snd_minor_info_oss_entry);
+	snd_info_free_entry(snd_minor_info_oss_entry);
 	return 0;
 }
 #endif /* CONFIG_PROC_FS */
diff --git a/sound/core/timer.c b/sound/core/timer.c
index 0a984e881c1..52ecbe1e9ab 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -1126,7 +1126,7 @@ static void __init snd_timer_proc_init(void)
 
 static void __exit snd_timer_proc_done(void)
 {
-	snd_info_unregister(snd_timer_proc_entry);
+	snd_info_free_entry(snd_timer_proc_entry);
 }
 #else /* !CONFIG_PROC_FS */
 #define snd_timer_proc_init()
diff --git a/sound/drivers/opl4/opl4_proc.c b/sound/drivers/opl4/opl4_proc.c
index e552ec34166..11dd811771a 100644
--- a/sound/drivers/opl4/opl4_proc.c
+++ b/sound/drivers/opl4/opl4_proc.c
@@ -159,8 +159,7 @@ int snd_opl4_create_proc(struct snd_opl4 *opl4)
 
 void snd_opl4_free_proc(struct snd_opl4 *opl4)
 {
-	if (opl4->proc_entry)
-		snd_info_unregister(opl4->proc_entry);
+	snd_info_free_entry(opl4->proc_entry);
 }
 
 #endif /* CONFIG_PROC_FS */
diff --git a/sound/pci/ac97/ac97_proc.c b/sound/pci/ac97/ac97_proc.c
index 2118df50b9d..a3fdd7da911 100644
--- a/sound/pci/ac97/ac97_proc.c
+++ b/sound/pci/ac97/ac97_proc.c
@@ -457,14 +457,10 @@ void snd_ac97_proc_init(struct snd_ac97 * ac97)
 
 void snd_ac97_proc_done(struct snd_ac97 * ac97)
 {
-	if (ac97->proc_regs) {
-		snd_info_unregister(ac97->proc_regs);
-		ac97->proc_regs = NULL;
-	}
-	if (ac97->proc) {
-		snd_info_unregister(ac97->proc);
-		ac97->proc = NULL;
-	}
+	snd_info_free_entry(ac97->proc_regs);
+	ac97->proc_regs = NULL;
+	snd_info_free_entry(ac97->proc);
+	ac97->proc = NULL;
 }
 
 void snd_ac97_bus_proc_init(struct snd_ac97_bus * bus)
@@ -485,8 +481,6 @@ void snd_ac97_bus_proc_init(struct snd_ac97_bus * bus)
 
 void snd_ac97_bus_proc_done(struct snd_ac97_bus * bus)
 {
-	if (bus->proc) {
-		snd_info_unregister(bus->proc);
-		bus->proc = NULL;
-	}
+	snd_info_free_entry(bus->proc);
+	bus->proc = NULL;
 }
diff --git a/sound/pci/cs46xx/dsp_spos.c b/sound/pci/cs46xx/dsp_spos.c
index 5c9711c0265..89c402770a1 100644
--- a/sound/pci/cs46xx/dsp_spos.c
+++ b/sound/pci/cs46xx/dsp_spos.c
@@ -868,35 +868,23 @@ int cs46xx_dsp_proc_done (struct snd_cs46xx *chip)
 	struct dsp_spos_instance * ins = chip->dsp_spos_instance;
 	int i;
 
-	if (ins->proc_sym_info_entry) {
-		snd_info_unregister(ins->proc_sym_info_entry);
-		ins->proc_sym_info_entry = NULL;
-	}
-  
-	if (ins->proc_modules_info_entry) {
-		snd_info_unregister(ins->proc_modules_info_entry);
-		ins->proc_modules_info_entry = NULL;
-	}
- 
-	if (ins->proc_parameter_dump_info_entry) {
-		snd_info_unregister(ins->proc_parameter_dump_info_entry);
-		ins->proc_parameter_dump_info_entry = NULL;
-	}
-  
-	if (ins->proc_sample_dump_info_entry) {
-		snd_info_unregister(ins->proc_sample_dump_info_entry);
-		ins->proc_sample_dump_info_entry = NULL;
-	}
-  
-	if (ins->proc_scb_info_entry) {
-		snd_info_unregister(ins->proc_scb_info_entry);
-		ins->proc_scb_info_entry = NULL;
-	}
-  
-	if (ins->proc_task_info_entry) {
-		snd_info_unregister(ins->proc_task_info_entry);
-		ins->proc_task_info_entry = NULL;
-	}
+	snd_info_free_entry(ins->proc_sym_info_entry);
+	ins->proc_sym_info_entry = NULL;
+
+	snd_info_free_entry(ins->proc_modules_info_entry);
+	ins->proc_modules_info_entry = NULL;
+
+	snd_info_free_entry(ins->proc_parameter_dump_info_entry);
+	ins->proc_parameter_dump_info_entry = NULL;
+
+	snd_info_free_entry(ins->proc_sample_dump_info_entry);
+	ins->proc_sample_dump_info_entry = NULL;
+
+	snd_info_free_entry(ins->proc_scb_info_entry);
+	ins->proc_scb_info_entry = NULL;
+
+	snd_info_free_entry(ins->proc_task_info_entry);
+	ins->proc_task_info_entry = NULL;
 
 	mutex_lock(&chip->spos_mutex);
 	for (i = 0; i < ins->nscb; ++i) {
@@ -905,10 +893,8 @@ int cs46xx_dsp_proc_done (struct snd_cs46xx *chip)
 	}
 	mutex_unlock(&chip->spos_mutex);
 
-	if (ins->proc_dsp_dir) {
-		snd_info_unregister (ins->proc_dsp_dir);
-		ins->proc_dsp_dir = NULL;
-	}
+	snd_info_free_entry(ins->proc_dsp_dir);
+	ins->proc_dsp_dir = NULL;
 
 	return 0;
 }
diff --git a/sound/pci/cs46xx/dsp_spos_scb_lib.c b/sound/pci/cs46xx/dsp_spos_scb_lib.c
index 232b337852f..343f51d5311 100644
--- a/sound/pci/cs46xx/dsp_spos_scb_lib.c
+++ b/sound/pci/cs46xx/dsp_spos_scb_lib.c
@@ -233,7 +233,7 @@ void cs46xx_dsp_proc_free_scb_desc (struct dsp_scb_descriptor * scb)
 
 		snd_printdd("cs46xx_dsp_proc_free_scb_desc: freeing %s\n",scb->scb_name);
 
-		snd_info_unregister(scb->proc_info);
+		snd_info_free_entry(scb->proc_info);
 		scb->proc_info = NULL;
 
 		snd_assert (scb_info != NULL, return);
diff --git a/sound/synth/emux/emux_proc.c b/sound/synth/emux/emux_proc.c
index 58b9601f3ad..59144ec026e 100644
--- a/sound/synth/emux/emux_proc.c
+++ b/sound/synth/emux/emux_proc.c
@@ -128,10 +128,8 @@ void snd_emux_proc_init(struct snd_emux *emu, struct snd_card *card, int device)
 
 void snd_emux_proc_free(struct snd_emux *emu)
 {
-	if (emu->proc) {
-		snd_info_unregister(emu->proc);
-		emu->proc = NULL;
-	}
+	snd_info_free_entry(emu->proc);
+	emu->proc = NULL;
 }
 
 #endif /* CONFIG_PROC_FS */
-- 
cgit v1.2.3-70-g09d2


From c461482c8072bb073e6146db320d3da85cdc89ad Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 23 Jun 2006 14:38:23 +0200
Subject: [ALSA] Unregister device files at disconnection

Orignally proposed by Sam Revitch <sam.revitch@gmail.com>.
Unregister device files at disconnection to avoid the futher accesses.
Also, the dev_unregister callback is removed and replaced with the
combination of disconnect + free.
A new function snd_card_free_when_closed() is introduced, which is
used in USB disconnect callback.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/core.h        |  3 +-
 include/sound/timer.h       |  1 -
 sound/core/control.c        | 27 ++++++------------
 sound/core/device.c         | 20 ++++++-------
 sound/core/hwdep.c          | 10 +++----
 sound/core/init.c           | 69 +++++++++++++++++++++++++++++++++++----------
 sound/core/oss/mixer_oss.c  | 16 +++++------
 sound/core/oss/pcm_oss.c    | 16 +++++------
 sound/core/pcm.c            | 48 ++++++++++++-------------------
 sound/core/rawmidi.c        | 35 ++++++-----------------
 sound/core/rtctimer.c       |  2 +-
 sound/core/seq/seq_device.c | 11 --------
 sound/core/timer.c          | 52 +++++++++++++++-------------------
 sound/pci/ac97/ac97_codec.c |  8 +++---
 sound/usb/usbaudio.c        |  2 +-
 15 files changed, 148 insertions(+), 172 deletions(-)

(limited to 'include')

diff --git a/include/sound/core.h b/include/sound/core.h
index bab3ff457e4..cf4001cf624 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -71,7 +71,6 @@ struct snd_device_ops {
 	int (*dev_free)(struct snd_device *dev);
 	int (*dev_register)(struct snd_device *dev);
 	int (*dev_disconnect)(struct snd_device *dev);
-	int (*dev_unregister)(struct snd_device *dev);
 };
 
 struct snd_device {
@@ -131,6 +130,7 @@ struct snd_card {
 								state */
 	spinlock_t files_lock;		/* lock the files for this card */
 	int shutdown;			/* this card is going down */
+	int free_on_last_close;		/* free in context of file_release */
 	wait_queue_head_t shutdown_sleep;
 	struct work_struct free_workq;	/* for free in workqueue */
 	struct device *dev;
@@ -244,6 +244,7 @@ struct snd_card *snd_card_new(int idx, const char *id,
 			 struct module *module, int extra_size);
 int snd_card_disconnect(struct snd_card *card);
 int snd_card_free(struct snd_card *card);
+int snd_card_free_when_closed(struct snd_card *card);
 int snd_card_free_in_thread(struct snd_card *card);
 int snd_card_register(struct snd_card *card);
 int snd_card_info_init(void);
diff --git a/include/sound/timer.h b/include/sound/timer.h
index 5ece2bf541d..d42c083db1d 100644
--- a/include/sound/timer.h
+++ b/include/sound/timer.h
@@ -129,7 +129,6 @@ void snd_timer_notify(struct snd_timer *timer, int event, struct timespec *tstam
 int snd_timer_global_new(char *id, int device, struct snd_timer **rtimer);
 int snd_timer_global_free(struct snd_timer *timer);
 int snd_timer_global_register(struct snd_timer *timer);
-int snd_timer_global_unregister(struct snd_timer *timer);
 
 int snd_timer_open(struct snd_timer_instance **ti, char *owner, struct snd_timer_id *tid, unsigned int slave_id);
 int snd_timer_close(struct snd_timer_instance *timeri);
diff --git a/sound/core/control.c b/sound/core/control.c
index e9c8854d2f7..f0c7272a2d4 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -1375,6 +1375,11 @@ static int snd_ctl_dev_disconnect(struct snd_device *device)
 	struct snd_card *card = device->device_data;
 	struct list_head *flist;
 	struct snd_ctl_file *ctl;
+	int err, cardnum;
+
+	snd_assert(card != NULL, return -ENXIO);
+	cardnum = card->number;
+	snd_assert(cardnum >= 0 && cardnum < SNDRV_CARDS, return -ENXIO);
 
 	down_read(&card->controls_rwsem);
 	list_for_each(flist, &card->ctl_files) {
@@ -1383,6 +1388,10 @@ static int snd_ctl_dev_disconnect(struct snd_device *device)
 		kill_fasync(&ctl->fasync, SIGIO, POLL_ERR);
 	}
 	up_read(&card->controls_rwsem);
+
+	if ((err = snd_unregister_device(SNDRV_DEVICE_TYPE_CONTROL,
+					 card, -1)) < 0)
+		return err;
 	return 0;
 }
 
@@ -1403,23 +1412,6 @@ static int snd_ctl_dev_free(struct snd_device *device)
 	return 0;
 }
 
-/*
- * de-registration of the control device
- */
-static int snd_ctl_dev_unregister(struct snd_device *device)
-{
-	struct snd_card *card = device->device_data;
-	int err, cardnum;
-
-	snd_assert(card != NULL, return -ENXIO);
-	cardnum = card->number;
-	snd_assert(cardnum >= 0 && cardnum < SNDRV_CARDS, return -ENXIO);
-	if ((err = snd_unregister_device(SNDRV_DEVICE_TYPE_CONTROL,
-					 card, -1)) < 0)
-		return err;
-	return snd_ctl_dev_free(device);
-}
-
 /*
  * create control core:
  * called from init.c
@@ -1430,7 +1422,6 @@ int snd_ctl_create(struct snd_card *card)
 		.dev_free = snd_ctl_dev_free,
 		.dev_register =	snd_ctl_dev_register,
 		.dev_disconnect = snd_ctl_dev_disconnect,
-		.dev_unregister = snd_ctl_dev_unregister
 	};
 
 	snd_assert(card != NULL, return -ENXIO);
diff --git a/sound/core/device.c b/sound/core/device.c
index 6ce4da4a108..ccb25816ac9 100644
--- a/sound/core/device.c
+++ b/sound/core/device.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(snd_device_new);
  * @device_data: the data pointer to release
  *
  * Removes the device from the list on the card and invokes the
- * callback, dev_unregister or dev_free, corresponding to the state.
+ * callbacks, dev_disconnect and dev_free, corresponding to the state.
  * Then release the device.
  *
  * Returns zero if successful, or a negative error code on failure or if the
@@ -90,16 +90,14 @@ int snd_device_free(struct snd_card *card, void *device_data)
 			continue;
 		/* unlink */
 		list_del(&dev->list);
-		if ((dev->state == SNDRV_DEV_REGISTERED ||
-		     dev->state == SNDRV_DEV_DISCONNECTED) &&
-		    dev->ops->dev_unregister) {
-			if (dev->ops->dev_unregister(dev))
-				snd_printk(KERN_ERR "device unregister failure\n");
-		} else {
-			if (dev->ops->dev_free) {
-				if (dev->ops->dev_free(dev))
-					snd_printk(KERN_ERR "device free failure\n");
-			}
+		if (dev->state == SNDRV_DEV_REGISTERED &&
+		    dev->ops->dev_disconnect)
+			if (dev->ops->dev_disconnect(dev))
+				snd_printk(KERN_ERR
+					   "device disconnect failure\n");
+		if (dev->ops->dev_free) {
+			if (dev->ops->dev_free(dev))
+				snd_printk(KERN_ERR "device free failure\n");
 		}
 		kfree(dev);
 		return 0;
diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c
index cbd8a63282b..9aa9d94891f 100644
--- a/sound/core/hwdep.c
+++ b/sound/core/hwdep.c
@@ -42,7 +42,7 @@ static DEFINE_MUTEX(register_mutex);
 static int snd_hwdep_free(struct snd_hwdep *hwdep);
 static int snd_hwdep_dev_free(struct snd_device *device);
 static int snd_hwdep_dev_register(struct snd_device *device);
-static int snd_hwdep_dev_unregister(struct snd_device *device);
+static int snd_hwdep_dev_disconnect(struct snd_device *device);
 
 
 static struct snd_hwdep *snd_hwdep_search(struct snd_card *card, int device)
@@ -353,7 +353,7 @@ int snd_hwdep_new(struct snd_card *card, char *id, int device,
 	static struct snd_device_ops ops = {
 		.dev_free = snd_hwdep_dev_free,
 		.dev_register = snd_hwdep_dev_register,
-		.dev_unregister = snd_hwdep_dev_unregister
+		.dev_disconnect = snd_hwdep_dev_disconnect,
 	};
 
 	snd_assert(rhwdep != NULL, return -EINVAL);
@@ -439,7 +439,7 @@ static int snd_hwdep_dev_register(struct snd_device *device)
 	return 0;
 }
 
-static int snd_hwdep_dev_unregister(struct snd_device *device)
+static int snd_hwdep_dev_disconnect(struct snd_device *device)
 {
 	struct snd_hwdep *hwdep = device->device_data;
 
@@ -454,9 +454,9 @@ static int snd_hwdep_dev_unregister(struct snd_device *device)
 		snd_unregister_oss_device(hwdep->oss_type, hwdep->card, hwdep->device);
 #endif
 	snd_unregister_device(SNDRV_DEVICE_TYPE_HWDEP, hwdep->card, hwdep->device);
-	list_del(&hwdep->list);
+	list_del_init(&hwdep->list);
 	mutex_unlock(&register_mutex);
-	return snd_hwdep_free(hwdep);
+	return 0;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/sound/core/init.c b/sound/core/init.c
index 1ecb029ff4c..5850d99d21e 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -327,22 +327,10 @@ EXPORT_SYMBOL(snd_card_disconnect);
  *  Returns zero. Frees all associated devices and frees the control
  *  interface associated to given soundcard.
  */
-int snd_card_free(struct snd_card *card)
+static int snd_card_do_free(struct snd_card *card)
 {
 	struct snd_shutdown_f_ops *s_f_ops;
 
-	if (card == NULL)
-		return -EINVAL;
-	mutex_lock(&snd_card_mutex);
-	snd_cards[card->number] = NULL;
-	mutex_unlock(&snd_card_mutex);
-
-#ifdef CONFIG_PM
-	wake_up(&card->power_sleep);
-#endif
-	/* wait, until all devices are ready for the free operation */
-	wait_event(card->shutdown_sleep, card->files == NULL);
-
 #if defined(CONFIG_SND_MIXER_OSS) || defined(CONFIG_SND_MIXER_OSS_MODULE)
 	if (snd_mixer_oss_notify_callback)
 		snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_FREE);
@@ -371,10 +359,55 @@ int snd_card_free(struct snd_card *card)
 		card->s_f_ops = s_f_ops->next;
 		kfree(s_f_ops);
 	}
+	kfree(card);
+	return 0;
+}
+
+static int snd_card_free_prepare(struct snd_card *card)
+{
+	if (card == NULL)
+		return -EINVAL;
+	(void) snd_card_disconnect(card);
 	mutex_lock(&snd_card_mutex);
+	snd_cards[card->number] = NULL;
 	snd_cards_lock &= ~(1 << card->number);
 	mutex_unlock(&snd_card_mutex);
-	kfree(card);
+#ifdef CONFIG_PM
+	wake_up(&card->power_sleep);
+#endif
+	return 0;
+}
+
+int snd_card_free_when_closed(struct snd_card *card)
+{
+	int free_now = 0;
+	int ret = snd_card_free_prepare(card);
+	if (ret)
+		return ret;
+
+	spin_lock(&card->files_lock);
+	if (card->files == NULL)
+		free_now = 1;
+	else
+		card->free_on_last_close = 1;
+	spin_unlock(&card->files_lock);
+
+	if (free_now)
+		snd_card_do_free(card);
+	return 0;
+}
+
+EXPORT_SYMBOL(snd_card_free_when_closed);
+
+int snd_card_free(struct snd_card *card)
+{
+	int ret = snd_card_free_prepare(card);
+	if (ret)
+		return ret;
+
+	/* wait, until all devices are ready for the free operation */
+	wait_event(card->shutdown_sleep, card->files == NULL);
+	snd_card_do_free(card);
 	return 0;
 }
 
@@ -718,6 +751,7 @@ EXPORT_SYMBOL(snd_card_file_add);
 int snd_card_file_remove(struct snd_card *card, struct file *file)
 {
 	struct snd_monitor_file *mfile, *pfile = NULL;
+	int last_close = 0;
 
 	spin_lock(&card->files_lock);
 	mfile = card->files;
@@ -732,9 +766,14 @@ int snd_card_file_remove(struct snd_card *card, struct file *file)
 		pfile = mfile;
 		mfile = mfile->next;
 	}
-	spin_unlock(&card->files_lock);
 	if (card->files == NULL)
+		last_close = 1;
+	spin_unlock(&card->files_lock);
+	if (last_close) {
 		wake_up(&card->shutdown_sleep);
+		if (card->free_on_last_close)
+			snd_card_do_free(card);
+	}
 	if (!mfile) {
 		snd_printk(KERN_ERR "ALSA card file remove problem (%p)\n", file);
 		return -ENOENT;
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index 00c95def95a..f4c67042e3a 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -1310,21 +1310,19 @@ static int snd_mixer_oss_notify_handler(struct snd_card *card, int cmd)
 		card->mixer_oss = mixer;
 		snd_mixer_oss_build(mixer);
 		snd_mixer_oss_proc_init(mixer);
-	} else if (cmd == SND_MIXER_OSS_NOTIFY_DISCONNECT) {
-		mixer = card->mixer_oss;
-		if (mixer == NULL || !mixer->oss_dev_alloc)
-			return 0;
-		snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, mixer->card, 0);
-		mixer->oss_dev_alloc = 0;
-	} else {		/* free */
+	} else {
 		mixer = card->mixer_oss;
 		if (mixer == NULL)
 			return 0;
+		if (mixer->oss_dev_alloc) {
 #ifdef SNDRV_OSS_INFO_DEV_MIXERS
-		snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_MIXERS, mixer->card->number);
+			snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_MIXERS, mixer->card->number);
 #endif
-		if (mixer->oss_dev_alloc)
 			snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, mixer->card, 0);
+			mixer->oss_dev_alloc = 0;
+		}
+		if (cmd == SND_MIXER_OSS_NOTIFY_DISCONNECT)
+			return 0;
 		snd_mixer_oss_proc_done(mixer);
 		return snd_mixer_oss_free1(mixer);
 	}
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index a92b93e5ebd..505b23ec405 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2929,25 +2929,23 @@ static int snd_pcm_oss_disconnect_minor(struct snd_pcm *pcm)
 			snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM,
 						  pcm->card, 1);
 		}
-	}
-	return 0;
-}
-
-static int snd_pcm_oss_unregister_minor(struct snd_pcm *pcm)
-{
-	snd_pcm_oss_disconnect_minor(pcm);
-	if (pcm->oss.reg) {
 		if (dsp_map[pcm->card->number] == (int)pcm->device) {
 #ifdef SNDRV_OSS_INFO_DEV_AUDIO
 			snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number);
 #endif
 		}
 		pcm->oss.reg = 0;
-		snd_pcm_oss_proc_done(pcm);
 	}
 	return 0;
 }
 
+static int snd_pcm_oss_unregister_minor(struct snd_pcm *pcm)
+{
+	snd_pcm_oss_disconnect_minor(pcm);
+	snd_pcm_oss_proc_done(pcm);
+	return 0;
+}
+
 static struct snd_pcm_notify snd_pcm_oss_notify =
 {
 	.n_register =	snd_pcm_oss_register_minor,
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index b8602471f7e..f52178abf12 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -42,7 +42,6 @@ static int snd_pcm_free(struct snd_pcm *pcm);
 static int snd_pcm_dev_free(struct snd_device *device);
 static int snd_pcm_dev_register(struct snd_device *device);
 static int snd_pcm_dev_disconnect(struct snd_device *device);
-static int snd_pcm_dev_unregister(struct snd_device *device);
 
 static struct snd_pcm *snd_pcm_search(struct snd_card *card, int device)
 {
@@ -680,7 +679,6 @@ int snd_pcm_new(struct snd_card *card, char *id, int device,
 		.dev_free = snd_pcm_dev_free,
 		.dev_register =	snd_pcm_dev_register,
 		.dev_disconnect = snd_pcm_dev_disconnect,
-		.dev_unregister = snd_pcm_dev_unregister
 	};
 
 	snd_assert(rpcm != NULL, return -EINVAL);
@@ -724,6 +722,7 @@ static void snd_pcm_free_stream(struct snd_pcm_str * pstr)
 	substream = pstr->substream;
 	while (substream) {
 		substream_next = substream->next;
+		snd_pcm_timer_done(substream);
 		snd_pcm_substream_proc_done(substream);
 		kfree(substream);
 		substream = substream_next;
@@ -740,7 +739,12 @@ static void snd_pcm_free_stream(struct snd_pcm_str * pstr)
 
 static int snd_pcm_free(struct snd_pcm *pcm)
 {
+	struct snd_pcm_notify *notify;
+
 	snd_assert(pcm != NULL, return -ENXIO);
+	list_for_each_entry(notify, &snd_pcm_notify_list, list) {
+		notify->n_unregister(pcm);
+	}
 	if (pcm->private_free)
 		pcm->private_free(pcm);
 	snd_pcm_lib_preallocate_free_for_all(pcm);
@@ -955,35 +959,22 @@ static int snd_pcm_dev_register(struct snd_device *device)
 static int snd_pcm_dev_disconnect(struct snd_device *device)
 {
 	struct snd_pcm *pcm = device->device_data;
-	struct list_head *list;
+	struct snd_pcm_notify *notify;
 	struct snd_pcm_substream *substream;
-	int cidx;
+	int cidx, devtype;
 
 	mutex_lock(&register_mutex);
+	if (list_empty(&pcm->list))
+		goto unlock;
+
 	list_del_init(&pcm->list);
 	for (cidx = 0; cidx < 2; cidx++)
 		for (substream = pcm->streams[cidx].substream; substream; substream = substream->next)
 			if (substream->runtime)
 				substream->runtime->status->state = SNDRV_PCM_STATE_DISCONNECTED;
-	list_for_each(list, &snd_pcm_notify_list) {
-		struct snd_pcm_notify *notify;
-		notify = list_entry(list, struct snd_pcm_notify, list);
+	list_for_each_entry(notify, &snd_pcm_notify_list, list) {
 		notify->n_disconnect(pcm);
 	}
-	mutex_unlock(&register_mutex);
-	return 0;
-}
-
-static int snd_pcm_dev_unregister(struct snd_device *device)
-{
-	int cidx, devtype;
-	struct snd_pcm_substream *substream;
-	struct list_head *list;
-	struct snd_pcm *pcm = device->device_data;
-
-	snd_assert(pcm != NULL, return -ENXIO);
-	mutex_lock(&register_mutex);
-	list_del(&pcm->list);
 	for (cidx = 0; cidx < 2; cidx++) {
 		devtype = -1;
 		switch (cidx) {
@@ -995,23 +986,20 @@ static int snd_pcm_dev_unregister(struct snd_device *device)
 			break;
 		}
 		snd_unregister_device(devtype, pcm->card, pcm->device);
-		for (substream = pcm->streams[cidx].substream; substream; substream = substream->next)
-			snd_pcm_timer_done(substream);
-	}
-	list_for_each(list, &snd_pcm_notify_list) {
-		struct snd_pcm_notify *notify;
-		notify = list_entry(list, struct snd_pcm_notify, list);
-		notify->n_unregister(pcm);
 	}
+ unlock:
 	mutex_unlock(&register_mutex);
-	return snd_pcm_free(pcm);
+	return 0;
 }
 
 int snd_pcm_notify(struct snd_pcm_notify *notify, int nfree)
 {
 	struct list_head *p;
 
-	snd_assert(notify != NULL && notify->n_register != NULL && notify->n_unregister != NULL, return -EINVAL);
+	snd_assert(notify != NULL &&
+		   notify->n_register != NULL &&
+		   notify->n_unregister != NULL &&
+		   notify->n_disconnect, return -EINVAL);
 	mutex_lock(&register_mutex);
 	if (nfree) {
 		list_del(&notify->list);
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index 51577c22f8c..8a2bdfae63e 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -55,7 +55,6 @@ static int snd_rawmidi_free(struct snd_rawmidi *rawmidi);
 static int snd_rawmidi_dev_free(struct snd_device *device);
 static int snd_rawmidi_dev_register(struct snd_device *device);
 static int snd_rawmidi_dev_disconnect(struct snd_device *device);
-static int snd_rawmidi_dev_unregister(struct snd_device *device);
 
 static LIST_HEAD(snd_rawmidi_devices);
 static DEFINE_MUTEX(register_mutex);
@@ -1426,7 +1425,6 @@ int snd_rawmidi_new(struct snd_card *card, char *id, int device,
 		.dev_free = snd_rawmidi_dev_free,
 		.dev_register = snd_rawmidi_dev_register,
 		.dev_disconnect = snd_rawmidi_dev_disconnect,
-		.dev_unregister = snd_rawmidi_dev_unregister
 	};
 
 	snd_assert(rrawmidi != NULL, return -EINVAL);
@@ -1479,6 +1477,14 @@ static void snd_rawmidi_free_substreams(struct snd_rawmidi_str *stream)
 static int snd_rawmidi_free(struct snd_rawmidi *rmidi)
 {
 	snd_assert(rmidi != NULL, return -ENXIO);	
+
+	snd_info_free_entry(rmidi->proc_entry);
+	rmidi->proc_entry = NULL;
+	mutex_lock(&register_mutex);
+	if (rmidi->ops && rmidi->ops->dev_unregister)
+		rmidi->ops->dev_unregister(rmidi);
+	mutex_unlock(&register_mutex);
+
 	snd_rawmidi_free_substreams(&rmidi->streams[SNDRV_RAWMIDI_STREAM_INPUT]);
 	snd_rawmidi_free_substreams(&rmidi->streams[SNDRV_RAWMIDI_STREAM_OUTPUT]);
 	if (rmidi->private_free)
@@ -1587,21 +1593,6 @@ static int snd_rawmidi_dev_disconnect(struct snd_device *device)
 
 	mutex_lock(&register_mutex);
 	list_del_init(&rmidi->list);
-	mutex_unlock(&register_mutex);
-	return 0;
-}
-
-static int snd_rawmidi_dev_unregister(struct snd_device *device)
-{
-	struct snd_rawmidi *rmidi = device->device_data;
-
-	snd_assert(rmidi != NULL, return -ENXIO);
-	mutex_lock(&register_mutex);
-	list_del(&rmidi->list);
-	if (rmidi->proc_entry) {
-		snd_info_free_entry(rmidi->proc_entry);
-		rmidi->proc_entry = NULL;
-	}
 #ifdef CONFIG_SND_OSSEMUL
 	if (rmidi->ossreg) {
 		if ((int)rmidi->device == midi_map[rmidi->card->number]) {
@@ -1615,17 +1606,9 @@ static int snd_rawmidi_dev_unregister(struct snd_device *device)
 		rmidi->ossreg = 0;
 	}
 #endif /* CONFIG_SND_OSSEMUL */
-	if (rmidi->ops && rmidi->ops->dev_unregister)
-		rmidi->ops->dev_unregister(rmidi);
 	snd_unregister_device(SNDRV_DEVICE_TYPE_RAWMIDI, rmidi->card, rmidi->device);
 	mutex_unlock(&register_mutex);
-#if defined(CONFIG_SND_SEQUENCER) || (defined(MODULE) && defined(CONFIG_SND_SEQUENCER_MODULE))
-	if (rmidi->seq_dev) {
-		snd_device_free(rmidi->card, rmidi->seq_dev);
-		rmidi->seq_dev = NULL;
-	}
-#endif
-	return snd_rawmidi_free(rmidi);
+	return 0;
 }
 
 /**
diff --git a/sound/core/rtctimer.c b/sound/core/rtctimer.c
index 84704ccb182..412dd62b654 100644
--- a/sound/core/rtctimer.c
+++ b/sound/core/rtctimer.c
@@ -156,7 +156,7 @@ static int __init rtctimer_init(void)
 static void __exit rtctimer_exit(void)
 {
 	if (rtctimer) {
-		snd_timer_global_unregister(rtctimer);
+		snd_timer_global_free(rtctimer);
 		rtctimer = NULL;
 	}
 }
diff --git a/sound/core/seq/seq_device.c b/sound/core/seq/seq_device.c
index b85954e956d..b79d011813c 100644
--- a/sound/core/seq/seq_device.c
+++ b/sound/core/seq/seq_device.c
@@ -90,7 +90,6 @@ static int snd_seq_device_free(struct snd_seq_device *dev);
 static int snd_seq_device_dev_free(struct snd_device *device);
 static int snd_seq_device_dev_register(struct snd_device *device);
 static int snd_seq_device_dev_disconnect(struct snd_device *device);
-static int snd_seq_device_dev_unregister(struct snd_device *device);
 
 static int init_device(struct snd_seq_device *dev, struct ops_list *ops);
 static int free_device(struct snd_seq_device *dev, struct ops_list *ops);
@@ -189,7 +188,6 @@ int snd_seq_device_new(struct snd_card *card, int device, char *id, int argsize,
 		.dev_free = snd_seq_device_dev_free,
 		.dev_register = snd_seq_device_dev_register,
 		.dev_disconnect = snd_seq_device_dev_disconnect,
-		.dev_unregister = snd_seq_device_dev_unregister
 	};
 
 	if (result)
@@ -308,15 +306,6 @@ static int snd_seq_device_dev_disconnect(struct snd_device *device)
 	return 0;
 }
 
-/*
- * unregister the existing device
- */
-static int snd_seq_device_dev_unregister(struct snd_device *device)
-{
-	struct snd_seq_device *dev = device->device_data;
-	return snd_seq_device_free(dev);
-}
-
 /*
  * register device driver
  * id = driver id
diff --git a/sound/core/timer.c b/sound/core/timer.c
index 52ecbe1e9ab..7e5e562fe35 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -88,7 +88,7 @@ static DEFINE_MUTEX(register_mutex);
 static int snd_timer_free(struct snd_timer *timer);
 static int snd_timer_dev_free(struct snd_device *device);
 static int snd_timer_dev_register(struct snd_device *device);
-static int snd_timer_dev_unregister(struct snd_device *device);
+static int snd_timer_dev_disconnect(struct snd_device *device);
 
 static void snd_timer_reschedule(struct snd_timer * timer, unsigned long ticks_left);
 
@@ -773,7 +773,7 @@ int snd_timer_new(struct snd_card *card, char *id, struct snd_timer_id *tid,
 	static struct snd_device_ops ops = {
 		.dev_free = snd_timer_dev_free,
 		.dev_register = snd_timer_dev_register,
-		.dev_unregister = snd_timer_dev_unregister
+		.dev_disconnect = snd_timer_dev_disconnect,
 	};
 
 	snd_assert(tid != NULL, return -EINVAL);
@@ -813,6 +813,21 @@ int snd_timer_new(struct snd_card *card, char *id, struct snd_timer_id *tid,
 static int snd_timer_free(struct snd_timer *timer)
 {
 	snd_assert(timer != NULL, return -ENXIO);
+
+	mutex_lock(&register_mutex);
+	if (! list_empty(&timer->open_list_head)) {
+		struct list_head *p, *n;
+		struct snd_timer_instance *ti;
+		snd_printk(KERN_WARNING "timer %p is busy?\n", timer);
+		list_for_each_safe(p, n, &timer->open_list_head) {
+			list_del_init(p);
+			ti = list_entry(p, struct snd_timer_instance, open_list);
+			ti->timer = NULL;
+		}
+	}
+	list_del(&timer->device_list);
+	mutex_unlock(&register_mutex);
+
 	if (timer->private_free)
 		timer->private_free(timer);
 	kfree(timer);
@@ -867,30 +882,13 @@ static int snd_timer_dev_register(struct snd_device *dev)
 	return 0;
 }
 
-static int snd_timer_unregister(struct snd_timer *timer)
+static int snd_timer_dev_disconnect(struct snd_device *device)
 {
-	struct list_head *p, *n;
-	struct snd_timer_instance *ti;
-
-	snd_assert(timer != NULL, return -ENXIO);
+	struct snd_timer *timer = device->device_data;
 	mutex_lock(&register_mutex);
-	if (! list_empty(&timer->open_list_head)) {
-		snd_printk(KERN_WARNING "timer 0x%lx is busy?\n", (long)timer);
-		list_for_each_safe(p, n, &timer->open_list_head) {
-			list_del_init(p);
-			ti = list_entry(p, struct snd_timer_instance, open_list);
-			ti->timer = NULL;
-		}
-	}
-	list_del(&timer->device_list);
+	list_del_init(&timer->device_list);
 	mutex_unlock(&register_mutex);
-	return snd_timer_free(timer);
-}
-
-static int snd_timer_dev_unregister(struct snd_device *device)
-{
-	struct snd_timer *timer = device->device_data;
-	return snd_timer_unregister(timer);
+	return 0;
 }
 
 void snd_timer_notify(struct snd_timer *timer, int event, struct timespec *tstamp)
@@ -955,11 +953,6 @@ int snd_timer_global_register(struct snd_timer *timer)
 	return snd_timer_dev_register(&dev);
 }
 
-int snd_timer_global_unregister(struct snd_timer *timer)
-{
-	return snd_timer_unregister(timer);
-}
-
 /*
  *  System timer
  */
@@ -1982,7 +1975,7 @@ static void __exit alsa_timer_exit(void)
 	/* unregister the system timer */
 	list_for_each_safe(p, n, &snd_timer_list) {
 		struct snd_timer *timer = list_entry(p, struct snd_timer, device_list);
-		snd_timer_unregister(timer);
+		snd_timer_free(timer);
 	}
 	snd_timer_proc_done();
 #ifdef SNDRV_OSS_INFO_DEV_TIMERS
@@ -2005,5 +1998,4 @@ EXPORT_SYMBOL(snd_timer_notify);
 EXPORT_SYMBOL(snd_timer_global_new);
 EXPORT_SYMBOL(snd_timer_global_free);
 EXPORT_SYMBOL(snd_timer_global_register);
-EXPORT_SYMBOL(snd_timer_global_unregister);
 EXPORT_SYMBOL(snd_timer_interrupt);
diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c
index 51e83d7a839..b35280ca246 100644
--- a/sound/pci/ac97/ac97_codec.c
+++ b/sound/pci/ac97/ac97_codec.c
@@ -1817,13 +1817,13 @@ static int snd_ac97_dev_register(struct snd_device *device)
 	return 0;
 }
 
-/* unregister ac97 codec */
-static int snd_ac97_dev_unregister(struct snd_device *device)
+/* disconnect ac97 codec */
+static int snd_ac97_dev_disconnect(struct snd_device *device)
 {
 	struct snd_ac97 *ac97 = device->device_data;
 	if (ac97->dev.bus)
 		device_unregister(&ac97->dev);
-	return snd_ac97_free(ac97);
+	return 0;
 }
 
 /* build_ops to do nothing */
@@ -1860,7 +1860,7 @@ int snd_ac97_mixer(struct snd_ac97_bus *bus, struct snd_ac97_template *template,
 	static struct snd_device_ops ops = {
 		.dev_free =	snd_ac97_dev_free,
 		.dev_register =	snd_ac97_dev_register,
-		.dev_unregister =	snd_ac97_dev_unregister,
+		.dev_disconnect =	snd_ac97_dev_disconnect,
 	};
 
 	snd_assert(rac97 != NULL, return -EINVAL);
diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c
index 1b7f499c549..31443138591 100644
--- a/sound/usb/usbaudio.c
+++ b/sound/usb/usbaudio.c
@@ -3499,7 +3499,7 @@ static void snd_usb_audio_disconnect(struct usb_device *dev, void *ptr)
 		}
 		usb_chip[chip->index] = NULL;
 		mutex_unlock(&register_mutex);
-		snd_card_free(card);
+		snd_card_free_when_closed(card);
 	} else {
 		mutex_unlock(&register_mutex);
 	}
-- 
cgit v1.2.3-70-g09d2


From 2b29b13c5794f648cd5e839796496704d787f5a6 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 23 Jun 2006 14:38:26 +0200
Subject: [ALSA] Deprecate snd_card_free_in_thread()

Deprecated snd_card_free_in_thread(), replaced with
snd_card_free_when_closed().

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 .../sound/alsa/DocBook/writing-an-alsa-driver.tmpl |  5 +-
 include/sound/core.h                               |  3 --
 sound/core/init.c                                  | 56 ++--------------------
 sound/drivers/mpu401/mpu401.c                      |  2 +-
 sound/pcmcia/pdaudiocf/pdaudiocf.c                 |  2 +-
 sound/pcmcia/vx/vxpocket.c                         |  4 +-
 6 files changed, 9 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
index b8dc51ca776..4807ef79a94 100644
--- a/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/sound/alsa/DocBook/writing-an-alsa-driver.tmpl
@@ -1054,9 +1054,8 @@
 
       <para>
       For a device which allows hotplugging, you can use
-      <function>snd_card_free_in_thread</function>.  This one will
-      postpone the destruction and wait in a kernel-thread until all
-      devices are closed.
+      <function>snd_card_free_when_closed</function>.  This one will
+      postpone the destruction until all devices are closed.
       </para>
 
     </section>
diff --git a/include/sound/core.h b/include/sound/core.h
index cf4001cf624..1359c532b68 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -25,7 +25,6 @@
 #include <linux/sched.h>		/* wake_up() */
 #include <linux/mutex.h>		/* struct mutex */
 #include <linux/rwsem.h>		/* struct rw_semaphore */
-#include <linux/workqueue.h>		/* struct workqueue_struct */
 #include <linux/pm.h>			/* pm_message_t */
 
 /* forward declarations */
@@ -132,7 +131,6 @@ struct snd_card {
 	int shutdown;			/* this card is going down */
 	int free_on_last_close;		/* free in context of file_release */
 	wait_queue_head_t shutdown_sleep;
-	struct work_struct free_workq;	/* for free in workqueue */
 	struct device *dev;
 
 #ifdef CONFIG_PM
@@ -245,7 +243,6 @@ struct snd_card *snd_card_new(int idx, const char *id,
 int snd_card_disconnect(struct snd_card *card);
 int snd_card_free(struct snd_card *card);
 int snd_card_free_when_closed(struct snd_card *card);
-int snd_card_free_in_thread(struct snd_card *card);
 int snd_card_register(struct snd_card *card);
 int snd_card_info_init(void);
 int snd_card_info_done(void);
diff --git a/sound/core/init.c b/sound/core/init.c
index 5850d99d21e..d7607a25acd 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -81,8 +81,6 @@ static inline int init_info_for_card(struct snd_card *card)
 #define init_info_for_card(card)
 #endif
 
-static void snd_card_free_thread(void * __card);
-
 /**
  *  snd_card_new - create and initialize a soundcard structure
  *  @idx: card index (address) [0 ... (SNDRV_CARDS-1)]
@@ -145,7 +143,6 @@ struct snd_card *snd_card_new(int idx, const char *xid,
 	INIT_LIST_HEAD(&card->ctl_files);
 	spin_lock_init(&card->files_lock);
 	init_waitqueue_head(&card->shutdown_sleep);
-	INIT_WORK(&card->free_workq, snd_card_free_thread, card);
 #ifdef CONFIG_PM
 	mutex_init(&card->power_lock);
 	init_waitqueue_head(&card->power_sleep);
@@ -413,53 +410,6 @@ int snd_card_free(struct snd_card *card)
 
 EXPORT_SYMBOL(snd_card_free);
 
-static void snd_card_free_thread(void * __card)
-{
-	struct snd_card *card = __card;
-	struct module * module = card->module;
-
-	if (!try_module_get(module)) {
-		snd_printk(KERN_ERR "unable to lock toplevel module for card %i in free thread\n", card->number);
-		module = NULL;
-	}
-
-	snd_card_free(card);
-
-	module_put(module);
-}
-
-/**
- *  snd_card_free_in_thread - call snd_card_free() in thread
- *  @card: soundcard structure
- *
- *  This function schedules the call of snd_card_free() function in a
- *  work queue.  When all devices are released (non-busy), the work
- *  is woken up and calls snd_card_free().
- *
- *  When a card can be disconnected at any time by hotplug service,
- *  this function should be used in disconnect (or detach) callback
- *  instead of calling snd_card_free() directly.
- *  
- *  Returns - zero otherwise a negative error code if the start of thread failed.
- */
-int snd_card_free_in_thread(struct snd_card *card)
-{
-	if (card->files == NULL) {
-		snd_card_free(card);
-		return 0;
-	}
-
-	if (schedule_work(&card->free_workq))
-		return 0;
-
-	snd_printk(KERN_ERR "schedule_work() failed in snd_card_free_in_thread for card %i\n", card->number);
-	/* try to free the structure immediately */
-	snd_card_free(card);
-	return -EFAULT;
-}
-
-EXPORT_SYMBOL(snd_card_free_in_thread);
-
 static void choose_default_id(struct snd_card *card)
 {
 	int i, len, idx_flag = 0, loops = SNDRV_CARDS;
@@ -742,9 +692,9 @@ EXPORT_SYMBOL(snd_card_file_add);
  *
  *  This function removes the file formerly added to the card via
  *  snd_card_file_add() function.
- *  If all files are removed and the release of the card is
- *  scheduled, it will wake up the the thread to call snd_card_free()
- *  (see snd_card_free_in_thread() function).
+ *  If all files are removed and snd_card_free_when_closed() was
+ *  called beforehand, it processes the pending release of
+ *  resources.
  *
  *  Returns zero or a negative error code.
  */
diff --git a/sound/drivers/mpu401/mpu401.c b/sound/drivers/mpu401/mpu401.c
index 17cc105b26f..2de181ad0b0 100644
--- a/sound/drivers/mpu401/mpu401.c
+++ b/sound/drivers/mpu401/mpu401.c
@@ -211,7 +211,7 @@ static void __devexit snd_mpu401_pnp_remove(struct pnp_dev *dev)
 	struct snd_card *card = (struct snd_card *) pnp_get_drvdata(dev);
 
 	snd_card_disconnect(card);
-	snd_card_free_in_thread(card);
+	snd_card_free_when_closed(card);
 }
 
 static struct pnp_driver snd_mpu401_pnp_driver = {
diff --git a/sound/pcmcia/pdaudiocf/pdaudiocf.c b/sound/pcmcia/pdaudiocf/pdaudiocf.c
index 1c09e5f49da..fd3590fcaed 100644
--- a/sound/pcmcia/pdaudiocf/pdaudiocf.c
+++ b/sound/pcmcia/pdaudiocf/pdaudiocf.c
@@ -206,7 +206,7 @@ static void snd_pdacf_detach(struct pcmcia_device *link)
 		snd_pdacf_powerdown(chip);
 	chip->chip_status |= PDAUDIOCF_STAT_IS_STALE; /* to be sure */
 	snd_card_disconnect(chip->card);
-	snd_card_free_in_thread(chip->card);
+	snd_card_free_when_closed(chip->card);
 }
 
 /*
diff --git a/sound/pcmcia/vx/vxpocket.c b/sound/pcmcia/vx/vxpocket.c
index cafe6640cc1..76c85cffb40 100644
--- a/sound/pcmcia/vx/vxpocket.c
+++ b/sound/pcmcia/vx/vxpocket.c
@@ -65,7 +65,7 @@ static void vxpocket_release(struct pcmcia_device *link)
 }
 
 /*
- * destructor, called from snd_card_free_in_thread()
+ * destructor, called from snd_card_free_when_closed()
  */
 static int snd_vxpocket_dev_free(struct snd_device *device)
 {
@@ -363,7 +363,7 @@ static void vxpocket_detach(struct pcmcia_device *link)
 	chip->chip_status |= VX_STAT_IS_STALE; /* to be sure */
 	snd_card_disconnect(chip->card);
 	vxpocket_release(link);
-	snd_card_free_in_thread(chip->card);
+	snd_card_free_when_closed(chip->card);
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 6dbe662874ba08585eaf732d126762c25ac8e3f7 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 27 Jun 2006 18:28:53 +0200
Subject: [ALSA] Add experimental support of aggressive AC97 power-saving mode

Added CONFIG_SND_AC97_POWER_SAVE kernel config to enable the support
of aggressive AC97 power-saving mode.  In this mode, the AC97
powerdown register bits are dynamically controlled at each open/close
of PCM streams.
The mode is activated via power_save option for snd-ac97-codec
driver.  As default it's off.  It can be turned on/off on the fly
via sysfs, too.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/ac97_codec.h  |  32 ++++++
 sound/drivers/Kconfig       |  13 +++
 sound/pci/ac97/ac97_codec.c | 264 +++++++++++++++++++++++++++++++++++++-------
 sound/pci/ac97/ac97_pcm.c   |  18 +++
 sound/pci/intel8x0.c        |  14 ++-
 sound/pci/via82xx.c         |  13 ++-
 6 files changed, 315 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/sound/ac97_codec.h b/include/sound/ac97_codec.h
index 758f8bf133c..4c43521cc49 100644
--- a/include/sound/ac97_codec.h
+++ b/include/sound/ac97_codec.h
@@ -27,6 +27,7 @@
 
 #include <linux/bitops.h>
 #include <linux/device.h>
+#include <linux/workqueue.h>
 #include "pcm.h"
 #include "control.h"
 #include "info.h"
@@ -140,6 +141,20 @@
 #define AC97_GP_DRSS_1011	0x0000	/* LR(C) 10+11(+12) */
 #define AC97_GP_DRSS_78		0x0400	/* LR 7+8 */
 
+/* powerdown bits */
+#define AC97_PD_ADC_STATUS	0x0001	/* ADC status (RO) */
+#define AC97_PD_DAC_STATUS	0x0002	/* DAC status (RO) */
+#define AC97_PD_MIXER_STATUS	0x0004	/* Analog mixer status (RO) */
+#define AC97_PD_VREF_STATUS	0x0008	/* Vref status (RO) */
+#define AC97_PD_PR0		0x0100	/* Power down PCM ADCs and input MUX */
+#define AC97_PD_PR1		0x0200	/* Power down PCM front DAC */
+#define AC97_PD_PR2		0x0400	/* Power down Mixer (Vref still on) */
+#define AC97_PD_PR3		0x0800	/* Power down Mixer (Vref off) */
+#define AC97_PD_PR4		0x1000	/* Power down AC-Link */
+#define AC97_PD_PR5		0x2000	/* Disable internal clock usage */
+#define AC97_PD_PR6		0x4000	/* Headphone amplifier */
+#define AC97_PD_EAPD		0x8000	/* External Amplifer Power Down (EAPD) */
+
 /* extended audio ID bit defines */
 #define AC97_EI_VRA		0x0001	/* Variable bit rate supported */
 #define AC97_EI_DRA		0x0002	/* Double rate supported */
@@ -359,6 +374,7 @@
 #define AC97_SCAP_INV_EAPD	(1<<7)	/* inverted EAPD */
 #define AC97_SCAP_DETECT_BY_VENDOR (1<<8) /* use vendor registers for read tests */
 #define AC97_SCAP_NO_SPDIF	(1<<9)	/* don't build SPDIF controls */
+#define AC97_SCAP_EAPD_LED	(1<<10)	/* EAPD as mute LED */
 
 /* ac97->flags */
 #define AC97_HAS_PC_BEEP	(1<<0)	/* force PC Speaker usage */
@@ -491,6 +507,12 @@ struct snd_ac97 {
 	/* jack-sharing info */
 	unsigned char indep_surround;
 	unsigned char channel_mode;
+
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	unsigned int power_up;	/* power states */
+	struct workqueue_struct *power_workq;
+	struct work_struct power_work;
+#endif
 	struct device dev;
 };
 
@@ -532,6 +554,15 @@ unsigned short snd_ac97_read(struct snd_ac97 *ac97, unsigned short reg);
 void snd_ac97_write_cache(struct snd_ac97 *ac97, unsigned short reg, unsigned short value);
 int snd_ac97_update(struct snd_ac97 *ac97, unsigned short reg, unsigned short value);
 int snd_ac97_update_bits(struct snd_ac97 *ac97, unsigned short reg, unsigned short mask, unsigned short value);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+int snd_ac97_update_power(struct snd_ac97 *ac97, int reg, int powerup);
+#else
+static inline int snd_ac97_update_power(struct snd_ac97 *ac97, int reg,
+					int powerup)
+{
+	return 0;
+}
+#endif
 #ifdef CONFIG_PM
 void snd_ac97_suspend(struct snd_ac97 *ac97);
 void snd_ac97_resume(struct snd_ac97 *ac97);
@@ -583,6 +614,7 @@ struct ac97_pcm {
 		     copy_flag: 1,	   /* lowlevel driver must fill all entries */
 		     spdif: 1;		   /* spdif pcm */
 	unsigned short aslots;		   /* active slots */
+	unsigned short cur_dbl;		   /* current double-rate state */
 	unsigned int rates;		   /* available rates */
 	struct {
 		unsigned short slots;	   /* driver input: requested AC97 slot numbers */
diff --git a/sound/drivers/Kconfig b/sound/drivers/Kconfig
index 395c4ef52ac..897dc2dfd7d 100644
--- a/sound/drivers/Kconfig
+++ b/sound/drivers/Kconfig
@@ -100,4 +100,17 @@ config SND_MPU401
 	  To compile this driver as a module, choose M here: the module
 	  will be called snd-mpu401.
 
+config SND_AC97_POWER_SAVE
+	bool "AC97 Power-Saving Mode"
+	depends on SND_AC97_CODEC && EXPERIMENTAL
+	default n
+	help
+	  Say Y here to enable the aggressive power-saving support of
+	  AC97 codecs.  In this mode, the power-mode is dynamically
+	  controlled at each open/close.
+
+	  The mode is activated by passing power_save=1 option to
+	  snd-ac97-codec driver.  You can toggle it dynamically over
+	  sysfs, too.
+
 endmenu
diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c
index b35280ca246..f82c636e99a 100644
--- a/sound/pci/ac97/ac97_codec.c
+++ b/sound/pci/ac97/ac97_codec.c
@@ -47,6 +47,11 @@ static int enable_loopback;
 module_param(enable_loopback, bool, 0444);
 MODULE_PARM_DESC(enable_loopback, "Enable AC97 ADC/DAC Loopback Control");
 
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+static int power_save;
+module_param(power_save, bool, 0644);
+MODULE_PARM_DESC(power_save, "Enable AC97 power-saving control");
+#endif
 /*
 
  */
@@ -187,6 +192,8 @@ static const struct ac97_codec_id snd_ac97_codec_ids[] = {
 };
 
 
+static void update_power_regs(struct snd_ac97 *ac97);
+
 /*
  *  I/O routines
  */
@@ -554,6 +561,18 @@ int snd_ac97_put_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value
 	}
 	err = snd_ac97_update_bits(ac97, reg, val_mask, val);
 	snd_ac97_page_restore(ac97, page_save);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	/* check analog mixer power-down */
+	if ((val_mask & 0x8000) &&
+	    (kcontrol->private_value & (1<<30))) {
+		if (val & 0x8000)
+			ac97->power_up &= ~(1 << (reg>>1));
+		else
+			ac97->power_up |= 1 << (reg>>1);
+		if (power_save)
+			update_power_regs(ac97);
+	}
+#endif
 	return err;
 }
 
@@ -962,6 +981,10 @@ static int snd_ac97_bus_dev_free(struct snd_device *device)
 static int snd_ac97_free(struct snd_ac97 *ac97)
 {
 	if (ac97) {
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+		if (ac97->power_workq)
+			destroy_workqueue(ac97->power_workq);
+#endif
 		snd_ac97_proc_done(ac97);
 		if (ac97->bus)
 			ac97->bus->codec[ac97->num] = NULL;
@@ -1117,7 +1140,9 @@ struct snd_kcontrol *snd_ac97_cnew(const struct snd_kcontrol_new *_template, str
 /*
  * create mute switch(es) for normal stereo controls
  */
-static int snd_ac97_cmute_new_stereo(struct snd_card *card, char *name, int reg, int check_stereo, struct snd_ac97 *ac97)
+static int snd_ac97_cmute_new_stereo(struct snd_card *card, char *name, int reg,
+				     int check_stereo, int check_amix,
+				     struct snd_ac97 *ac97)
 {
 	struct snd_kcontrol *kctl;
 	int err;
@@ -1137,10 +1162,14 @@ static int snd_ac97_cmute_new_stereo(struct snd_card *card, char *name, int reg,
 	}
 	if (mute_mask == 0x8080) {
 		struct snd_kcontrol_new tmp = AC97_DOUBLE(name, reg, 15, 7, 1, 1);
+		if (check_amix)
+			tmp.private_value |= (1 << 30);
 		tmp.index = ac97->num;
 		kctl = snd_ctl_new1(&tmp, ac97);
 	} else {
 		struct snd_kcontrol_new tmp = AC97_SINGLE(name, reg, 15, 1, 1);
+		if (check_amix)
+			tmp.private_value |= (1 << 30);
 		tmp.index = ac97->num;
 		kctl = snd_ctl_new1(&tmp, ac97);
 	}
@@ -1186,7 +1215,9 @@ static int snd_ac97_cvol_new(struct snd_card *card, char *name, int reg, unsigne
 /*
  * create a mute-switch and a volume for normal stereo/mono controls
  */
-static int snd_ac97_cmix_new_stereo(struct snd_card *card, const char *pfx, int reg, int check_stereo, struct snd_ac97 *ac97)
+static int snd_ac97_cmix_new_stereo(struct snd_card *card, const char *pfx,
+				    int reg, int check_stereo, int check_amix,
+				    struct snd_ac97 *ac97)
 {
 	int err;
 	char name[44];
@@ -1197,7 +1228,9 @@ static int snd_ac97_cmix_new_stereo(struct snd_card *card, const char *pfx, int
 
 	if (snd_ac97_try_bit(ac97, reg, 15)) {
 		sprintf(name, "%s Switch", pfx);
-		if ((err = snd_ac97_cmute_new_stereo(card, name, reg, check_stereo, ac97)) < 0)
+		if ((err = snd_ac97_cmute_new_stereo(card, name, reg,
+						     check_stereo, check_amix,
+						     ac97)) < 0)
 			return err;
 	}
 	check_volume_resolution(ac97, reg, &lo_max, &hi_max);
@@ -1209,8 +1242,10 @@ static int snd_ac97_cmix_new_stereo(struct snd_card *card, const char *pfx, int
 	return 0;
 }
 
-#define snd_ac97_cmix_new(card, pfx, reg, ac97)	snd_ac97_cmix_new_stereo(card, pfx, reg, 0, ac97)
-#define snd_ac97_cmute_new(card, name, reg, ac97)	snd_ac97_cmute_new_stereo(card, name, reg, 0, ac97)
+#define snd_ac97_cmix_new(card, pfx, reg, acheck, ac97) \
+	snd_ac97_cmix_new_stereo(card, pfx, reg, 0, acheck, ac97)
+#define snd_ac97_cmute_new(card, name, reg, acheck, ac97) \
+	snd_ac97_cmute_new_stereo(card, name, reg, 0, acheck, ac97)
 
 static unsigned int snd_ac97_determine_spdif_rates(struct snd_ac97 *ac97);
 
@@ -1226,9 +1261,11 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	/* AD claims to remove this control from AD1887, although spec v2.2 does not allow this */
 	if (snd_ac97_try_volume_mix(ac97, AC97_MASTER)) {
 		if (ac97->flags & AC97_HAS_NO_MASTER_VOL)
-			err = snd_ac97_cmute_new(card, "Master Playback Switch", AC97_MASTER, ac97);
+			err = snd_ac97_cmute_new(card, "Master Playback Switch",
+						 AC97_MASTER, 0, ac97);
 		else
-			err = snd_ac97_cmix_new(card, "Master Playback", AC97_MASTER, ac97);
+			err = snd_ac97_cmix_new(card, "Master Playback",
+						AC97_MASTER, 0, ac97);
 		if (err < 0)
 			return err;
 	}
@@ -1265,19 +1302,23 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	if ((snd_ac97_try_volume_mix(ac97, AC97_SURROUND_MASTER)) 
 		&& !(ac97->flags & AC97_AD_MULTI)) {
 		/* Surround Master (0x38) is with stereo mutes */
-		if ((err = snd_ac97_cmix_new_stereo(card, "Surround Playback", AC97_SURROUND_MASTER, 1, ac97)) < 0)
+		if ((err = snd_ac97_cmix_new_stereo(card, "Surround Playback",
+						    AC97_SURROUND_MASTER, 1, 0,
+						    ac97)) < 0)
 			return err;
 	}
 
 	/* build headphone controls */
 	if (snd_ac97_try_volume_mix(ac97, AC97_HEADPHONE)) {
-		if ((err = snd_ac97_cmix_new(card, "Headphone Playback", AC97_HEADPHONE, ac97)) < 0)
+		if ((err = snd_ac97_cmix_new(card, "Headphone Playback",
+					     AC97_HEADPHONE, 0, ac97)) < 0)
 			return err;
 	}
 	
 	/* build master mono controls */
 	if (snd_ac97_try_volume_mix(ac97, AC97_MASTER_MONO)) {
-		if ((err = snd_ac97_cmix_new(card, "Master Mono Playback", AC97_MASTER_MONO, ac97)) < 0)
+		if ((err = snd_ac97_cmix_new(card, "Master Mono Playback",
+					     AC97_MASTER_MONO, 0, ac97)) < 0)
 			return err;
 	}
 	
@@ -1310,7 +1351,8 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	/* build Phone controls */
 	if (!(ac97->flags & AC97_HAS_NO_PHONE)) {
 		if (snd_ac97_try_volume_mix(ac97, AC97_PHONE)) {
-			if ((err = snd_ac97_cmix_new(card, "Phone Playback", AC97_PHONE, ac97)) < 0)
+			if ((err = snd_ac97_cmix_new(card, "Phone Playback",
+						     AC97_PHONE, 1, ac97)) < 0)
 				return err;
 		}
 	}
@@ -1318,7 +1360,8 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	/* build MIC controls */
 	if (!(ac97->flags & AC97_HAS_NO_MIC)) {
 		if (snd_ac97_try_volume_mix(ac97, AC97_MIC)) {
-			if ((err = snd_ac97_cmix_new(card, "Mic Playback", AC97_MIC, ac97)) < 0)
+			if ((err = snd_ac97_cmix_new(card, "Mic Playback",
+						     AC97_MIC, 1, ac97)) < 0)
 				return err;
 			if ((err = snd_ctl_add(card, snd_ac97_cnew(&snd_ac97_controls_mic_boost, ac97))) < 0)
 				return err;
@@ -1327,14 +1370,16 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 
 	/* build Line controls */
 	if (snd_ac97_try_volume_mix(ac97, AC97_LINE)) {
-		if ((err = snd_ac97_cmix_new(card, "Line Playback", AC97_LINE, ac97)) < 0)
+		if ((err = snd_ac97_cmix_new(card, "Line Playback",
+					     AC97_LINE, 1, ac97)) < 0)
 			return err;
 	}
 	
 	/* build CD controls */
 	if (!(ac97->flags & AC97_HAS_NO_CD)) {
 		if (snd_ac97_try_volume_mix(ac97, AC97_CD)) {
-			if ((err = snd_ac97_cmix_new(card, "CD Playback", AC97_CD, ac97)) < 0)
+			if ((err = snd_ac97_cmix_new(card, "CD Playback",
+						     AC97_CD, 1, ac97)) < 0)
 				return err;
 		}
 	}
@@ -1342,7 +1387,8 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	/* build Video controls */
 	if (!(ac97->flags & AC97_HAS_NO_VIDEO)) {
 		if (snd_ac97_try_volume_mix(ac97, AC97_VIDEO)) {
-			if ((err = snd_ac97_cmix_new(card, "Video Playback", AC97_VIDEO, ac97)) < 0)
+			if ((err = snd_ac97_cmix_new(card, "Video Playback",
+						     AC97_VIDEO, 1, ac97)) < 0)
 				return err;
 		}
 	}
@@ -1350,7 +1396,8 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	/* build Aux controls */
 	if (!(ac97->flags & AC97_HAS_NO_AUX)) {
 		if (snd_ac97_try_volume_mix(ac97, AC97_AUX)) {
-			if ((err = snd_ac97_cmix_new(card, "Aux Playback", AC97_AUX, ac97)) < 0)
+			if ((err = snd_ac97_cmix_new(card, "Aux Playback",
+						     AC97_AUX, 1, ac97)) < 0)
 				return err;
 		}
 	}
@@ -1385,9 +1432,12 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 	} else {
 		if (!(ac97->flags & AC97_HAS_NO_STD_PCM)) {
 			if (ac97->flags & AC97_HAS_NO_PCM_VOL)
-				err = snd_ac97_cmute_new(card, "PCM Playback Switch", AC97_PCM, ac97);
+				err = snd_ac97_cmute_new(card,
+							 "PCM Playback Switch",
+							 AC97_PCM, 0, ac97);
 			else
-				err = snd_ac97_cmix_new(card, "PCM Playback", AC97_PCM, ac97);
+				err = snd_ac97_cmix_new(card, "PCM Playback",
+							AC97_PCM, 0, ac97);
 			if (err < 0)
 				return err;
 		}
@@ -1398,7 +1448,9 @@ static int snd_ac97_mixer_build(struct snd_ac97 * ac97)
 		if ((err = snd_ctl_add(card, snd_ac97_cnew(&snd_ac97_control_capture_src, ac97))) < 0)
 			return err;
 		if (snd_ac97_try_bit(ac97, AC97_REC_GAIN, 15)) {
-			if ((err = snd_ac97_cmute_new(card, "Capture Switch", AC97_REC_GAIN, ac97)) < 0)
+			err = snd_ac97_cmute_new(card, "Capture Switch",
+						 AC97_REC_GAIN, 0, ac97);
+			if (err < 0)
 				return err;
 		}
 		if ((err = snd_ctl_add(card, snd_ac97_cnew(&snd_ac97_control_capture_vol, ac97))) < 0)
@@ -1829,6 +1881,13 @@ static int snd_ac97_dev_disconnect(struct snd_device *device)
 /* build_ops to do nothing */
 static struct snd_ac97_build_ops null_build_ops;
 
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+static void do_update_power(void *data)
+{
+	update_power_regs(data);
+}
+#endif
+
 /**
  * snd_ac97_mixer - create an Codec97 component
  * @bus: the AC97 bus which codec is attached to
@@ -1883,6 +1942,10 @@ int snd_ac97_mixer(struct snd_ac97_bus *bus, struct snd_ac97_template *template,
 	bus->codec[ac97->num] = ac97;
 	mutex_init(&ac97->reg_mutex);
 	mutex_init(&ac97->page_mutex);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	ac97->power_workq = create_workqueue("ac97");
+	INIT_WORK(&ac97->power_work, do_update_power, ac97);
+#endif
 
 #ifdef CONFIG_PCI
 	if (ac97->pci) {
@@ -2117,15 +2180,8 @@ int snd_ac97_mixer(struct snd_ac97_bus *bus, struct snd_ac97_template *template,
 			return -ENOMEM;
 		}
 	}
-	/* make sure the proper powerdown bits are cleared */
-	if (ac97->scaps && ac97_is_audio(ac97)) {
-		reg = snd_ac97_read(ac97, AC97_EXTENDED_STATUS);
-		if (ac97->scaps & AC97_SCAP_SURROUND_DAC) 
-			reg &= ~AC97_EA_PRJ;
-		if (ac97->scaps & AC97_SCAP_CENTER_LFE_DAC) 
-			reg &= ~(AC97_EA_PRI | AC97_EA_PRK);
-		snd_ac97_write_cache(ac97, AC97_EXTENDED_STATUS, reg);
-	}
+	if (ac97_is_audio(ac97))
+		update_power_regs(ac97);
 	snd_ac97_proc_init(ac97);
 	if ((err = snd_device_new(card, SNDRV_DEV_CODEC, ac97, &ops)) < 0) {
 		snd_ac97_free(ac97);
@@ -2153,22 +2209,155 @@ static void snd_ac97_powerdown(struct snd_ac97 *ac97)
 		snd_ac97_write(ac97, AC97_HEADPHONE, 0x9f9f);
 	}
 
-	power = ac97->regs[AC97_POWERDOWN] | 0x8000;	/* EAPD */
-	power |= 0x4000;	/* Headphone amplifier powerdown */
-	power |= 0x0300;	/* ADC & DAC powerdown */
+	/* surround, CLFE, mic powerdown */
+	power = ac97->regs[AC97_EXTENDED_STATUS];
+	if (ac97->scaps & AC97_SCAP_SURROUND_DAC)
+		power |= AC97_EA_PRJ;
+	if (ac97->scaps & AC97_SCAP_CENTER_LFE_DAC)
+		power |= AC97_EA_PRI | AC97_EA_PRK;
+	power |= AC97_EA_PRL;
+	snd_ac97_write(ac97, AC97_EXTENDED_STATUS, power);
+
+	/* powerdown external amplifier */
+	if (ac97->scaps & AC97_SCAP_INV_EAPD)
+		power = ac97->regs[AC97_POWERDOWN] & ~AC97_PD_EAPD;
+	else if (! (ac97->scaps & AC97_SCAP_EAPD_LED))
+		power = ac97->regs[AC97_POWERDOWN] | AC97_PD_EAPD;
+	power |= AC97_PD_PR6;	/* Headphone amplifier powerdown */
+	power |= AC97_PD_PR0 | AC97_PD_PR1;	/* ADC & DAC powerdown */
 	snd_ac97_write(ac97, AC97_POWERDOWN, power);
 	udelay(100);
-	power |= 0x0400;	/* Analog Mixer powerdown (Vref on) */
-	snd_ac97_write(ac97, AC97_POWERDOWN, power);
-	udelay(100);
-#if 0
-	/* FIXME: this causes click noises on some boards at resume */
-	power |= 0x3800;	/* AC-link powerdown, internal Clk disable */
+	power |= AC97_PD_PR2 | AC97_PD_PR3;	/* Analog Mixer powerdown */
 	snd_ac97_write(ac97, AC97_POWERDOWN, power);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	if (power_save) {
+		udelay(100);
+		/* AC-link powerdown, internal Clk disable */
+		/* FIXME: this may cause click noises on some boards */
+		power |= AC97_PD_PR4 | AC97_PD_PR5;
+		snd_ac97_write(ac97, AC97_POWERDOWN, power);
+	}
 #endif
 }
 
 
+struct ac97_power_reg {
+	unsigned short reg;
+	unsigned short power_reg;
+	unsigned short mask;
+};
+
+enum { PWIDX_ADC, PWIDX_FRONT, PWIDX_CLFE, PWIDX_SURR, PWIDX_MIC, PWIDX_SIZE };
+
+static struct ac97_power_reg power_regs[PWIDX_SIZE] = {
+	[PWIDX_ADC] = { AC97_PCM_LR_ADC_RATE, AC97_POWERDOWN, AC97_PD_PR0},
+	[PWIDX_FRONT] = { AC97_PCM_FRONT_DAC_RATE, AC97_POWERDOWN, AC97_PD_PR1},
+	[PWIDX_CLFE] = { AC97_PCM_LFE_DAC_RATE, AC97_EXTENDED_STATUS,
+			 AC97_EA_PRI | AC97_EA_PRK},
+	[PWIDX_SURR] = { AC97_PCM_SURR_DAC_RATE, AC97_EXTENDED_STATUS,
+			 AC97_EA_PRJ},
+	[PWIDX_MIC] = { AC97_PCM_MIC_ADC_RATE, AC97_EXTENDED_STATUS,
+			AC97_EA_PRL},
+};
+
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+/**
+ * snd_ac97_update_power - update the powerdown register
+ * @ac97: the codec instance
+ * @reg: the rate register, e.g. AC97_PCM_FRONT_DAC_RATE
+ * @powerup: non-zero when power up the part
+ *
+ * Update the AC97 powerdown register bits of the given part.
+ */
+int snd_ac97_update_power(struct snd_ac97 *ac97, int reg, int powerup)
+{
+	int i;
+
+	if (! ac97)
+		return 0;
+
+	if (reg) {
+		/* SPDIF requires DAC power, too */
+		if (reg == AC97_SPDIF)
+			reg = AC97_PCM_FRONT_DAC_RATE;
+		for (i = 0; i < PWIDX_SIZE; i++) {
+			if (power_regs[i].reg == reg) {
+				if (powerup)
+					ac97->power_up |= (1 << i);
+				else
+					ac97->power_up &= ~(1 << i);
+				break;
+			}
+		}
+	}
+
+	if (! power_save)
+		return 0;
+
+	if (! powerup && ac97->power_workq)
+		/* adjust power-down bits after two seconds delay
+		 * (for avoiding loud click noises for many (OSS) apps
+		 *  that open/close frequently)
+		 */
+		queue_delayed_work(ac97->power_workq, &ac97->power_work, HZ*2);
+	else
+		update_power_regs(ac97);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(snd_ac97_update_power);
+#endif /* CONFIG_SND_AC97_POWER_SAVE */
+
+static void update_power_regs(struct snd_ac97 *ac97)
+{
+	unsigned int power_up, bits;
+	int i;
+
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	if (power_save)
+		power_up = ac97->power_up;
+	else {
+#endif
+		power_up = (1 << PWIDX_FRONT) | (1 << PWIDX_ADC);
+		power_up |= (1 << PWIDX_MIC);
+		if (ac97->scaps & AC97_SCAP_SURROUND_DAC)
+			power_up |= (1 << PWIDX_SURR);
+		if (ac97->scaps & AC97_SCAP_CENTER_LFE_DAC)
+			power_up |= (1 << PWIDX_CLFE);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	}
+#endif
+	if (power_up) {
+		if (ac97->regs[AC97_POWERDOWN] & AC97_PD_PR2) {
+			/* needs power-up analog mix and vref */
+			snd_ac97_update_bits(ac97, AC97_POWERDOWN,
+					     AC97_PD_PR3, 0);
+			msleep(1);
+			snd_ac97_update_bits(ac97, AC97_POWERDOWN,
+					     AC97_PD_PR2, 0);
+		}
+	}
+	for (i = 0; i < PWIDX_SIZE; i++) {
+		if (power_up & (1 << i))
+			bits = 0;
+		else
+			bits = power_regs[i].mask;
+		snd_ac97_update_bits(ac97, power_regs[i].power_reg,
+				     power_regs[i].mask, bits);
+	}
+	if (! power_up) {
+		if (! (ac97->regs[AC97_POWERDOWN] & AC97_PD_PR2)) {
+			/* power down analog mix and vref */
+			snd_ac97_update_bits(ac97, AC97_POWERDOWN,
+					     AC97_PD_PR2, AC97_PD_PR2);
+			snd_ac97_update_bits(ac97, AC97_POWERDOWN,
+					     AC97_PD_PR3, AC97_PD_PR3);
+		}
+	}
+}
+
+
 #ifdef CONFIG_PM
 /**
  * snd_ac97_suspend - General suspend function for AC97 codec
@@ -2484,6 +2673,7 @@ static int tune_mute_led(struct snd_ac97 *ac97)
 	msw->put = master_mute_sw_put;
 	snd_ac97_remove_ctl(ac97, "External Amplifier", NULL);
 	snd_ac97_update_bits(ac97, AC97_POWERDOWN, 0x8000, 0x8000); /* mute LED on */
+	ac97->scaps |= AC97_SCAP_EAPD_LED;
 	return 0;
 }
 
diff --git a/sound/pci/ac97/ac97_pcm.c b/sound/pci/ac97/ac97_pcm.c
index f684aa2c006..3758d07182f 100644
--- a/sound/pci/ac97/ac97_pcm.c
+++ b/sound/pci/ac97/ac97_pcm.c
@@ -269,6 +269,7 @@ int snd_ac97_set_rate(struct snd_ac97 *ac97, int reg, unsigned int rate)
 			return -EINVAL;
 	}
 
+	snd_ac97_update_power(ac97, reg, 1);
 	switch (reg) {
 	case AC97_PCM_MIC_ADC_RATE:
 		if ((ac97->regs[AC97_EXTENDED_STATUS] & AC97_EA_VRM) == 0)	/* MIC VRA */
@@ -606,6 +607,7 @@ int snd_ac97_pcm_open(struct ac97_pcm *pcm, unsigned int rate,
 			goto error;
 		}
 	}
+	pcm->cur_dbl = r;
 	spin_unlock_irq(&pcm->bus->bus_lock);
 	for (i = 3; i < 12; i++) {
 		if (!(slots & (1 << i)))
@@ -651,6 +653,21 @@ int snd_ac97_pcm_close(struct ac97_pcm *pcm)
 	unsigned short slots = pcm->aslots;
 	int i, cidx;
 
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	int r = pcm->cur_dbl;
+	for (i = 3; i < 12; i++) {
+		if (!(slots & (1 << i)))
+			continue;
+		for (cidx = 0; cidx < 4; cidx++) {
+			if (pcm->r[r].rslots[cidx] & (1 << i)) {
+				int reg = get_slot_reg(pcm, cidx, i, r);
+				snd_ac97_update_power(pcm->r[r].codec[cidx],
+						      reg, 0);
+			}
+		}
+	}
+#endif
+
 	bus = pcm->bus;
 	spin_lock_irq(&pcm->bus->bus_lock);
 	for (i = 3; i < 12; i++) {
@@ -660,6 +677,7 @@ int snd_ac97_pcm_close(struct ac97_pcm *pcm)
 			bus->used_slots[pcm->stream][cidx] &= ~(1 << i);
 	}
 	pcm->aslots = 0;
+	pcm->cur_dbl = 0;
 	spin_unlock_irq(&pcm->bus->bus_lock);
 	return 0;
 }
diff --git a/sound/pci/intel8x0.c b/sound/pci/intel8x0.c
index 6874263f168..72dbaedcbdf 100644
--- a/sound/pci/intel8x0.c
+++ b/sound/pci/intel8x0.c
@@ -2251,6 +2251,16 @@ static int snd_intel8x0_ich_chip_init(struct intel8x0 *chip, int probing)
 	/* ACLink on, 2 channels */
 	cnt = igetdword(chip, ICHREG(GLOB_CNT));
 	cnt &= ~(ICH_ACLINK | ICH_PCM_246_MASK);
+#ifdef CONFIG_SND_AC97_POWER_SAVE
+	/* do cold reset - the full ac97 powerdown may leave the controller
+	 * in a warm state but actually it cannot communicate with the codec.
+	 */
+	iputdword(chip, ICHREG(GLOB_CNT), cnt & ~ICH_AC97COLD);
+	cnt = igetdword(chip, ICHREG(GLOB_CNT));
+	udelay(10);
+	iputdword(chip, ICHREG(GLOB_CNT), cnt | ICH_AC97COLD);
+	msleep(1);
+#else
 	/* finish cold or do warm reset */
 	cnt |= (cnt & ICH_AC97COLD) == 0 ? ICH_AC97COLD : ICH_AC97WARM;
 	iputdword(chip, ICHREG(GLOB_CNT), cnt);
@@ -2265,6 +2275,7 @@ static int snd_intel8x0_ich_chip_init(struct intel8x0 *chip, int probing)
 	return -EIO;
 
       __ok:
+#endif
 	if (probing) {
 		/* wait for any codec ready status.
 		 * Once it becomes ready it should remain ready
@@ -2485,7 +2496,7 @@ static int intel8x0_resume(struct pci_dev *pci)
 		    card->shortname, chip);
 	chip->irq = pci->irq;
 	synchronize_irq(chip->irq);
-	snd_intel8x0_chip_init(chip, 1);
+	snd_intel8x0_chip_init(chip, 0);
 
 	/* re-initialize mixer stuff */
 	if (chip->device_type == DEVICE_INTEL_ICH4) {
@@ -2615,6 +2626,7 @@ static void __devinit intel8x0_measure_ac97_clock(struct intel8x0 *chip)
 		/* not 48000Hz, tuning the clock.. */
 		chip->ac97_bus->clock = (chip->ac97_bus->clock * 48000) / pos;
 	printk(KERN_INFO "intel8x0: clocking to %d\n", chip->ac97_bus->clock);
+	snd_ac97_update_power(chip->ac97[0], AC97_PCM_FRONT_DAC_RATE, 0);
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/sound/pci/via82xx.c b/sound/pci/via82xx.c
index 08da9234efb..2c23a665c3e 100644
--- a/sound/pci/via82xx.c
+++ b/sound/pci/via82xx.c
@@ -1277,7 +1277,18 @@ static int snd_via82xx_pcm_close(struct snd_pcm_substream *substream)
 	if (! ratep->used)
 		ratep->rate = 0;
 	spin_unlock_irq(&ratep->lock);
-
+	if (! ratep->rate) {
+		if (! viadev->direction) {
+			snd_ac97_update_power(chip->ac97,
+					      AC97_PCM_FRONT_DAC_RATE, 0);
+			snd_ac97_update_power(chip->ac97,
+					      AC97_PCM_SURR_DAC_RATE, 0);
+			snd_ac97_update_power(chip->ac97,
+					      AC97_PCM_LFE_DAC_RATE, 0);
+		} else
+			snd_ac97_update_power(chip->ac97,
+					      AC97_PCM_LR_ADC_RATE, 0);
+	}
 	viadev->substream = NULL;
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 8aa9b586e42099817163aba01d925c2660c4dbbe Mon Sep 17 00:00:00 2001
From: Jaroslav Kysela <perex@suse.cz>
Date: Wed, 5 Jul 2006 17:34:51 +0200
Subject: [ALSA] Control API - more robust TLV implementation

- added callback option
- added READ/WRITE/COMMAND flags to access member
- added WRITE/COMMAND ioctls
- added SNDRV_CTL_EVENT_MASK_TLV for TLV change notifications
- added TLV support to ELEM_ADD ioctl

Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/asound.h  |  10 +++-
 include/sound/control.h |  16 +++++-
 sound/core/control.c    | 139 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 132 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/sound/asound.h b/include/sound/asound.h
index 76a20406bd1..c1621c650a9 100644
--- a/include/sound/asound.h
+++ b/include/sound/asound.h
@@ -727,10 +727,15 @@ typedef int __bitwise snd_ctl_elem_iface_t;
 #define SNDRV_CTL_ELEM_ACCESS_WRITE		(1<<1)
 #define SNDRV_CTL_ELEM_ACCESS_READWRITE		(SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE)
 #define SNDRV_CTL_ELEM_ACCESS_VOLATILE		(1<<2)	/* control value may be changed without a notification */
-#define SNDRV_CTL_ELEM_ACCESS_TIMESTAMP		(1<<2)	/* when was control changed */
+#define SNDRV_CTL_ELEM_ACCESS_TIMESTAMP		(1<<3)	/* when was control changed */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_READ		(1<<4)	/* TLV read is possible */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE		(1<<5)	/* TLV write is possible */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE	(SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE)
+#define SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND	(1<<6)	/* TLV command is possible */
 #define SNDRV_CTL_ELEM_ACCESS_INACTIVE		(1<<8)	/* control does actually nothing, but may be updated */
 #define SNDRV_CTL_ELEM_ACCESS_LOCK		(1<<9)	/* write lock */
 #define SNDRV_CTL_ELEM_ACCESS_OWNER		(1<<10)	/* write lock owner */
+#define SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK	(1<<28)	/* kernel use a TLV callback */ 
 #define SNDRV_CTL_ELEM_ACCESS_USER		(1<<29) /* user space element */
 #define SNDRV_CTL_ELEM_ACCESS_DINDIRECT		(1<<30)	/* indirect access for matrix dimensions in the info structure */
 #define SNDRV_CTL_ELEM_ACCESS_INDIRECT		(1<<31)	/* indirect access for element value in the value structure */
@@ -838,6 +843,8 @@ enum {
 	SNDRV_CTL_IOCTL_ELEM_REPLACE = _IOWR('U', 0x18, struct snd_ctl_elem_info),
 	SNDRV_CTL_IOCTL_ELEM_REMOVE = _IOWR('U', 0x19, struct snd_ctl_elem_id),
 	SNDRV_CTL_IOCTL_TLV_READ = _IOWR('U', 0x1a, struct snd_ctl_tlv),
+	SNDRV_CTL_IOCTL_TLV_WRITE = _IOWR('U', 0x1b, struct snd_ctl_tlv),
+	SNDRV_CTL_IOCTL_TLV_COMMAND = _IOWR('U', 0x1c, struct snd_ctl_tlv),
 	SNDRV_CTL_IOCTL_HWDEP_NEXT_DEVICE = _IOWR('U', 0x20, int),
 	SNDRV_CTL_IOCTL_HWDEP_INFO = _IOR('U', 0x21, struct snd_hwdep_info),
 	SNDRV_CTL_IOCTL_PCM_NEXT_DEVICE = _IOR('U', 0x30, int),
@@ -862,6 +869,7 @@ enum sndrv_ctl_event_type {
 #define SNDRV_CTL_EVENT_MASK_VALUE	(1<<0)	/* element value was changed */
 #define SNDRV_CTL_EVENT_MASK_INFO	(1<<1)	/* element info was changed */
 #define SNDRV_CTL_EVENT_MASK_ADD	(1<<2)	/* element was added */
+#define SNDRV_CTL_EVENT_MASK_TLV	(1<<3)	/* element TLV tree was changed */
 #define SNDRV_CTL_EVENT_MASK_REMOVE	(~0U)	/* element was removed */
 
 struct snd_ctl_event {
diff --git a/include/sound/control.h b/include/sound/control.h
index a93a58d0e68..e3905c5a095 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -30,6 +30,11 @@ struct snd_kcontrol;
 typedef int (snd_kcontrol_info_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_info * uinfo);
 typedef int (snd_kcontrol_get_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_value * ucontrol);
 typedef int (snd_kcontrol_put_t) (struct snd_kcontrol * kcontrol, struct snd_ctl_elem_value * ucontrol);
+typedef int (snd_kcontrol_tlv_rw_t)(struct snd_kcontrol *kcontrol,
+				    int op_flag, /* 0=read,1=write,-1=command */
+				    unsigned int size,
+				    unsigned int __user *tlv);
+
 
 struct snd_kcontrol_new {
 	snd_ctl_elem_iface_t iface;	/* interface identifier */
@@ -42,7 +47,10 @@ struct snd_kcontrol_new {
 	snd_kcontrol_info_t *info;
 	snd_kcontrol_get_t *get;
 	snd_kcontrol_put_t *put;
-	unsigned int *tlv;
+	union {
+		snd_kcontrol_tlv_rw_t *c;
+		unsigned int *p;
+	} tlv;
 	unsigned long private_value;
 };
 
@@ -59,7 +67,11 @@ struct snd_kcontrol {
 	snd_kcontrol_info_t *info;
 	snd_kcontrol_get_t *get;
 	snd_kcontrol_put_t *put;
-	unsigned int *tlv;
+	snd_kcontrol_tlv_rw_t *tlv_rw;
+	union {
+		snd_kcontrol_tlv_rw_t *c;
+		unsigned int *p;
+	} tlv;
 	unsigned long private_value;
 	void *private_data;
 	void (*private_free)(struct snd_kcontrol *kcontrol);
diff --git a/sound/core/control.c b/sound/core/control.c
index f0c7272a2d4..31ad58154c0 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -236,12 +236,16 @@ struct snd_kcontrol *snd_ctl_new1(const struct snd_kcontrol_new *ncontrol,
 	kctl.id.index = ncontrol->index;
 	kctl.count = ncontrol->count ? ncontrol->count : 1;
 	access = ncontrol->access == 0 ? SNDRV_CTL_ELEM_ACCESS_READWRITE :
-		 (ncontrol->access & (SNDRV_CTL_ELEM_ACCESS_READWRITE|SNDRV_CTL_ELEM_ACCESS_INACTIVE|
-		 		      SNDRV_CTL_ELEM_ACCESS_DINDIRECT|SNDRV_CTL_ELEM_ACCESS_INDIRECT));
+		 (ncontrol->access & (SNDRV_CTL_ELEM_ACCESS_READWRITE|
+				      SNDRV_CTL_ELEM_ACCESS_INACTIVE|
+		 		      SNDRV_CTL_ELEM_ACCESS_DINDIRECT|
+		 		      SNDRV_CTL_ELEM_ACCESS_INDIRECT|
+		 		      SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE|
+		 		      SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK));
 	kctl.info = ncontrol->info;
 	kctl.get = ncontrol->get;
 	kctl.put = ncontrol->put;
-	kctl.tlv = ncontrol->tlv;
+	kctl.tlv.p = ncontrol->tlv.p;
 	kctl.private_value = ncontrol->private_value;
 	kctl.private_data = private_data;
 	return snd_ctl_new(&kctl, access);
@@ -883,6 +887,8 @@ struct user_element {
 	struct snd_ctl_elem_info info;
 	void *elem_data;		/* element data */
 	unsigned long elem_data_size;	/* size of element data in bytes */
+	void *tlv_data;			/* TLV data */
+	unsigned long tlv_data_size;	/* TLV data size */
 	void *priv_data;		/* private data (like strings for enumerated type) */
 	unsigned long priv_data_size;	/* size of private data in bytes */
 };
@@ -917,9 +923,46 @@ static int snd_ctl_elem_user_put(struct snd_kcontrol *kcontrol,
 	return change;
 }
 
+static int snd_ctl_elem_user_tlv(struct snd_kcontrol *kcontrol,
+				 int op_flag,
+				 unsigned int size,
+				 unsigned int __user *tlv)
+{
+	struct user_element *ue = kcontrol->private_data;
+	int change = 0;
+	void *new_data;
+
+	if (op_flag > 0) {
+		if (size > 1024 * 128)	/* sane value */
+			return -EINVAL;
+		new_data = kmalloc(size, GFP_KERNEL);
+		if (new_data == NULL)
+			return -ENOMEM;
+		if (copy_from_user(new_data, tlv, size)) {
+			kfree(new_data);
+			return -EFAULT;
+		}
+		change = ue->tlv_data_size != size;
+		if (!change)
+			change = memcmp(ue->tlv_data, new_data, size);
+		kfree(ue->tlv_data);
+		ue->tlv_data = new_data;
+		ue->tlv_data_size = size;
+	} else {
+		if (size < ue->tlv_data_size)
+			return -ENOSPC;
+		if (copy_to_user(tlv, ue->tlv_data, ue->tlv_data_size))
+			return -EFAULT;
+	}
+	return change;
+}
+
 static void snd_ctl_elem_user_free(struct snd_kcontrol *kcontrol)
 {
-	kfree(kcontrol->private_data);
+	struct user_element *ue = kcontrol->private_data;
+	if (ue->tlv_data)
+		kfree(ue->tlv_data);
+	kfree(ue);
 }
 
 static int snd_ctl_elem_add(struct snd_ctl_file *file,
@@ -938,7 +981,8 @@ static int snd_ctl_elem_add(struct snd_ctl_file *file,
 		return -EINVAL;
 	access = info->access == 0 ? SNDRV_CTL_ELEM_ACCESS_READWRITE :
 		(info->access & (SNDRV_CTL_ELEM_ACCESS_READWRITE|
-				 SNDRV_CTL_ELEM_ACCESS_INACTIVE));
+				 SNDRV_CTL_ELEM_ACCESS_INACTIVE|
+				 SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE));
 	info->id.numid = 0;
 	memset(&kctl, 0, sizeof(kctl));
 	down_write(&card->controls_rwsem);
@@ -964,6 +1008,10 @@ static int snd_ctl_elem_add(struct snd_ctl_file *file,
 		kctl.get = snd_ctl_elem_user_get;
 	if (access & SNDRV_CTL_ELEM_ACCESS_WRITE)
 		kctl.put = snd_ctl_elem_user_put;
+	if (access & SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE) {
+		kctl.tlv.c = snd_ctl_elem_user_tlv;
+		access |= SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK;
+	}
 	switch (info->type) {
 	case SNDRV_CTL_ELEM_TYPE_BOOLEAN:
 		private_size = sizeof(char);
@@ -1068,38 +1116,65 @@ static int snd_ctl_subscribe_events(struct snd_ctl_file *file, int __user *ptr)
 	return 0;
 }
 
-static int snd_ctl_tlv_read(struct snd_card *card,
-                            struct snd_ctl_tlv __user *_tlv)
+static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file,
+                             struct snd_ctl_tlv __user *_tlv,
+                             int op_flag)
 {
+	struct snd_card *card = file->card;
 	struct snd_ctl_tlv tlv;
 	struct snd_kcontrol *kctl;
+	struct snd_kcontrol_volatile *vd;
 	unsigned int len;
 	int err = 0;
 
 	if (copy_from_user(&tlv, _tlv, sizeof(tlv)))
 		return -EFAULT;
-        if (tlv.length < sizeof(unsigned int) * 3)
-                return -EINVAL;
-        down_read(&card->controls_rwsem);
-        kctl = snd_ctl_find_numid(card, tlv.numid);
-        if (kctl == NULL) {
-                err = -ENOENT;
-                goto __kctl_end;
-        }
-        if (kctl->tlv == NULL) {
-                err = -ENXIO;
-                goto __kctl_end;
-        }
-        len = kctl->tlv[1] + 2 * sizeof(unsigned int);
-        if (tlv.length < len) {
-                err = -ENOMEM;
-                goto __kctl_end;
-        }
-        if (copy_to_user(_tlv->tlv, kctl->tlv, len))
-        	err = -EFAULT;
+	if (tlv.length < sizeof(unsigned int) * 3)
+		return -EINVAL;
+	down_read(&card->controls_rwsem);
+	kctl = snd_ctl_find_numid(card, tlv.numid);
+	if (kctl == NULL) {
+		err = -ENOENT;
+		goto __kctl_end;
+	}
+	if (kctl->tlv.p == NULL) {
+		err = -ENXIO;
+		goto __kctl_end;
+	}
+	vd = &kctl->vd[tlv.numid - kctl->id.numid];
+	if ((op_flag == 0 && (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_READ) == 0) ||
+	    (op_flag > 0 && (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) == 0) ||
+	    (op_flag < 0 && (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND) == 0)) {
+	    	err = -ENXIO;
+	    	goto __kctl_end;
+	}
+	if (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) {
+		if (file && vd->owner != NULL && vd->owner != file) {
+			err = -EPERM;
+			goto __kctl_end;
+		}
+		err = kctl->tlv.c(kctl, op_flag, tlv.length, _tlv->tlv); 
+		if (err > 0) {
+			up_read(&card->controls_rwsem);
+			snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_TLV, &kctl->id);
+			return 0;
+		}
+	} else {
+		if (op_flag) {
+			err = -ENXIO;
+			goto __kctl_end;
+		}
+		len = kctl->tlv.p[1] + 2 * sizeof(unsigned int);
+		if (tlv.length < len) {
+			err = -ENOMEM;
+			goto __kctl_end;
+		}
+		if (copy_to_user(_tlv->tlv, kctl->tlv.p, len))
+			err = -EFAULT;
+	}
       __kctl_end:
-        up_read(&card->controls_rwsem);
-        return err;
+	up_read(&card->controls_rwsem);
+	return err;
 }
 
 static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -1140,8 +1215,12 @@ static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg
 		return snd_ctl_elem_remove(ctl, argp);
 	case SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS:
 		return snd_ctl_subscribe_events(ctl, ip);
-        case SNDRV_CTL_IOCTL_TLV_READ:
-                return snd_ctl_tlv_read(card, argp);
+	case SNDRV_CTL_IOCTL_TLV_READ:
+		return snd_ctl_tlv_ioctl(ctl, argp, 0);
+	case SNDRV_CTL_IOCTL_TLV_WRITE:
+		return snd_ctl_tlv_ioctl(ctl, argp, 1);
+	case SNDRV_CTL_IOCTL_TLV_COMMAND:
+		return snd_ctl_tlv_ioctl(ctl, argp, -1);
 	case SNDRV_CTL_IOCTL_POWER:
 		return -ENOPROTOOPT;
 	case SNDRV_CTL_IOCTL_POWER_STATE:
-- 
cgit v1.2.3-70-g09d2


From 6a65d793b0a82c7e190d9fd92a479401b6a127ca Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 14 Jul 2006 14:39:34 +0200
Subject: [ALSA] Remove unused tlv_rw field from struct snd_kcontrol

Remove unused tlv_rw field from struct snd_kcontrol.  The callback is
set in tlv.c field, instead.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/control.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/control.h b/include/sound/control.h
index e3905c5a095..1de148b0fd9 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -67,7 +67,6 @@ struct snd_kcontrol {
 	snd_kcontrol_info_t *info;
 	snd_kcontrol_get_t *get;
 	snd_kcontrol_put_t *put;
-	snd_kcontrol_tlv_rw_t *tlv_rw;
 	union {
 		snd_kcontrol_tlv_rw_t *c;
 		unsigned int *p;
-- 
cgit v1.2.3-70-g09d2


From 31508f83f591dc8764427b6321c89f8f9e84bad2 Mon Sep 17 00:00:00 2001
From: James Courtier-Dutton <James@superbug.co.uk>
Date: Sat, 22 Jul 2006 17:02:10 +0100
Subject: [ALSA] snd-emu10k1: Implement dB gain infomation.

Signed-off-by: James Courtier-Dutton <James@superbug.co.uk>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/emu10k1.h   |  4 ++++
 sound/pci/emu10k1/emufx.c | 11 +++++++++++
 sound/pci/emu10k1/p16v.c  |  5 +++++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 884bbf54cd3..892e310c504 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1524,6 +1524,10 @@ struct snd_emu10k1_fx8010_control_gpr {
 	unsigned int value[32];		/* initial values */
 	unsigned int min;		/* minimum range */
 	unsigned int max;		/* maximum range */
+	union {
+		snd_kcontrol_tlv_rw_t *c;
+		unsigned int *p;
+	} tlv;
 	unsigned int translation;	/* translation type (EMU10K1_GPR_TRANSLATION*) */
 };
 
diff --git a/sound/pci/emu10k1/emufx.c b/sound/pci/emu10k1/emufx.c
index dfba00230d4..00fc904c251 100644
--- a/sound/pci/emu10k1/emufx.c
+++ b/sound/pci/emu10k1/emufx.c
@@ -35,6 +35,7 @@
 #include <linux/mutex.h>
 
 #include <sound/core.h>
+#include <sound/tlv.h>
 #include <sound/emu10k1.h>
 
 #if 0		/* for testing purposes - digital out -> capture */
@@ -290,6 +291,9 @@ static const u32 db_table[101] = {
 	0x7fffffff,
 };
 
+/* EMU10k1/EMU10k2 DSP control db gain */
+static DECLARE_TLV_DB_SCALE(snd_emu10k1_db_scale1, -4000, 40, 1);
+
 static const u32 onoff_table[2] = {
 	0x00000000, 0x00000001
 };
@@ -755,6 +759,11 @@ static int snd_emu10k1_add_controls(struct snd_emu10k1 *emu,
 		knew.device = gctl->id.device;
 		knew.subdevice = gctl->id.subdevice;
 		knew.info = snd_emu10k1_gpr_ctl_info;
+		if (gctl->tlv.p) {
+			knew.tlv.p = gctl->tlv.p;
+			knew.access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+				SNDRV_CTL_ELEM_ACCESS_TLV_READ;
+		} 
 		knew.get = snd_emu10k1_gpr_ctl_get;
 		knew.put = snd_emu10k1_gpr_ctl_put;
 		memset(nctl, 0, sizeof(*nctl));
@@ -1013,6 +1022,7 @@ snd_emu10k1_init_mono_control(struct snd_emu10k1_fx8010_control_gpr *ctl,
 	ctl->gpr[0] = gpr + 0; ctl->value[0] = defval;
 	ctl->min = 0;
 	ctl->max = 100;
+	ctl->tlv.p = snd_emu10k1_db_scale1;
 	ctl->translation = EMU10K1_GPR_TRANSLATION_TABLE100;	
 }
 
@@ -1027,6 +1037,7 @@ snd_emu10k1_init_stereo_control(struct snd_emu10k1_fx8010_control_gpr *ctl,
 	ctl->gpr[1] = gpr + 1; ctl->value[1] = defval;
 	ctl->min = 0;
 	ctl->max = 100;
+	ctl->tlv.p = snd_emu10k1_db_scale1;
 	ctl->translation = EMU10K1_GPR_TRANSLATION_TABLE100;
 }
 
diff --git a/sound/pci/emu10k1/p16v.c b/sound/pci/emu10k1/p16v.c
index 9905651935f..1e44714b862 100644
--- a/sound/pci/emu10k1/p16v.c
+++ b/sound/pci/emu10k1/p16v.c
@@ -100,6 +100,7 @@
 #include <sound/pcm.h>
 #include <sound/ac97_codec.h>
 #include <sound/info.h>
+#include <sound/tlv.h>
 #include <sound/emu10k1.h>
 #include "p16v.h"
 
@@ -784,12 +785,16 @@ static int snd_p16v_capture_channel_put(struct snd_kcontrol *kcontrol,
 	}
         return change;
 }
+static DECLARE_TLV_DB_SCALE(snd_p16v_db_scale1, -5175, 25, 1);
 
 #define P16V_VOL(xname,xreg,xhl) { \
 	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \
+        .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |             \
+                  SNDRV_CTL_ELEM_ACCESS_TLV_READ,               \
 	.info = snd_p16v_volume_info, \
 	.get = snd_p16v_volume_get, \
 	.put = snd_p16v_volume_put, \
+	.tlv.p = snd_p16v_db_scale1, \
 	.private_value = ((xreg) | ((xhl) << 8)) \
 }
 
-- 
cgit v1.2.3-70-g09d2


From 548a648b98318e4b843b636dd2c7f42377e19a00 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 31 Jul 2006 16:51:51 +0200
Subject: [ALSA] Fix control/status mmap with shared PCM substream

The flag to avoid 32bit-incompatible mmap for control/status records
should be outside the pcm substream instance since a substream can be
shared among multiple opens.  Now it's flagged in pcm_file list that
is directly assigned to file->private_data.
Also, removed snd_pcm_add_file() and remove_file() functions and
substream.files field that are not really used in the code.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/pcm.h     |  4 +---
 sound/core/pcm_compat.c |  2 +-
 sound/core/pcm_native.c | 49 ++++++++++---------------------------------------
 3 files changed, 12 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index f84d84993a3..60d40b34efc 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -190,7 +190,7 @@ struct snd_pcm_ops {
 
 struct snd_pcm_file {
 	struct snd_pcm_substream *substream;
-	struct snd_pcm_file *next;
+	int no_compat_mmap;
 };
 
 struct snd_pcm_hw_rule;
@@ -384,7 +384,6 @@ struct snd_pcm_substream {
 	struct snd_info_entry *proc_prealloc_entry;
 #endif
 	/* misc flags */
-	unsigned int no_mmap_ctrl: 1;
 	unsigned int hw_opened: 1;
 };
 
@@ -402,7 +401,6 @@ struct snd_pcm_str {
 	/* -- OSS things -- */
 	struct snd_pcm_oss_stream oss;
 #endif
-	struct snd_pcm_file *files;
 #ifdef CONFIG_SND_VERBOSE_PROCFS
 	struct snd_info_entry *proc_root;
 	struct snd_info_entry *proc_info_entry;
diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c
index 2b8aab6fd6c..2b539799d23 100644
--- a/sound/core/pcm_compat.c
+++ b/sound/core/pcm_compat.c
@@ -478,7 +478,7 @@ static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned l
 	 * mmap of PCM status/control records because of the size
 	 * incompatibility.
 	 */
-	substream->no_mmap_ctrl = 1;
+	pcm_file->no_compat_mmap = 1;
 
 	switch (cmd) {
 	case SNDRV_PCM_IOCTL_PVERSION:
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 439f047929e..0224c70414f 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -1992,35 +1992,9 @@ int snd_pcm_hw_constraints_complete(struct snd_pcm_substream *substream)
 	return 0;
 }
 
-static void snd_pcm_add_file(struct snd_pcm_str *str,
-			     struct snd_pcm_file *pcm_file)
-{
-	pcm_file->next = str->files;
-	str->files = pcm_file;
-}
-
-static void snd_pcm_remove_file(struct snd_pcm_str *str,
-				struct snd_pcm_file *pcm_file)
-{
-	struct snd_pcm_file * pcm_file1;
-	if (str->files == pcm_file) {
-		str->files = pcm_file->next;
-	} else {
-		pcm_file1 = str->files;
-		while (pcm_file1 && pcm_file1->next != pcm_file)
-			pcm_file1 = pcm_file1->next;
-		if (pcm_file1 != NULL)
-			pcm_file1->next = pcm_file->next;
-	}
-}
-
 static void pcm_release_private(struct snd_pcm_substream *substream)
 {
-	struct snd_pcm_file *pcm_file = substream->file;
-
 	snd_pcm_unlink(substream);
-	snd_pcm_remove_file(substream->pstr, pcm_file);
-	kfree(pcm_file);
 }
 
 void snd_pcm_release_substream(struct snd_pcm_substream *substream)
@@ -2060,7 +2034,6 @@ int snd_pcm_open_substream(struct snd_pcm *pcm, int stream,
 		return 0;
 	}
 
-	substream->no_mmap_ctrl = 0;
 	err = snd_pcm_hw_constraints_init(substream);
 	if (err < 0) {
 		snd_printd("snd_pcm_hw_constraints_init failed\n");
@@ -2105,19 +2078,16 @@ static int snd_pcm_open_file(struct file *file,
 	if (err < 0)
 		return err;
 
-	if (substream->ref_count > 1)
-		pcm_file = substream->file;
-	else {
-		pcm_file = kzalloc(sizeof(*pcm_file), GFP_KERNEL);
-		if (pcm_file == NULL) {
-			snd_pcm_release_substream(substream);
-			return -ENOMEM;
-		}
+	pcm_file = kzalloc(sizeof(*pcm_file), GFP_KERNEL);
+	if (pcm_file == NULL) {
+		snd_pcm_release_substream(substream);
+		return -ENOMEM;
+	}
+	pcm_file->substream = substream;
+	if (substream->ref_count == 1) {
 		str = substream->pstr;
 		substream->file = pcm_file;
 		substream->pcm_release = pcm_release_private;
-		pcm_file->substream = substream;
-		snd_pcm_add_file(str, pcm_file);
 	}
 	file->private_data = pcm_file;
 	*rpcm_file = pcm_file;
@@ -2209,6 +2179,7 @@ static int snd_pcm_release(struct inode *inode, struct file *file)
 	fasync_helper(-1, file, 0, &substream->runtime->fasync);
 	mutex_lock(&pcm->open_mutex);
 	snd_pcm_release_substream(substream);
+	kfree(pcm_file);
 	mutex_unlock(&pcm->open_mutex);
 	wake_up(&pcm->open_wait);
 	module_put(pcm->card->module);
@@ -3270,11 +3241,11 @@ static int snd_pcm_mmap(struct file *file, struct vm_area_struct *area)
 	offset = area->vm_pgoff << PAGE_SHIFT;
 	switch (offset) {
 	case SNDRV_PCM_MMAP_OFFSET_STATUS:
-		if (substream->no_mmap_ctrl)
+		if (pcm_file->no_compat_mmap)
 			return -ENXIO;
 		return snd_pcm_mmap_status(substream, file, area);
 	case SNDRV_PCM_MMAP_OFFSET_CONTROL:
-		if (substream->no_mmap_ctrl)
+		if (pcm_file->no_compat_mmap)
 			return -ENXIO;
 		return snd_pcm_mmap_control(substream, file, area);
 	default:
-- 
cgit v1.2.3-70-g09d2


From 683fe1537e660c322c8af953773921e814791193 Mon Sep 17 00:00:00 2001
From: Jochen Voss <voss@seehuhn.de>
Date: Tue, 8 Aug 2006 21:12:44 +0200
Subject: [ALSA] Revolution 5.1 - add AK5365 ADC support

Add support for the AK5365 ADC.

Signed-off-by: Jochen Voss <voss@seehuhn.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/ak4xxx-adda.h   |  3 ++-
 sound/i2c/other/ak4xxx-adda.c | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/ak4xxx-adda.h b/include/sound/ak4xxx-adda.h
index 3d988849202..65ddfa3cac1 100644
--- a/include/sound/ak4xxx-adda.h
+++ b/include/sound/ak4xxx-adda.h
@@ -53,7 +53,8 @@ struct snd_akm4xxx {
 	unsigned int idx_offset;		/* control index offset */
 	enum {
 		SND_AK4524, SND_AK4528, SND_AK4529,
-		SND_AK4355, SND_AK4358, SND_AK4381
+		SND_AK4355, SND_AK4358, SND_AK4381,
+		SND_AK5365
 	} type;
 	unsigned int *num_stereo;	/* array of combined counts
 					 * for the mixer
diff --git a/sound/i2c/other/ak4xxx-adda.c b/sound/i2c/other/ak4xxx-adda.c
index dc7cc2001b7..7d562f08420 100644
--- a/sound/i2c/other/ak4xxx-adda.c
+++ b/sound/i2c/other/ak4xxx-adda.c
@@ -598,6 +598,31 @@ int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak)
 		if (err < 0)
 			goto __error;
 	}
+
+	if (ak->type == SND_AK5365) {
+		memset(ctl, 0, sizeof(*ctl));
+		if (ak->channel_names == NULL)
+			strcpy(ctl->id.name, "Capture Volume");
+		else
+			strcpy(ctl->id.name, ak->channel_names[0]);
+		ctl->id.index = ak->idx_offset * 2;
+		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
+		ctl->count = 1;
+		ctl->info = snd_akm4xxx_stereo_volume_info;
+		ctl->get = snd_akm4xxx_stereo_volume_get;
+		ctl->put = snd_akm4xxx_stereo_volume_put;
+		/* Registers 4 & 5 (see AK5365 data sheet, pages 34 and 35):
+		 * valid values are from 0x00 (mute) to 0x98 (+12dB).  */
+		ctl->private_value =
+			AK_COMPOSE(0, 4, 0, 0x98);
+		ctl->private_data = ak;
+		err = snd_ctl_add(ak->card,
+				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
+					      SNDRV_CTL_ELEM_ACCESS_WRITE));
+		if (err < 0)
+			goto __error;
+	}
+
 	if (ak->type == SND_AK4355 || ak->type == SND_AK4358)
 		num_emphs = 1;
 	else
-- 
cgit v1.2.3-70-g09d2


From eac06a10d2b814dfacc36a8fff35ef07bf4eec8e Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 22 Aug 2006 13:16:25 +0200
Subject: [ALSA] Add dB scale information to ad1848 driver

Added the dB scale information to ad1848 driver.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/ad1848.h        | 22 +++++++++++++------
 sound/isa/ad1848/ad1848_lib.c | 49 ++++++++++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/sound/ad1848.h b/include/sound/ad1848.h
index 57af1fe7b30..c8de6f83338 100644
--- a/include/sound/ad1848.h
+++ b/include/sound/ad1848.h
@@ -179,14 +179,13 @@ enum { AD1848_MIX_SINGLE, AD1848_MIX_DOUBLE, AD1848_MIX_CAPTURE };
 #define AD1848_MIXVAL_DOUBLE(left_reg, right_reg, shift_left, shift_right, mask, invert) \
 	((left_reg) | ((right_reg) << 8) | ((shift_left) << 16) | ((shift_right) << 19) | ((mask) << 24) | ((invert) << 22))
 
-int snd_ad1848_add_ctl(struct snd_ad1848 *chip, const char *name, int index, int type, unsigned long value);
-
 /* for ease of use */
 struct ad1848_mix_elem {
 	const char *name;
 	int index;
 	int type;
 	unsigned long private_value;
+	unsigned int *tlv;
 };
 
 #define AD1848_SINGLE(xname, xindex, reg, shift, mask, invert) \
@@ -195,15 +194,26 @@ struct ad1848_mix_elem {
   .type = AD1848_MIX_SINGLE, \
   .private_value = AD1848_MIXVAL_SINGLE(reg, shift, mask, invert) }
 
+#define AD1848_SINGLE_TLV(xname, xindex, reg, shift, mask, invert, xtlv) \
+{ .name = xname, \
+  .index = xindex, \
+  .type = AD1848_MIX_SINGLE, \
+  .private_value = AD1848_MIXVAL_SINGLE(reg, shift, mask, invert), \
+  .tlv = xtlv }
+
 #define AD1848_DOUBLE(xname, xindex, left_reg, right_reg, shift_left, shift_right, mask, invert) \
 { .name = xname, \
   .index = xindex, \
   .type = AD1848_MIX_DOUBLE, \
   .private_value = AD1848_MIXVAL_DOUBLE(left_reg, right_reg, shift_left, shift_right, mask, invert) }
 
-static inline int snd_ad1848_add_ctl_elem(struct snd_ad1848 *chip, const struct ad1848_mix_elem *c)
-{
-	return snd_ad1848_add_ctl(chip, c->name, c->index, c->type, c->private_value);
-}
+#define AD1848_DOUBLE_TLV(xname, xindex, left_reg, right_reg, shift_left, shift_right, mask, invert, xtlv) \
+{ .name = xname, \
+  .index = xindex, \
+  .type = AD1848_MIX_DOUBLE, \
+  .private_value = AD1848_MIXVAL_DOUBLE(left_reg, right_reg, shift_left, shift_right, mask, invert), \
+  .tlv = xtlv }
+
+int snd_ad1848_add_ctl_elem(struct snd_ad1848 *chip, const struct ad1848_mix_elem *c);
 
 #endif /* __SOUND_AD1848_H */
diff --git a/sound/isa/ad1848/ad1848_lib.c b/sound/isa/ad1848/ad1848_lib.c
index e711f87d5fd..a6fbd5d1d62 100644
--- a/sound/isa/ad1848/ad1848_lib.c
+++ b/sound/isa/ad1848/ad1848_lib.c
@@ -29,6 +29,7 @@
 #include <sound/core.h>
 #include <sound/ad1848.h>
 #include <sound/control.h>
+#include <sound/tlv.h>
 #include <sound/pcm_params.h>
 
 #include <asm/io.h>
@@ -118,6 +119,8 @@ void snd_ad1848_out(struct snd_ad1848 *chip,
 #endif
 }
 
+EXPORT_SYMBOL(snd_ad1848_out);
+
 static void snd_ad1848_dout(struct snd_ad1848 *chip,
 			    unsigned char reg, unsigned char value)
 {
@@ -941,6 +944,8 @@ int snd_ad1848_create(struct snd_card *card,
 	return 0;
 }
 
+EXPORT_SYMBOL(snd_ad1848_create);
+
 static struct snd_pcm_ops snd_ad1848_playback_ops = {
 	.open =		snd_ad1848_playback_open,
 	.close =	snd_ad1848_playback_close,
@@ -988,12 +993,16 @@ int snd_ad1848_pcm(struct snd_ad1848 *chip, int device, struct snd_pcm **rpcm)
 	return 0;
 }
 
+EXPORT_SYMBOL(snd_ad1848_pcm);
+
 const struct snd_pcm_ops *snd_ad1848_get_pcm_ops(int direction)
 {
 	return direction == SNDRV_PCM_STREAM_PLAYBACK ?
 		&snd_ad1848_playback_ops : &snd_ad1848_capture_ops;
 }
 
+EXPORT_SYMBOL(snd_ad1848_get_pcm_ops);
+
 /*
  *  MIXER part
  */
@@ -1171,7 +1180,8 @@ static int snd_ad1848_put_double(struct snd_kcontrol *kcontrol, struct snd_ctl_e
 
 /*
  */
-int snd_ad1848_add_ctl(struct snd_ad1848 *chip, const char *name, int index, int type, unsigned long value)
+int snd_ad1848_add_ctl_elem(struct snd_ad1848 *chip,
+			    const struct ad1848_mix_elem *c)
 {
 	static struct snd_kcontrol_new newctls[] = {
 		[AD1848_MIX_SINGLE] = {
@@ -1196,32 +1206,46 @@ int snd_ad1848_add_ctl(struct snd_ad1848 *chip, const char *name, int index, int
 	struct snd_kcontrol *ctl;
 	int err;
 
-	ctl = snd_ctl_new1(&newctls[type], chip);
+	ctl = snd_ctl_new1(&newctls[c->type], chip);
 	if (! ctl)
 		return -ENOMEM;
-	strlcpy(ctl->id.name, name, sizeof(ctl->id.name));
-	ctl->id.index = index;
-	ctl->private_value = value;
+	strlcpy(ctl->id.name, c->name, sizeof(ctl->id.name));
+	ctl->id.index = c->index;
+	ctl->private_value = c->private_value;
+	if (c->tlv) {
+		ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ;
+		ctl->tlv.p = c->tlv;
+	}
 	if ((err = snd_ctl_add(chip->card, ctl)) < 0)
 		return err;
 	return 0;
 }
 
+EXPORT_SYMBOL(snd_ad1848_add_ctl_elem);
+
+static DECLARE_TLV_DB_SCALE(db_scale_6bit, -9450, 150, 0);
+static DECLARE_TLV_DB_SCALE(db_scale_5bit_12db_max, -3450, 150, 0);
+static DECLARE_TLV_DB_SCALE(db_scale_rec_gain, 0, 150, 0);
 
 static struct ad1848_mix_elem snd_ad1848_controls[] = {
 AD1848_DOUBLE("PCM Playback Switch", 0, AD1848_LEFT_OUTPUT, AD1848_RIGHT_OUTPUT, 7, 7, 1, 1),
-AD1848_DOUBLE("PCM Playback Volume", 0, AD1848_LEFT_OUTPUT, AD1848_RIGHT_OUTPUT, 0, 0, 63, 1),
+AD1848_DOUBLE_TLV("PCM Playback Volume", 0, AD1848_LEFT_OUTPUT, AD1848_RIGHT_OUTPUT, 0, 0, 63, 1,
+		  db_scale_6bit),
 AD1848_DOUBLE("Aux Playback Switch", 0, AD1848_AUX1_LEFT_INPUT, AD1848_AUX1_RIGHT_INPUT, 7, 7, 1, 1),
-AD1848_DOUBLE("Aux Playback Volume", 0, AD1848_AUX1_LEFT_INPUT, AD1848_AUX1_RIGHT_INPUT, 0, 0, 31, 1),
+AD1848_DOUBLE_TLV("Aux Playback Volume", 0, AD1848_AUX1_LEFT_INPUT, AD1848_AUX1_RIGHT_INPUT, 0, 0, 31, 1,
+		  db_scale_5bit_12db_max),
 AD1848_DOUBLE("Aux Playback Switch", 1, AD1848_AUX2_LEFT_INPUT, AD1848_AUX2_RIGHT_INPUT, 7, 7, 1, 1),
-AD1848_DOUBLE("Aux Playback Volume", 1, AD1848_AUX2_LEFT_INPUT, AD1848_AUX2_RIGHT_INPUT, 0, 0, 31, 1),
-AD1848_DOUBLE("Capture Volume", 0, AD1848_LEFT_INPUT, AD1848_RIGHT_INPUT, 0, 0, 15, 0),
+AD1848_DOUBLE_TLV("Aux Playback Volume", 1, AD1848_AUX2_LEFT_INPUT, AD1848_AUX2_RIGHT_INPUT, 0, 0, 31, 1,
+		  db_scale_5bit_12db_max),
+AD1848_DOUBLE_TLV("Capture Volume", 0, AD1848_LEFT_INPUT, AD1848_RIGHT_INPUT, 0, 0, 15, 0,
+		  db_scale_rec_gain),
 {
 	.name = "Capture Source",
 	.type = AD1848_MIX_CAPTURE,
 },
 AD1848_SINGLE("Loopback Capture Switch", 0, AD1848_LOOPBACK, 0, 1, 0),
-AD1848_SINGLE("Loopback Capture Volume", 0, AD1848_LOOPBACK, 1, 63, 0)
+AD1848_SINGLE_TLV("Loopback Capture Volume", 0, AD1848_LOOPBACK, 1, 63, 0,
+		  db_scale_6bit),
 };
                                         
 int snd_ad1848_mixer(struct snd_ad1848 *chip)
@@ -1245,12 +1269,7 @@ int snd_ad1848_mixer(struct snd_ad1848 *chip)
 	return 0;
 }
 
-EXPORT_SYMBOL(snd_ad1848_out);
-EXPORT_SYMBOL(snd_ad1848_create);
-EXPORT_SYMBOL(snd_ad1848_pcm);
-EXPORT_SYMBOL(snd_ad1848_get_pcm_ops);
 EXPORT_SYMBOL(snd_ad1848_mixer);
-EXPORT_SYMBOL(snd_ad1848_add_ctl);
 
 /*
  *  INIT part
-- 
cgit v1.2.3-70-g09d2


From 1186ed8c7dc9c0185e783beddf241509cc224f1a Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 23 Aug 2006 19:53:28 +0200
Subject: [ALSA] Add dB scale information to vxpocket and vx222 drivers

Added the dB scale information to vxpocket and vx222 drivers.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/vx_core.h     |  1 +
 sound/drivers/vx/vx_mixer.c | 17 +++++++++++++++--
 sound/pci/vx222/vx222.c     |  7 +++++++
 sound/pci/vx222/vx222_ops.c |  9 +++++++++
 sound/pcmcia/vx/vxpocket.c  |  5 +++++
 5 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/vx_core.h b/include/sound/vx_core.h
index 9821a6194ca..dbca1417061 100644
--- a/include/sound/vx_core.h
+++ b/include/sound/vx_core.h
@@ -128,6 +128,7 @@ struct snd_vx_hardware {
 	unsigned int num_ins;
 	unsigned int num_outs;
 	unsigned int output_level_max;
+	unsigned int *output_level_db_scale;
 };
 
 /* hwdep id string */
diff --git a/sound/drivers/vx/vx_mixer.c b/sound/drivers/vx/vx_mixer.c
index c1d7fcdd197..1613ed844ac 100644
--- a/sound/drivers/vx/vx_mixer.c
+++ b/sound/drivers/vx/vx_mixer.c
@@ -23,6 +23,7 @@
 #include <sound/driver.h>
 #include <sound/core.h>
 #include <sound/control.h>
+#include <sound/tlv.h>
 #include <sound/vx_core.h>
 #include "vx_cmd.h"
 
@@ -455,10 +456,13 @@ static int vx_output_level_put(struct snd_kcontrol *kcontrol, struct snd_ctl_ele
 
 static struct snd_kcontrol_new vx_control_output_level = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.access =	(SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			 SNDRV_CTL_ELEM_ACCESS_TLV_READ),
 	.name =		"Master Playback Volume",
 	.info =		vx_output_level_info,
 	.get =		vx_output_level_get,
 	.put =		vx_output_level_put,
+	/* tlv will be filled later */
 };
 
 /*
@@ -712,12 +716,17 @@ static int vx_monitor_sw_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_
 	return 0;
 }
 
+static DECLARE_TLV_DB_SCALE(db_scale_audio_gain, -10975, 25, 0);
+
 static struct snd_kcontrol_new vx_control_audio_gain = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.access =	(SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			 SNDRV_CTL_ELEM_ACCESS_TLV_READ),
 	/* name will be filled later */
 	.info =         vx_audio_gain_info,
 	.get =          vx_audio_gain_get,
-	.put =          vx_audio_gain_put
+	.put =          vx_audio_gain_put,
+	.tlv = { .p = db_scale_audio_gain },
 };
 static struct snd_kcontrol_new vx_control_output_switch = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
@@ -729,9 +738,12 @@ static struct snd_kcontrol_new vx_control_output_switch = {
 static struct snd_kcontrol_new vx_control_monitor_gain = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
 	.name =         "Monitoring Volume",
+	.access =	(SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			 SNDRV_CTL_ELEM_ACCESS_TLV_READ),
 	.info =         vx_audio_gain_info,	/* shared */
 	.get =          vx_audio_monitor_get,
-	.put =          vx_audio_monitor_put
+	.put =          vx_audio_monitor_put,
+	.tlv = { .p = db_scale_audio_gain },
 };
 static struct snd_kcontrol_new vx_control_monitor_switch = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
@@ -918,6 +930,7 @@ int snd_vx_mixer_new(struct vx_core *chip)
 	for (i = 0; i < chip->hw->num_outs; i++) {
 		temp = vx_control_output_level;
 		temp.index = i;
+		temp.tlv.p = chip->hw->output_level_db_scale;
 		if ((err = snd_ctl_add(card, snd_ctl_new1(&temp, chip))) < 0)
 			return err;
 	}
diff --git a/sound/pci/vx222/vx222.c b/sound/pci/vx222/vx222.c
index 9c03c6b4e49..e7cd8acab59 100644
--- a/sound/pci/vx222/vx222.c
+++ b/sound/pci/vx222/vx222.c
@@ -26,6 +26,7 @@
 #include <linux/moduleparam.h>
 #include <sound/core.h>
 #include <sound/initval.h>
+#include <sound/tlv.h>
 #include "vx222.h"
 
 #define CARD_NAME "VX222"
@@ -72,6 +73,9 @@ MODULE_DEVICE_TABLE(pci, snd_vx222_ids);
 /*
  */
 
+static DECLARE_TLV_DB_SCALE(db_scale_old_vol, -11350, 50, 0);
+static DECLARE_TLV_DB_SCALE(db_scale_akm, -7350, 50, 0);
+
 static struct snd_vx_hardware vx222_old_hw = {
 
 	.name = "VX222/Old",
@@ -81,6 +85,7 @@ static struct snd_vx_hardware vx222_old_hw = {
 	.num_ins = 1,
 	.num_outs = 1,
 	.output_level_max = VX_ANALOG_OUT_LEVEL_MAX,
+	.output_level_db_scale = db_scale_old_vol,
 };
 
 static struct snd_vx_hardware vx222_v2_hw = {
@@ -92,6 +97,7 @@ static struct snd_vx_hardware vx222_v2_hw = {
 	.num_ins = 1,
 	.num_outs = 1,
 	.output_level_max = VX2_AKM_LEVEL_MAX,
+	.output_level_db_scale = db_scale_akm,
 };
 
 static struct snd_vx_hardware vx222_mic_hw = {
@@ -103,6 +109,7 @@ static struct snd_vx_hardware vx222_mic_hw = {
 	.num_ins = 1,
 	.num_outs = 1,
 	.output_level_max = VX2_AKM_LEVEL_MAX,
+	.output_level_db_scale = db_scale_akm,
 };
 
 
diff --git a/sound/pci/vx222/vx222_ops.c b/sound/pci/vx222/vx222_ops.c
index 9b6d345b83a..5e51950e05f 100644
--- a/sound/pci/vx222/vx222_ops.c
+++ b/sound/pci/vx222/vx222_ops.c
@@ -28,6 +28,7 @@
 
 #include <sound/core.h>
 #include <sound/control.h>
+#include <sound/tlv.h>
 #include <asm/io.h>
 #include "vx222.h"
 
@@ -845,6 +846,8 @@ static void vx2_set_input_level(struct snd_vx222 *chip)
 
 #define MIC_LEVEL_MAX	0xff
 
+static DECLARE_TLV_DB_SCALE(db_scale_mic, -6450, 50, 0);
+
 /*
  * controls API for input levels
  */
@@ -922,18 +925,24 @@ static int vx_mic_level_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_v
 
 static struct snd_kcontrol_new vx_control_input_level = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.access =	(SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			 SNDRV_CTL_ELEM_ACCESS_TLV_READ),
 	.name =		"Capture Volume",
 	.info =		vx_input_level_info,
 	.get =		vx_input_level_get,
 	.put =		vx_input_level_put,
+	.tlv = { .p = db_scale_mic },
 };
 
 static struct snd_kcontrol_new vx_control_mic_level = {
 	.iface =	SNDRV_CTL_ELEM_IFACE_MIXER,
+	.access =	(SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			 SNDRV_CTL_ELEM_ACCESS_TLV_READ),
 	.name =		"Mic Capture Volume",
 	.info =		vx_mic_level_info,
 	.get =		vx_mic_level_get,
 	.put =		vx_mic_level_put,
+	.tlv = { .p = db_scale_mic },
 };
 
 /*
diff --git a/sound/pcmcia/vx/vxpocket.c b/sound/pcmcia/vx/vxpocket.c
index 76c85cffb40..3089fcca800 100644
--- a/sound/pcmcia/vx/vxpocket.c
+++ b/sound/pcmcia/vx/vxpocket.c
@@ -27,6 +27,7 @@
 #include <pcmcia/ciscode.h>
 #include <pcmcia/cisreg.h>
 #include <sound/initval.h>
+#include <sound/tlv.h>
 
 /*
  */
@@ -90,6 +91,8 @@ static int snd_vxpocket_dev_free(struct snd_device *device)
  * Only output levels can be modified
  */
 
+static DECLARE_TLV_DB_SCALE(db_scale_old_vol, -11350, 50, 0);
+
 static struct snd_vx_hardware vxpocket_hw = {
 	.name = "VXPocket",
 	.type = VX_TYPE_VXPOCKET,
@@ -99,6 +102,7 @@ static struct snd_vx_hardware vxpocket_hw = {
 	.num_ins = 1,
 	.num_outs = 1,
 	.output_level_max = VX_ANALOG_OUT_LEVEL_MAX,
+	.output_level_db_scale = db_scale_old_vol,
 };	
 
 /* VX-pocket 440
@@ -120,6 +124,7 @@ static struct snd_vx_hardware vxp440_hw = {
 	.num_ins = 2,
 	.num_outs = 2,
 	.output_level_max = VX_ANALOG_OUT_LEVEL_MAX,
+	.output_level_db_scale = db_scale_old_vol,
 };	
 
 
-- 
cgit v1.2.3-70-g09d2


From 063a40d9111ce7558f2fdfa4f85acfc47eb27353 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 28 Aug 2006 13:20:13 +0200
Subject: [ALSA] Add the definition of linear volume TLV

Added the definition of linear volume TLV type.
Some DSP chips and codecs (e.g. AK codec) use linear volume control.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/tlv.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/sound/tlv.h b/include/sound/tlv.h
index b826e1df1da..7905841643d 100644
--- a/include/sound/tlv.h
+++ b/include/sound/tlv.h
@@ -33,6 +33,7 @@
 
 #define SNDRV_CTL_TLVT_CONTAINER 0	/* one level down - group of TLVs */
 #define SNDRV_CTL_TLVT_DB_SCALE	1       /* dB scale */
+#define SNDRV_CTL_TLVT_DB_LINEAR 2	/* linear volume */
 
 #define DECLARE_TLV_DB_SCALE(name, min, step, mute) \
 unsigned int name[] = { \
@@ -40,4 +41,13 @@ unsigned int name[] = { \
         (min), ((step) & 0xffff) | ((mute) ? 0x10000 : 0) \
 }
 
+/* linear volume between min_dB and max_dB (.01dB unit) */
+#define DECLARE_TLV_DB_LINEAR(name, min_dB, max_dB)	\
+unsigned int name[] = { \
+        SNDRV_CTL_TLVT_DB_LINEAR, 2 * sizeof(unsigned int), \
+        (min_dB), (max_dB)				\
+}
+
+#define TLV_DB_GAIN_MUTE	-9999999
+
 #endif /* __SOUND_TLV_H */
-- 
cgit v1.2.3-70-g09d2


From 723b2b0d36fa7cea81a962af2d40d88520d5a5f1 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 30 Aug 2006 16:49:54 +0200
Subject: [ALSA] Clean up and add TLV support to AK4xxx i2c driver

- Clean up the code in AK4xxx-ADDA i2c code.
- Fix capture gain controls for AK5365
- Changed the static table for DAC/ADC mixer labels to use
  structs
- Implemented TLV entries for each AK codec
  The volumes in AK4524, AK4528 and AK5365 are corrected with
  a table to be suitable for dB conversion.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/ak4xxx-adda.h   |  40 +++-
 sound/i2c/other/ak4xxx-adda.c | 523 ++++++++++++++++++++++++------------------
 sound/pci/ice1712/revo.c      |  47 ++--
 3 files changed, 354 insertions(+), 256 deletions(-)

(limited to 'include')

diff --git a/include/sound/ak4xxx-adda.h b/include/sound/ak4xxx-adda.h
index 65ddfa3cac1..026e4072a9a 100644
--- a/include/sound/ak4xxx-adda.h
+++ b/include/sound/ak4xxx-adda.h
@@ -39,14 +39,26 @@ struct snd_ak4xxx_ops {
 
 #define AK4XXX_IMAGE_SIZE	(AK4XXX_MAX_CHIPS * 16)	/* 64 bytes */
 
+/* DAC label and channels */
+struct snd_akm4xxx_dac_channel {
+	char *name;		/* mixer volume name */
+	unsigned int num_channels;
+};
+
+/* ADC labels and channels */
+struct snd_akm4xxx_adc_channel {
+	char *name;		/* capture gain volume label */
+	char *gain_name;	/* IPGA */
+	char *switch_name;	/* capture switch */
+	unsigned int num_channels;
+};
+
 struct snd_akm4xxx {
 	struct snd_card *card;
 	unsigned int num_adcs;			/* AK4524 or AK4528 ADCs */
 	unsigned int num_dacs;			/* AK4524 or AK4528 DACs */
 	unsigned char images[AK4XXX_IMAGE_SIZE]; /* saved register image */
-	unsigned char ipga_gain[AK4XXX_MAX_CHIPS][2]; /* saved register image
-						       * for IPGA (AK4528)
-						       */
+	unsigned char volumes[AK4XXX_IMAGE_SIZE]; /* saved volume values */
 	unsigned long private_value[AK4XXX_MAX_CHIPS];	/* helper for driver */
 	void *private_data[AK4XXX_MAX_CHIPS];		/* helper for driver */
 	/* template should fill the following fields */
@@ -56,10 +68,11 @@ struct snd_akm4xxx {
 		SND_AK4355, SND_AK4358, SND_AK4381,
 		SND_AK5365
 	} type;
-	unsigned int *num_stereo;	/* array of combined counts
-					 * for the mixer
-					 */
-	char **channel_names;		/* array of mixer channel names */
+
+	/* (array) information of combined codecs */
+	struct snd_akm4xxx_dac_channel *dac_info;
+	struct snd_akm4xxx_adc_channel *adc_info;
+
 	struct snd_ak4xxx_ops ops;
 };
 
@@ -73,9 +86,18 @@ int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak);
 	(ak)->images[(chip) * 16 + (reg)]
 #define snd_akm4xxx_set(ak,chip,reg,val) \
 	((ak)->images[(chip) * 16 + (reg)] = (val))
+#define snd_akm4xxx_get_vol(ak,chip,reg) \
+	(ak)->volumes[(chip) * 16 + (reg)]
+#define snd_akm4xxx_set_vol(ak,chip,reg,val) \
+	((ak)->volumes[(chip) * 16 + (reg)] = (val))
+
+/* Warning: IPGA is tricky - we assume the addr + 4 is unused
+ *   so far, it's OK for all AK codecs with IPGA:
+ *   AK4524, AK4528 and EK5365
+ */
 #define snd_akm4xxx_get_ipga(ak,chip,reg) \
-	(ak)->ipga_gain[chip][(reg)-4]
+	snd_akm4xxx_get_vol(ak, chip, (reg) + 4)
 #define snd_akm4xxx_set_ipga(ak,chip,reg,val) \
-	((ak)->ipga_gain[chip][(reg)-4] = (val))
+	snd_akm4xxx_set_vol(ak, chip, (reg) + 4, val)
 
 #endif /* __SOUND_AK4XXX_ADDA_H */
diff --git a/sound/i2c/other/ak4xxx-adda.c b/sound/i2c/other/ak4xxx-adda.c
index 89fc3cbc235..c34cb468460 100644
--- a/sound/i2c/other/ak4xxx-adda.c
+++ b/sound/i2c/other/ak4xxx-adda.c
@@ -28,12 +28,14 @@
 #include <linux/init.h>
 #include <sound/core.h>
 #include <sound/control.h>
+#include <sound/tlv.h>
 #include <sound/ak4xxx-adda.h>
 
 MODULE_AUTHOR("Jaroslav Kysela <perex@suse.cz>, Takashi Iwai <tiwai@suse.de>");
 MODULE_DESCRIPTION("Routines for control of AK452x / AK43xx  AD/DA converters");
 MODULE_LICENSE("GPL");
 
+/* write the given register and save the data to the cache */
 void snd_akm4xxx_write(struct snd_akm4xxx *ak, int chip, unsigned char reg,
 		       unsigned char val)
 {
@@ -41,15 +43,10 @@ void snd_akm4xxx_write(struct snd_akm4xxx *ak, int chip, unsigned char reg,
 	ak->ops.write(ak, chip, reg, val);
 
 	/* save the data */
-	if (ak->type == SND_AK4524 || ak->type == SND_AK4528) {
-		if ((reg != 0x04 && reg != 0x05) || (val & 0x80) == 0)
-			snd_akm4xxx_set(ak, chip, reg, val);
-		else
-			snd_akm4xxx_set_ipga(ak, chip, reg, val);
-	} else {
-		/* AK4529, or else */
+	/* don't overwrite with IPGA data */
+	if ((ak->type != SND_AK4524 && ak->type != SND_AK5365) ||
+	    (reg != 0x04 && reg != 0x05) || (val & 0x80) == 0)
 		snd_akm4xxx_set(ak, chip, reg, val);
-	}
 	ak->ops.unlock(ak, chip);
 }
 
@@ -78,7 +75,7 @@ static void ak4524_reset(struct snd_akm4xxx *ak, int state)
 		/* IPGA */
 		for (reg = 0x04; reg < 0x06; reg++)
 			snd_akm4xxx_write(ak, chip, reg,
-					  snd_akm4xxx_get_ipga(ak, chip, reg));
+					  snd_akm4xxx_get_ipga(ak, chip, reg) | 0x80);
 	}
 }
 
@@ -144,6 +141,42 @@ void snd_akm4xxx_reset(struct snd_akm4xxx *ak, int state)
 
 EXPORT_SYMBOL(snd_akm4xxx_reset);
 
+
+/*
+ * Volume conversion table for non-linear volumes
+ * from -63.5dB (mute) to 0dB step 0.5dB
+ *
+ * Used for AK4524 input/ouput attenuation, AK4528, and
+ * AK5365 input attenuation
+ */
+static unsigned char vol_cvt_datt[128] = {
+	0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04,
+	0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x06, 0x06,
+	0x06, 0x07, 0x07, 0x08, 0x08, 0x08, 0x09, 0x0a,
+	0x0a, 0x0b, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f,
+	0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14,
+	0x15, 0x16, 0x17, 0x17, 0x18, 0x19, 0x1a, 0x1c,
+	0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x23,
+	0x24, 0x25, 0x26, 0x28, 0x29, 0x2a, 0x2b, 0x2d,
+	0x2e, 0x30, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35,
+	0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3e, 0x3f, 0x40,
+	0x41, 0x42, 0x43, 0x44, 0x46, 0x47, 0x48, 0x4a,
+	0x4b, 0x4d, 0x4e, 0x50, 0x51, 0x52, 0x53, 0x54,
+	0x55, 0x56, 0x58, 0x59, 0x5b, 0x5c, 0x5e, 0x5f,
+	0x60, 0x61, 0x62, 0x64, 0x65, 0x66, 0x67, 0x69,
+	0x6a, 0x6c, 0x6d, 0x6f, 0x70, 0x71, 0x72, 0x73,
+	0x75, 0x76, 0x77, 0x79, 0x7a, 0x7c, 0x7d, 0x7f,
+};
+
+/*
+ * dB tables
+ */
+static DECLARE_TLV_DB_SCALE(db_scale_vol_datt, -6350, 50, 1);
+static DECLARE_TLV_DB_SCALE(db_scale_8bit, -12750, 50, 1);
+static DECLARE_TLV_DB_SCALE(db_scale_7bit, -6350, 50, 1);
+static DECLARE_TLV_DB_LINEAR(db_scale_linear, TLV_DB_GAIN_MUTE, 0);
+static DECLARE_TLV_DB_SCALE(db_scale_ipga, 0, 50, 0);
+
 /*
  * initialize all the ak4xxx chips
  */
@@ -240,6 +273,9 @@ void snd_akm4xxx_init(struct snd_akm4xxx *ak)
 	int chip, num_chips;
 	unsigned char *ptr, reg, data, *inits;
 
+	memset(ak->images, 0, sizeof(ak->images));
+	memset(ak->volumes, 0, sizeof(ak->volumes));
+
 	switch (ak->type) {
 	case SND_AK4524:
 		inits = inits_ak4524;
@@ -265,6 +301,9 @@ void snd_akm4xxx_init(struct snd_akm4xxx *ak)
 		inits = inits_ak4381;
 		num_chips = ak->num_dacs / 2;
 		break;
+	case SND_AK5365:
+		/* FIXME: any init sequence? */
+		return;
 	default:
 		snd_BUG();
 		return;
@@ -282,16 +321,21 @@ void snd_akm4xxx_init(struct snd_akm4xxx *ak)
 
 EXPORT_SYMBOL(snd_akm4xxx_init);
 
+/*
+ * Mixer callbacks
+ */
+#define AK_VOL_CVT 			(1<<21)	/* need dB conversion */
+#define AK_NEEDSMSB 			(1<<22)	/* need MSB update bit */
+#define AK_INVERT 			(1<<23)	/* data is inverted */
 #define AK_GET_CHIP(val)		(((val) >> 8) & 0xff)
 #define AK_GET_ADDR(val)		((val) & 0xff)
-#define AK_GET_SHIFT(val)		(((val) >> 16) & 0x3f)
+#define AK_GET_SHIFT(val)		(((val) >> 16) & 0x1f)
+#define AK_GET_VOL_CVT(val)		(((val) >> 21) & 1)
 #define AK_GET_NEEDSMSB(val)		(((val) >> 22) & 1)
 #define AK_GET_INVERT(val)		(((val) >> 23) & 1)
 #define AK_GET_MASK(val)		(((val) >> 24) & 0xff)
 #define AK_COMPOSE(chip,addr,shift,mask) \
 	(((chip) << 8) | (addr) | ((shift) << 16) | ((mask) << 24))
-#define AK_NEEDSMSB 			(1<<22)
-#define AK_INVERT 			(1<<23)
 
 static int snd_akm4xxx_volume_info(struct snd_kcontrol *kcontrol,
 				   struct snd_ctl_elem_info *uinfo)
@@ -311,37 +355,37 @@ static int snd_akm4xxx_volume_get(struct snd_kcontrol *kcontrol,
 	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
 	int chip = AK_GET_CHIP(kcontrol->private_value);
 	int addr = AK_GET_ADDR(kcontrol->private_value);
-	int needsmsb = AK_GET_NEEDSMSB(kcontrol->private_value);
-	int invert = AK_GET_INVERT(kcontrol->private_value);
-	unsigned int mask = AK_GET_MASK(kcontrol->private_value);
-	unsigned char val = snd_akm4xxx_get(ak, chip, addr);
 
-	if (needsmsb)
-		val &= 0x7f;
-	ucontrol->value.integer.value[0] = invert ? mask - val : val;
+	ucontrol->value.integer.value[0] = snd_akm4xxx_get_vol(ak, chip, addr);
 	return 0;
 }
 
-static int snd_akm4xxx_volume_put(struct snd_kcontrol *kcontrol,
-				  struct snd_ctl_elem_value *ucontrol)
+static int put_ak_reg(struct snd_kcontrol *kcontrol, int addr,
+		      unsigned char nval)
 {
 	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
-	int chip = AK_GET_CHIP(kcontrol->private_value);
-	int addr = AK_GET_ADDR(kcontrol->private_value);
-	int needsmsb = AK_GET_NEEDSMSB(kcontrol->private_value);
-	int invert = AK_GET_INVERT(kcontrol->private_value);
 	unsigned int mask = AK_GET_MASK(kcontrol->private_value);
-	unsigned char nval = ucontrol->value.integer.value[0] % (mask+1);
-	int change;
+	int chip = AK_GET_CHIP(kcontrol->private_value);
 
-	if (invert)
+	if (snd_akm4xxx_get_vol(ak, chip, addr) == nval)
+		return 0;
+
+	snd_akm4xxx_set_vol(ak, chip, addr, nval);
+	if (AK_GET_VOL_CVT(kcontrol->private_value))
+		nval = vol_cvt_datt[nval];
+	if (AK_GET_INVERT(kcontrol->private_value))
 		nval = mask - nval;
-	if (needsmsb)
+	if (AK_GET_NEEDSMSB(kcontrol->private_value))
 		nval |= 0x80;
-	change = snd_akm4xxx_get(ak, chip, addr) != nval;
-	if (change)
-		snd_akm4xxx_write(ak, chip, addr, nval);
-	return change;
+	snd_akm4xxx_write(ak, chip, addr, nval);
+	return 1;
+}
+
+static int snd_akm4xxx_volume_put(struct snd_kcontrol *kcontrol,
+				  struct snd_ctl_elem_value *ucontrol)
+{
+	return put_ak_reg(kcontrol, AK_GET_ADDR(kcontrol->private_value),
+			  ucontrol->value.integer.value[0]);
 }
 
 static int snd_akm4xxx_stereo_volume_info(struct snd_kcontrol *kcontrol,
@@ -362,66 +406,25 @@ static int snd_akm4xxx_stereo_volume_get(struct snd_kcontrol *kcontrol,
 	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
 	int chip = AK_GET_CHIP(kcontrol->private_value);
 	int addr = AK_GET_ADDR(kcontrol->private_value);
-	int needsmsb = AK_GET_NEEDSMSB(kcontrol->private_value);
-	int invert = AK_GET_INVERT(kcontrol->private_value);
-	unsigned int mask = AK_GET_MASK(kcontrol->private_value);
-	unsigned char val;
-
-	val = snd_akm4xxx_get(ak, chip, addr);
-	if (needsmsb)
-		val &= 0x7f;
-	ucontrol->value.integer.value[0] = invert ? mask - val : val;
-
-	val = snd_akm4xxx_get(ak, chip, addr+1);
-	if (needsmsb)
-		val &= 0x7f;
-	ucontrol->value.integer.value[1] = invert ? mask - val : val;
 
+	ucontrol->value.integer.value[0] = snd_akm4xxx_get_vol(ak, chip, addr);
+	ucontrol->value.integer.value[1] = snd_akm4xxx_get_vol(ak, chip, addr+1);
 	return 0;
 }
 
 static int snd_akm4xxx_stereo_volume_put(struct snd_kcontrol *kcontrol,
 					 struct snd_ctl_elem_value *ucontrol)
 {
-	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
-	int chip = AK_GET_CHIP(kcontrol->private_value);
 	int addr = AK_GET_ADDR(kcontrol->private_value);
-	int needsmsb = AK_GET_NEEDSMSB(kcontrol->private_value);
-	int invert = AK_GET_INVERT(kcontrol->private_value);
-	unsigned int mask = AK_GET_MASK(kcontrol->private_value);
-	unsigned char nval = ucontrol->value.integer.value[0] % (mask+1);
-	int change0, change1;
-
-	if (invert)
-		nval = mask - nval;
-	if (needsmsb)
-		nval |= 0x80;
-	change0 = snd_akm4xxx_get(ak, chip, addr) != nval;
-	if (change0)
-		snd_akm4xxx_write(ak, chip, addr, nval);
-
-	nval = ucontrol->value.integer.value[1] % (mask+1);
-	if (invert)
-		nval = mask - nval;
-	if (needsmsb)
-		nval |= 0x80;
-	change1 = snd_akm4xxx_get(ak, chip, addr+1) != nval;
-	if (change1)
-		snd_akm4xxx_write(ak, chip, addr+1, nval);
-
+	int change;
 
-	return change0 || change1;
+	change = put_ak_reg(kcontrol, addr, ucontrol->value.integer.value[0]);
+	change |= put_ak_reg(kcontrol, addr + 1,
+			     ucontrol->value.integer.value[1]);
+	return change;
 }
 
-static int snd_akm4xxx_ipga_gain_info(struct snd_kcontrol *kcontrol,
-				      struct snd_ctl_elem_info *uinfo)
-{
-	uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER;
-	uinfo->count = 1;
-	uinfo->value.integer.min = 0;
-	uinfo->value.integer.max = 36;
-	return 0;
-}
+#define snd_akm4xxx_ipga_gain_info	snd_akm4xxx_volume_info
 
 static int snd_akm4xxx_ipga_gain_get(struct snd_kcontrol *kcontrol,
 				     struct snd_ctl_elem_value *ucontrol)
@@ -429,21 +432,57 @@ static int snd_akm4xxx_ipga_gain_get(struct snd_kcontrol *kcontrol,
 	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
 	int chip = AK_GET_CHIP(kcontrol->private_value);
 	int addr = AK_GET_ADDR(kcontrol->private_value);
+
 	ucontrol->value.integer.value[0] =
-		snd_akm4xxx_get_ipga(ak, chip, addr) & 0x7f;
+		snd_akm4xxx_get_ipga(ak, chip, addr);
 	return 0;
 }
 
+static int put_ak_ipga(struct snd_kcontrol *kcontrol, int addr,
+		       unsigned char nval)
+{
+	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
+	int chip = AK_GET_CHIP(kcontrol->private_value);
+
+	if (snd_akm4xxx_get_ipga(ak, chip, addr) == nval)
+		return 0;
+	snd_akm4xxx_set_ipga(ak, chip, addr, nval);
+	snd_akm4xxx_write(ak, chip, addr, nval | 0x80); /* need MSB */
+	return 1;
+}
+
 static int snd_akm4xxx_ipga_gain_put(struct snd_kcontrol *kcontrol,
 				     struct snd_ctl_elem_value *ucontrol)
+{
+	return put_ak_ipga(kcontrol, AK_GET_ADDR(kcontrol->private_value),
+			   ucontrol->value.integer.value[0]);
+}
+
+#define snd_akm4xxx_stereo_gain_info	snd_akm4xxx_stereo_volume_info
+
+static int snd_akm4xxx_stereo_gain_get(struct snd_kcontrol *kcontrol,
+				       struct snd_ctl_elem_value *ucontrol)
 {
 	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
 	int chip = AK_GET_CHIP(kcontrol->private_value);
 	int addr = AK_GET_ADDR(kcontrol->private_value);
-	unsigned char nval = (ucontrol->value.integer.value[0] % 37) | 0x80;
-	int change = snd_akm4xxx_get_ipga(ak, chip, addr) != nval;
-	if (change)
-		snd_akm4xxx_write(ak, chip, addr, nval);
+
+	ucontrol->value.integer.value[0] =
+		snd_akm4xxx_get_ipga(ak, chip, addr);
+	ucontrol->value.integer.value[1] =
+		snd_akm4xxx_get_ipga(ak, chip, addr + 1);
+	return 0;
+}
+
+static int snd_akm4xxx_stereo_gain_put(struct snd_kcontrol *kcontrol,
+				       struct snd_ctl_elem_value *ucontrol)
+{
+	int addr = AK_GET_ADDR(kcontrol->private_value);
+	int change;
+
+	change = put_ak_ipga(kcontrol, addr, ucontrol->value.integer.value[0]);
+	change |= put_ak_ipga(kcontrol, addr + 1,
+			      ucontrol->value.integer.value[1]);
 	return change;
 }
 
@@ -548,221 +587,247 @@ static int ak4xxx_switch_put(struct snd_kcontrol *kcontrol,
  * build AK4xxx controls
  */
 
-int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak)
+static int build_dac_controls(struct snd_akm4xxx *ak)
 {
-	unsigned int idx, num_emphs;
-	struct snd_kcontrol *ctl;
-	int err;
-	int mixer_ch = 0;
-	int num_stereo;
-
-	ctl = kmalloc(sizeof(*ctl), GFP_KERNEL);
-	if (! ctl)
-		return -ENOMEM;
+	int idx, err, mixer_ch, num_stereo;
+	struct snd_kcontrol_new knew;
 
+	mixer_ch = 0;
 	for (idx = 0; idx < ak->num_dacs; ) {
-		memset(ctl, 0, sizeof(*ctl));
-		if (ak->channel_names == NULL) {
-			strcpy(ctl->id.name, "DAC Volume");
+		memset(&knew, 0, sizeof(knew));
+		if (! ak->dac_info || ! ak->dac_info[mixer_ch].name) {
+			knew.name = "DAC Volume";
+			knew.index = mixer_ch + ak->idx_offset * 2;
 			num_stereo = 1;
-			ctl->id.index = mixer_ch + ak->idx_offset * 2;
 		} else {
-			strcpy(ctl->id.name, ak->channel_names[mixer_ch]);
-			num_stereo = ak->num_stereo[mixer_ch];
-			ctl->id.index = 0;
+			knew.name = ak->dac_info[mixer_ch].name;
+			num_stereo = ak->dac_info[mixer_ch].num_channels;
 		}
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
+		knew.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
+		knew.count = 1;
+		knew.access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			SNDRV_CTL_ELEM_ACCESS_TLV_READ;
 		if (num_stereo == 2) {
-			ctl->info = snd_akm4xxx_stereo_volume_info;
-			ctl->get = snd_akm4xxx_stereo_volume_get;
-			ctl->put = snd_akm4xxx_stereo_volume_put;
+			knew.info = snd_akm4xxx_stereo_volume_info;
+			knew.get = snd_akm4xxx_stereo_volume_get;
+			knew.put = snd_akm4xxx_stereo_volume_put;
 		} else {
-			ctl->info = snd_akm4xxx_volume_info;
-			ctl->get = snd_akm4xxx_volume_get;
-			ctl->put = snd_akm4xxx_volume_put;
+			knew.info = snd_akm4xxx_volume_info;
+			knew.get = snd_akm4xxx_volume_get;
+			knew.put = snd_akm4xxx_volume_put;
 		}
 		switch (ak->type) {
 		case SND_AK4524:
 			/* register 6 & 7 */
-			ctl->private_value =
-				AK_COMPOSE(idx/2, (idx%2) + 6, 0, 127);
+			knew.private_value =
+				AK_COMPOSE(idx/2, (idx%2) + 6, 0, 127) |
+				AK_VOL_CVT;
+			knew.tlv.p = db_scale_vol_datt;
 			break;
 		case SND_AK4528:
 			/* register 4 & 5 */
-			ctl->private_value =
-				AK_COMPOSE(idx/2, (idx%2) + 4, 0, 127);
+			knew.private_value =
+				AK_COMPOSE(idx/2, (idx%2) + 4, 0, 127) |
+				AK_VOL_CVT;
+			knew.tlv.p = db_scale_vol_datt;
 			break;
 		case SND_AK4529: {
 			/* registers 2-7 and b,c */
 			int val = idx < 6 ? idx + 2 : (idx - 6) + 0xb;
-			ctl->private_value =
+			knew.private_value =
 				AK_COMPOSE(0, val, 0, 255) | AK_INVERT;
+			knew.tlv.p = db_scale_8bit;
 			break;
 		}
 		case SND_AK4355:
 			/* register 4-9, chip #0 only */
-			ctl->private_value = AK_COMPOSE(0, idx + 4, 0, 255);
+			knew.private_value = AK_COMPOSE(0, idx + 4, 0, 255);
+			knew.tlv.p = db_scale_8bit;
 			break;
 		case SND_AK4358: {
 			/* register 4-9 and 11-12, chip #0 only */
 			int  addr = idx < 6 ? idx + 4 : idx + 5;
-			ctl->private_value =
+			knew.private_value =
 				AK_COMPOSE(0, addr, 0, 127) | AK_NEEDSMSB;
+			knew.tlv.p = db_scale_7bit;
 			break;
 		}
 		case SND_AK4381:
 			/* register 3 & 4 */
-			ctl->private_value =
+			knew.private_value =
 				AK_COMPOSE(idx/2, (idx%2) + 3, 0, 255);
+			knew.tlv.p = db_scale_linear;
 			break;
 		default:
-			err = -EINVAL;
-			goto __error;
+			return -EINVAL;
 		}
 
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
+		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
 		if (err < 0)
-			goto __error;
+			return err;
 
 		idx += num_stereo;
 		mixer_ch++;
 	}
-	for (idx = 0; idx < ak->num_adcs && ak->type == SND_AK4524; ++idx) {
-		memset(ctl, 0, sizeof(*ctl));
-		strcpy(ctl->id.name, "ADC Volume");
-		ctl->id.index = idx + ak->idx_offset * 2;
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
-		ctl->info = snd_akm4xxx_volume_info;
-		ctl->get = snd_akm4xxx_volume_get;
-		ctl->put = snd_akm4xxx_volume_put;
-		/* register 4 & 5 */
-		ctl->private_value =
-			AK_COMPOSE(idx/2, (idx%2) + 4, 0, 127);
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
-		if (err < 0)
-			goto __error;
-
-		memset(ctl, 0, sizeof(*ctl));
-		strcpy(ctl->id.name, "IPGA Analog Capture Volume");
-		ctl->id.index = idx + ak->idx_offset * 2;
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
-		ctl->info = snd_akm4xxx_ipga_gain_info;
-		ctl->get = snd_akm4xxx_ipga_gain_get;
-		ctl->put = snd_akm4xxx_ipga_gain_put;
+	return 0;
+}
+
+static int build_adc_controls(struct snd_akm4xxx *ak)
+{
+	int idx, err, mixer_ch, num_stereo;
+	struct snd_kcontrol_new knew;
+
+	mixer_ch = 0;
+	for (idx = 0; idx < ak->num_adcs;) {
+		memset(&knew, 0, sizeof(knew));
+		if (! ak->adc_info || ! ak->adc_info[mixer_ch].name) {
+			knew.name = "ADC Volume";
+			knew.index = mixer_ch + ak->idx_offset * 2;
+			num_stereo = 1;
+		} else {
+			knew.name = ak->adc_info[mixer_ch].name;
+			num_stereo = ak->adc_info[mixer_ch].num_channels;
+		}
+		knew.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
+		knew.count = 1;
+		knew.access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
+			SNDRV_CTL_ELEM_ACCESS_TLV_READ;
+		if (num_stereo == 2) {
+			knew.info = snd_akm4xxx_stereo_volume_info;
+			knew.get = snd_akm4xxx_stereo_volume_get;
+			knew.put = snd_akm4xxx_stereo_volume_put;
+		} else {
+			knew.info = snd_akm4xxx_volume_info;
+			knew.get = snd_akm4xxx_volume_get;
+			knew.put = snd_akm4xxx_volume_put;
+		}
 		/* register 4 & 5 */
-		ctl->private_value = AK_COMPOSE(idx/2, (idx%2) + 4, 0, 0);
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
+		knew.private_value =
+			AK_COMPOSE(idx/2, (idx%2) + 4, 0, 127) |
+			AK_VOL_CVT;
+		knew.tlv.p = db_scale_vol_datt;
+		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
 		if (err < 0)
-			goto __error;
-	}
+			return err;
 
-	if (ak->type == SND_AK5365) {
-		memset(ctl, 0, sizeof(*ctl));
-		if (ak->channel_names == NULL)
-			strcpy(ctl->id.name, "Capture Volume");
+		if (! ak->adc_info || ! ak->adc_info[mixer_ch].gain_name)
+			knew.name = "IPGA Analog Capture Volume";
 		else
-			strcpy(ctl->id.name, ak->channel_names[0]);
-		ctl->id.index = ak->idx_offset * 2;
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
-		ctl->info = snd_akm4xxx_stereo_volume_info;
-		ctl->get = snd_akm4xxx_stereo_volume_get;
-		ctl->put = snd_akm4xxx_stereo_volume_put;
-		/* Registers 4 & 5 (see AK5365 data sheet, pages 34 and 35):
-		 * valid values are from 0x00 (mute) to 0x98 (+12dB).  */
-		ctl->private_value =
-			AK_COMPOSE(0, 4, 0, 0x98);
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
+			knew.name = ak->adc_info[mixer_ch].gain_name;
+		if (num_stereo == 2) {
+			knew.info = snd_akm4xxx_stereo_gain_info;
+			knew.get = snd_akm4xxx_stereo_gain_get;
+			knew.put = snd_akm4xxx_stereo_gain_put;
+		} else {
+			knew.info = snd_akm4xxx_ipga_gain_info;
+			knew.get = snd_akm4xxx_ipga_gain_get;
+			knew.put = snd_akm4xxx_ipga_gain_put;
+		}
+		/* register 4 & 5 */
+		if (ak->type == SND_AK4524)
+			knew.private_value = AK_COMPOSE(idx/2, (idx%2) + 4, 0,
+							24);
+		else /* AK5365 */
+			knew.private_value = AK_COMPOSE(idx/2, (idx%2) + 4, 0,
+							36);
+		knew.tlv.p = db_scale_ipga;
+		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
 		if (err < 0)
-			goto __error;
+			return err;
+
+		if (ak->type == SND_AK5365 && (idx % 2) == 0) {
+			if (! ak->adc_info || 
+			    ! ak->adc_info[mixer_ch].switch_name)
+				knew.name = "Capture Switch";
+			else
+				knew.name = ak->adc_info[mixer_ch].switch_name;
+			knew.info = ak4xxx_switch_info;
+			knew.get = ak4xxx_switch_get;
+			knew.put = ak4xxx_switch_put;
+			knew.access = 0;
+			/* register 2, bit 0 (SMUTE): 0 = normal operation,
+			   1 = mute */
+			knew.private_value =
+				AK_COMPOSE(idx/2, 2, 0, 0) | AK_INVERT;
+			err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
+			if (err < 0)
+				return err;
+		}
 
-		memset(ctl, 0, sizeof(*ctl));
-		if (ak->channel_names == NULL)
-			strcpy(ctl->id.name, "Capture Switch");
-		else
-			strcpy(ctl->id.name, ak->channel_names[1]);
-		ctl->id.index = ak->idx_offset * 2;
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
-		ctl->info = ak4xxx_switch_info;
-		ctl->get = ak4xxx_switch_get;
-		ctl->put = ak4xxx_switch_put;
-		/* register 2, bit 0 (SMUTE): 0 = normal operation, 1 = mute */
-		ctl->private_value =
-			AK_COMPOSE(0, 2, 0, 0) | AK_INVERT;
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
-		if (err < 0)
-			goto __error;
+		idx += num_stereo;
+		mixer_ch++;
 	}
+	return 0;
+}
+
+static int build_deemphasis(struct snd_akm4xxx *ak, int num_emphs)
+{
+	int idx, err;
+	struct snd_kcontrol_new knew;
 
-	if (ak->type == SND_AK4355 || ak->type == SND_AK4358)
-		num_emphs = 1;
-	else
-		num_emphs = ak->num_dacs / 2;
 	for (idx = 0; idx < num_emphs; idx++) {
-		memset(ctl, 0, sizeof(*ctl));
-		strcpy(ctl->id.name, "Deemphasis");
-		ctl->id.index = idx + ak->idx_offset;
-		ctl->id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
-		ctl->count = 1;
-		ctl->info = snd_akm4xxx_deemphasis_info;
-		ctl->get = snd_akm4xxx_deemphasis_get;
-		ctl->put = snd_akm4xxx_deemphasis_put;
+		memset(&knew, 0, sizeof(knew));
+		knew.name = "Deemphasis";
+		knew.index = idx + ak->idx_offset;
+		knew.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
+		knew.count = 1;
+		knew.info = snd_akm4xxx_deemphasis_info;
+		knew.get = snd_akm4xxx_deemphasis_get;
+		knew.put = snd_akm4xxx_deemphasis_put;
 		switch (ak->type) {
 		case SND_AK4524:
 		case SND_AK4528:
 			/* register 3 */
-			ctl->private_value = AK_COMPOSE(idx, 3, 0, 0);
+			knew.private_value = AK_COMPOSE(idx, 3, 0, 0);
 			break;
 		case SND_AK4529: {
 			int shift = idx == 3 ? 6 : (2 - idx) * 2;
 			/* register 8 with shift */
-			ctl->private_value = AK_COMPOSE(0, 8, shift, 0);
+			knew.private_value = AK_COMPOSE(0, 8, shift, 0);
 			break;
 		}
 		case SND_AK4355:
 		case SND_AK4358:
-			ctl->private_value = AK_COMPOSE(idx, 3, 0, 0);
+			knew.private_value = AK_COMPOSE(idx, 3, 0, 0);
 			break;
 		case SND_AK4381:
-			ctl->private_value = AK_COMPOSE(idx, 1, 1, 0);
+			knew.private_value = AK_COMPOSE(idx, 1, 1, 0);
 			break;
 		default:
-			err = -EINVAL;
-			goto __error;
+			return -EINVAL;
 		}
-		ctl->private_data = ak;
-		err = snd_ctl_add(ak->card,
-				  snd_ctl_new(ctl, SNDRV_CTL_ELEM_ACCESS_READ|
-					      SNDRV_CTL_ELEM_ACCESS_WRITE));
+		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
 		if (err < 0)
-			goto __error;
+			return err;
 	}
-	err = 0;
-
- __error:
-	kfree(ctl);
-	return err;
+	return 0;
 }
 
+int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak)
+{
+	int err, num_emphs;
+
+	err = build_dac_controls(ak);
+	if (err < 0)
+		return err;
+
+	if (ak->type == SND_AK4524 || ak->type == SND_AK5365) {
+		err = build_adc_controls(ak);
+		if (err < 0)
+			return err;
+	}
+
+	if (ak->type == SND_AK4355 || ak->type == SND_AK4358)
+		num_emphs = 1;
+	else
+		num_emphs = ak->num_dacs / 2;
+	err = build_deemphasis(ak, num_emphs);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+	
 EXPORT_SYMBOL(snd_akm4xxx_build_controls);
 
 static int __init alsa_akm4xxx_module_init(void)
diff --git a/sound/pci/ice1712/revo.c b/sound/pci/ice1712/revo.c
index 1134a57f9e6..c9eefa9bbff 100644
--- a/sound/pci/ice1712/revo.c
+++ b/sound/pci/ice1712/revo.c
@@ -87,19 +87,34 @@ static void revo_set_rate_val(struct snd_akm4xxx *ak, unsigned int rate)
  * initialize the chips on M-Audio Revolution cards
  */
 
-static unsigned int revo71_num_stereo_front[] = {2};
-static char *revo71_channel_names_front[] = {"PCM Playback Volume"};
+#define AK_DAC(xname,xch) { .name = xname, .num_channels = xch }
 
-static unsigned int revo71_num_stereo_surround[] = {1, 1, 2, 2};
-static char *revo71_channel_names_surround[] = {"PCM Center Playback Volume", "PCM LFE Playback Volume",
-						"PCM Side Playback Volume", "PCM Rear Playback Volume"};
+static struct snd_akm4xxx_dac_channel revo71_front[] = {
+	AK_DAC("PCM Playback Volume", 2)
+};
+
+static struct snd_akm4xxx_dac_channel revo71_surround[] = {
+	AK_DAC("PCM Center Playback Volume", 1),
+	AK_DAC("PCM LFE Playback Volume", 1),
+	AK_DAC("PCM Side Playback Volume", 2),
+	AK_DAC("PCM Rear Playback Volume", 2),
+};
 
-static unsigned int revo51_num_stereo[] = {2, 1, 1, 2};
-static char *revo51_channel_names[] = {"PCM Playback Volume", "PCM Center Playback Volume",
-					"PCM LFE Playback Volume", "PCM Rear Playback Volume"};
+static struct snd_akm4xxx_dac_channel revo51_dac[] = {
+	AK_DAC("PCM Playback Volume", 2),
+	AK_DAC("PCM Center Playback Volume", 1),
+	AK_DAC("PCM LFE Playback Volume", 1),
+	AK_DAC("PCM Rear Playback Volume", 2),
+};
 
-static unsigned int revo51_adc_num_stereo[] = {2};
-static char *revo51_adc_channel_names[] = {"PCM Capture Volume","PCM Capture Switch"};
+static struct snd_akm4xxx_adc_channel revo51_adc[] = {
+	{
+		.name = "PCM Capture Volume",
+		.gain_name = "PCM Capture Gain Volume",
+		.switch_name = "PCM Capture Switch",
+		.num_channels = 2
+	},
+};
 
 static struct snd_akm4xxx akm_revo_front __devinitdata = {
 	.type = SND_AK4381,
@@ -107,8 +122,7 @@ static struct snd_akm4xxx akm_revo_front __devinitdata = {
 	.ops = {
 		.set_rate_val = revo_set_rate_val
 	},
-	.num_stereo = revo71_num_stereo_front,
-	.channel_names = revo71_channel_names_front
+	.dac_info = revo71_front,
 };
 
 static struct snd_ak4xxx_private akm_revo_front_priv __devinitdata = {
@@ -130,8 +144,7 @@ static struct snd_akm4xxx akm_revo_surround __devinitdata = {
 	.ops = {
 		.set_rate_val = revo_set_rate_val
 	},
-	.num_stereo = revo71_num_stereo_surround,
-	.channel_names = revo71_channel_names_surround
+	.dac_info = revo71_surround,
 };
 
 static struct snd_ak4xxx_private akm_revo_surround_priv __devinitdata = {
@@ -152,8 +165,7 @@ static struct snd_akm4xxx akm_revo51 __devinitdata = {
 	.ops = {
 		.set_rate_val = revo_set_rate_val
 	},
-	.num_stereo = revo51_num_stereo,
-	.channel_names = revo51_channel_names
+	.dac_info = revo51_dac,
 };
 
 static struct snd_ak4xxx_private akm_revo51_priv __devinitdata = {
@@ -171,8 +183,7 @@ static struct snd_ak4xxx_private akm_revo51_priv __devinitdata = {
 static struct snd_akm4xxx akm_revo51_adc __devinitdata = {
 	.type = SND_AK5365,
 	.num_adcs = 2,
-	.num_stereo = revo51_adc_num_stereo,
-	.channel_names = revo51_adc_channel_names
+	.adc_info = revo51_adc,
 };
 
 static struct snd_ak4xxx_private akm_revo51_adc_priv __devinitdata = {
-- 
cgit v1.2.3-70-g09d2


From 55a29af5ed5d914f017e6a7c613a4d7cc34f82d9 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 6 Sep 2006 12:15:34 +0200
Subject: [ALSA] Add definition of TLV dB range compound

Added the definition of TLV dB range compound.  It contains one or
more dB-range or linear-volume TLV entries with min/max ranges.
Used for volume controls with non-linear curves.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/tlv.h | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/tlv.h b/include/sound/tlv.h
index 7905841643d..d93a96b9187 100644
--- a/include/sound/tlv.h
+++ b/include/sound/tlv.h
@@ -34,19 +34,26 @@
 #define SNDRV_CTL_TLVT_CONTAINER 0	/* one level down - group of TLVs */
 #define SNDRV_CTL_TLVT_DB_SCALE	1       /* dB scale */
 #define SNDRV_CTL_TLVT_DB_LINEAR 2	/* linear volume */
+#define SNDRV_CTL_TLVT_DB_RANGE 3	/* dB range container */
 
+#define TLV_DB_SCALE_ITEM(min, step, mute)			\
+	SNDRV_CTL_TLVT_DB_SCALE, 2 * sizeof(unsigned int),	\
+	(min), ((step) & 0xffff) | ((mute) ? 0x10000 : 0)
 #define DECLARE_TLV_DB_SCALE(name, min, step, mute) \
-unsigned int name[] = { \
-        SNDRV_CTL_TLVT_DB_SCALE, 2 * sizeof(unsigned int), \
-        (min), ((step) & 0xffff) | ((mute) ? 0x10000 : 0) \
-}
+	unsigned int name[] = { TLV_DB_SCALE_ITEM(min, step, mute) }
 
 /* linear volume between min_dB and max_dB (.01dB unit) */
+#define TLV_DB_LINEAR_ITEM(min_dB, max_dB)		    \
+	SNDRV_CTL_TLVT_DB_LINEAR, 2 * sizeof(unsigned int), \
+	(min_dB), (max_dB)
 #define DECLARE_TLV_DB_LINEAR(name, min_dB, max_dB)	\
-unsigned int name[] = { \
-        SNDRV_CTL_TLVT_DB_LINEAR, 2 * sizeof(unsigned int), \
-        (min_dB), (max_dB)				\
-}
+	unsigned int name[] = { TLV_DB_LINEAR_ITEM(min_dB, max_dB) }
+
+/* dB range container */
+/* Each item is: <min> <max> <TLV> */
+/* The below assumes that each item TLV is 4 words like DB_SCALE or LINEAR */
+#define TLV_DB_RANGE_HEAD(num)			\
+	SNDRV_CTL_TLVT_DB_RANGE, 6 * (num) * sizeof(unsigned int)
 
 #define TLV_DB_GAIN_MUTE	-9999999
 
-- 
cgit v1.2.3-70-g09d2


From 9d19f48cfe2570562c2c6226780a7ca627b0f1f1 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 6 Sep 2006 14:27:46 +0200
Subject: [ALSA] Add pcm_class attribute to PCM sysfs entry

This patch adds a new attribute, pcm_class, to each PCM sysfs entry.
It's useful to detect what kind of PCM stream is, for example, HAL
can check whether it's a modem or not.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/core.h |  4 ++++
 sound/core/pcm.c     | 24 ++++++++++++++++++++++
 sound/core/sound.c   | 56 +++++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/sound/core.h b/include/sound/core.h
index 1359c532b68..b056ea925ec 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -26,6 +26,7 @@
 #include <linux/mutex.h>		/* struct mutex */
 #include <linux/rwsem.h>		/* struct rw_semaphore */
 #include <linux/pm.h>			/* pm_message_t */
+#include <linux/device.h>
 
 /* forward declarations */
 #ifdef CONFIG_PCI
@@ -186,6 +187,7 @@ struct snd_minor {
 	int device;			/* device number */
 	const struct file_operations *f_ops;	/* file operations */
 	void *private_data;		/* private data for f_ops->open */
+	struct class_device *class_dev;	/* class device for sysfs */
 };
 
 /* sound.c */
@@ -200,6 +202,8 @@ int snd_register_device(int type, struct snd_card *card, int dev,
 			const char *name);
 int snd_unregister_device(int type, struct snd_card *card, int dev);
 void *snd_lookup_minor_data(unsigned int minor, int type);
+int snd_add_device_sysfs_file(int type, struct snd_card *card, int dev,
+			      const struct class_device_attribute *attr);
 
 #ifdef CONFIG_SND_OSSEMUL
 int snd_register_oss_device(int type, struct snd_card *card, int dev,
diff --git a/sound/core/pcm.c b/sound/core/pcm.c
index ed3b0946956..bf8f412988b 100644
--- a/sound/core/pcm.c
+++ b/sound/core/pcm.c
@@ -907,6 +907,28 @@ void snd_pcm_detach_substream(struct snd_pcm_substream *substream)
 	substream->pstr->substream_opened--;
 }
 
+static ssize_t show_pcm_class(struct class_device *class_device, char *buf)
+{
+	struct snd_pcm *pcm;
+	const char *str;
+	static const char *strs[SNDRV_PCM_CLASS_LAST + 1] = {
+		[SNDRV_PCM_CLASS_GENERIC] = "generic",
+		[SNDRV_PCM_CLASS_MULTI] = "multi",
+		[SNDRV_PCM_CLASS_MODEM] = "modem",
+		[SNDRV_PCM_CLASS_DIGITIZER] = "digitizer",
+	};
+
+	if (! (pcm = class_get_devdata(class_device)) ||
+	    pcm->dev_class > SNDRV_PCM_CLASS_LAST)
+		str = "none";
+	else
+		str = strs[pcm->dev_class];
+        return snprintf(buf, PAGE_SIZE, "%s\n", str);
+}
+
+static struct class_device_attribute pcm_attrs =
+	__ATTR(pcm_class, S_IRUGO, show_pcm_class, NULL);
+
 static int snd_pcm_dev_register(struct snd_device *device)
 {
 	int cidx, err;
@@ -945,6 +967,8 @@ static int snd_pcm_dev_register(struct snd_device *device)
 			mutex_unlock(&register_mutex);
 			return err;
 		}
+		snd_add_device_sysfs_file(devtype, pcm->card, pcm->device,
+					  &pcm_attrs);
 		for (substream = pcm->streams[cidx].substream; substream; substream = substream->next)
 			snd_pcm_timer_init(substream);
 	}
diff --git a/sound/core/sound.c b/sound/core/sound.c
index b4430db3fa4..efa476c5210 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -268,7 +268,11 @@ int snd_register_device(int type, struct snd_card *card, int dev,
 	snd_minors[minor] = preg;
 	if (card)
 		device = card->dev;
-	class_device_create(sound_class, NULL, MKDEV(major, minor), device, "%s", name);
+	preg->class_dev = class_device_create(sound_class, NULL,
+					      MKDEV(major, minor),
+					      device, "%s", name);
+	if (preg->class_dev)
+		class_set_devdata(preg->class_dev, private_data);
 
 	mutex_unlock(&sound_mutex);
 	return 0;
@@ -276,6 +280,24 @@ int snd_register_device(int type, struct snd_card *card, int dev,
 
 EXPORT_SYMBOL(snd_register_device);
 
+/* find the matching minor record
+ * return the index of snd_minor, or -1 if not found
+ */
+static int find_snd_minor(int type, struct snd_card *card, int dev)
+{
+	int cardnum, minor;
+	struct snd_minor *mptr;
+
+	cardnum = card ? card->number : -1;
+	for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor)
+		if ((mptr = snd_minors[minor]) != NULL &&
+		    mptr->type == type &&
+		    mptr->card == cardnum &&
+		    mptr->device == dev)
+			return minor;
+	return -1;
+}
+
 /**
  * snd_unregister_device - unregister the device on the given card
  * @type: the device type, SNDRV_DEVICE_TYPE_XXX
@@ -289,32 +311,42 @@ EXPORT_SYMBOL(snd_register_device);
  */
 int snd_unregister_device(int type, struct snd_card *card, int dev)
 {
-	int cardnum, minor;
-	struct snd_minor *mptr;
+	int minor;
 
-	cardnum = card ? card->number : -1;
 	mutex_lock(&sound_mutex);
-	for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor)
-		if ((mptr = snd_minors[minor]) != NULL &&
-		    mptr->type == type &&
-		    mptr->card == cardnum &&
-		    mptr->device == dev)
-			break;
-	if (minor == ARRAY_SIZE(snd_minors)) {
+	minor = find_snd_minor(type, card, dev);
+	if (minor < 0) {
 		mutex_unlock(&sound_mutex);
 		return -EINVAL;
 	}
 
 	class_device_destroy(sound_class, MKDEV(major, minor));
 
+	kfree(snd_minors[minor]);
 	snd_minors[minor] = NULL;
 	mutex_unlock(&sound_mutex);
-	kfree(mptr);
 	return 0;
 }
 
 EXPORT_SYMBOL(snd_unregister_device);
 
+int snd_add_device_sysfs_file(int type, struct snd_card *card, int dev,
+			      const struct class_device_attribute *attr)
+{
+	int minor, ret = -EINVAL;
+	struct class_device *cdev;
+
+	mutex_lock(&sound_mutex);
+	minor = find_snd_minor(type, card, dev);
+	if (minor >= 0 && (cdev = snd_minors[minor]->class_dev) != NULL)
+		ret = class_device_create_file(cdev, attr);
+	mutex_unlock(&sound_mutex);
+	return ret;
+
+}
+
+EXPORT_SYMBOL(snd_add_device_sysfs_file);
+
 #ifdef CONFIG_PROC_FS
 /*
  *  INFO PART
-- 
cgit v1.2.3-70-g09d2


From 854b66e44260320c21ebe4b8a18e189f2e45b5be Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 8 Sep 2006 12:27:38 +0200
Subject: [ALSA] ak4xxx - Remove bogus IPGA controls

Remove IPGA volume controls and merge the IPGA range to ADC volume
controls.  These two volumes are not really independent but connected
simply in different ranges 0-0x7f and 0x80-max.  It doesn't make sense
to provide two controls.
Since both 0x7f and 0x80 specify 0dB, a hack is needed for IPGA range
to skip 0x80 (increment one) for such controls.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jaroslav Kysela <perex@suse.cz>
---
 include/sound/ak4xxx-adda.h   |  10 ----
 sound/i2c/other/ak4xxx-adda.c | 128 ++++++------------------------------------
 sound/pci/ice1712/revo.c      |   1 -
 3 files changed, 18 insertions(+), 121 deletions(-)

(limited to 'include')

diff --git a/include/sound/ak4xxx-adda.h b/include/sound/ak4xxx-adda.h
index 026e4072a9a..d0deca669b9 100644
--- a/include/sound/ak4xxx-adda.h
+++ b/include/sound/ak4xxx-adda.h
@@ -48,7 +48,6 @@ struct snd_akm4xxx_dac_channel {
 /* ADC labels and channels */
 struct snd_akm4xxx_adc_channel {
 	char *name;		/* capture gain volume label */
-	char *gain_name;	/* IPGA */
 	char *switch_name;	/* capture switch */
 	unsigned int num_channels;
 };
@@ -91,13 +90,4 @@ int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak);
 #define snd_akm4xxx_set_vol(ak,chip,reg,val) \
 	((ak)->volumes[(chip) * 16 + (reg)] = (val))
 
-/* Warning: IPGA is tricky - we assume the addr + 4 is unused
- *   so far, it's OK for all AK codecs with IPGA:
- *   AK4524, AK4528 and EK5365
- */
-#define snd_akm4xxx_get_ipga(ak,chip,reg) \
-	snd_akm4xxx_get_vol(ak, chip, (reg) + 4)
-#define snd_akm4xxx_set_ipga(ak,chip,reg,val) \
-	snd_akm4xxx_set_vol(ak, chip, (reg) + 4, val)
-
 #endif /* __SOUND_AK4XXX_ADDA_H */
diff --git a/sound/i2c/other/ak4xxx-adda.c b/sound/i2c/other/ak4xxx-adda.c
index c34cb468460..5da49e2eb35 100644
--- a/sound/i2c/other/ak4xxx-adda.c
+++ b/sound/i2c/other/ak4xxx-adda.c
@@ -43,10 +43,7 @@ void snd_akm4xxx_write(struct snd_akm4xxx *ak, int chip, unsigned char reg,
 	ak->ops.write(ak, chip, reg, val);
 
 	/* save the data */
-	/* don't overwrite with IPGA data */
-	if ((ak->type != SND_AK4524 && ak->type != SND_AK5365) ||
-	    (reg != 0x04 && reg != 0x05) || (val & 0x80) == 0)
-		snd_akm4xxx_set(ak, chip, reg, val);
+	snd_akm4xxx_set(ak, chip, reg, val);
 	ak->ops.unlock(ak, chip);
 }
 
@@ -70,12 +67,6 @@ static void ak4524_reset(struct snd_akm4xxx *ak, int state)
 		for (reg = 0x04; reg < maxreg; reg++)
 			snd_akm4xxx_write(ak, chip, reg,
 					  snd_akm4xxx_get(ak, chip, reg));
-		if (ak->type == SND_AK4528)
-			continue;
-		/* IPGA */
-		for (reg = 0x04; reg < 0x06; reg++)
-			snd_akm4xxx_write(ak, chip, reg,
-					  snd_akm4xxx_get_ipga(ak, chip, reg) | 0x80);
 	}
 }
 
@@ -175,7 +166,6 @@ static DECLARE_TLV_DB_SCALE(db_scale_vol_datt, -6350, 50, 1);
 static DECLARE_TLV_DB_SCALE(db_scale_8bit, -12750, 50, 1);
 static DECLARE_TLV_DB_SCALE(db_scale_7bit, -6350, 50, 1);
 static DECLARE_TLV_DB_LINEAR(db_scale_linear, TLV_DB_GAIN_MUTE, 0);
-static DECLARE_TLV_DB_SCALE(db_scale_ipga, 0, 50, 0);
 
 /*
  * initialize all the ak4xxx chips
@@ -190,8 +180,6 @@ void snd_akm4xxx_init(struct snd_akm4xxx *ak)
 		0x01, 0x03, /* 1: ADC/DAC enable */
 		0x04, 0x00, /* 4: ADC left muted */
 		0x05, 0x00, /* 5: ADC right muted */
-		0x04, 0x80, /* 4: ADC IPGA gain 0dB */
-		0x05, 0x80, /* 5: ADC IPGA gain 0dB */
 		0x06, 0x00, /* 6: DAC left muted */
 		0x07, 0x00, /* 7: DAC right muted */
 		0xff, 0xff
@@ -324,13 +312,15 @@ EXPORT_SYMBOL(snd_akm4xxx_init);
 /*
  * Mixer callbacks
  */
+#define AK_IPGA 			(1<<20)	/* including IPGA */
 #define AK_VOL_CVT 			(1<<21)	/* need dB conversion */
 #define AK_NEEDSMSB 			(1<<22)	/* need MSB update bit */
 #define AK_INVERT 			(1<<23)	/* data is inverted */
 #define AK_GET_CHIP(val)		(((val) >> 8) & 0xff)
 #define AK_GET_ADDR(val)		((val) & 0xff)
-#define AK_GET_SHIFT(val)		(((val) >> 16) & 0x1f)
+#define AK_GET_SHIFT(val)		(((val) >> 16) & 0x0f)
 #define AK_GET_VOL_CVT(val)		(((val) >> 21) & 1)
+#define AK_GET_IPGA(val)		(((val) >> 20) & 1)
 #define AK_GET_NEEDSMSB(val)		(((val) >> 22) & 1)
 #define AK_GET_INVERT(val)		(((val) >> 23) & 1)
 #define AK_GET_MASK(val)		(((val) >> 24) & 0xff)
@@ -371,8 +361,10 @@ static int put_ak_reg(struct snd_kcontrol *kcontrol, int addr,
 		return 0;
 
 	snd_akm4xxx_set_vol(ak, chip, addr, nval);
-	if (AK_GET_VOL_CVT(kcontrol->private_value))
+	if (AK_GET_VOL_CVT(kcontrol->private_value) && nval < 128)
 		nval = vol_cvt_datt[nval];
+	if (AK_GET_IPGA(kcontrol->private_value) && nval >= 128)
+		nval++; /* need to correct + 1 since both 127 and 128 are 0dB */
 	if (AK_GET_INVERT(kcontrol->private_value))
 		nval = mask - nval;
 	if (AK_GET_NEEDSMSB(kcontrol->private_value))
@@ -424,68 +416,6 @@ static int snd_akm4xxx_stereo_volume_put(struct snd_kcontrol *kcontrol,
 	return change;
 }
 
-#define snd_akm4xxx_ipga_gain_info	snd_akm4xxx_volume_info
-
-static int snd_akm4xxx_ipga_gain_get(struct snd_kcontrol *kcontrol,
-				     struct snd_ctl_elem_value *ucontrol)
-{
-	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
-	int chip = AK_GET_CHIP(kcontrol->private_value);
-	int addr = AK_GET_ADDR(kcontrol->private_value);
-
-	ucontrol->value.integer.value[0] =
-		snd_akm4xxx_get_ipga(ak, chip, addr);
-	return 0;
-}
-
-static int put_ak_ipga(struct snd_kcontrol *kcontrol, int addr,
-		       unsigned char nval)
-{
-	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
-	int chip = AK_GET_CHIP(kcontrol->private_value);
-
-	if (snd_akm4xxx_get_ipga(ak, chip, addr) == nval)
-		return 0;
-	snd_akm4xxx_set_ipga(ak, chip, addr, nval);
-	snd_akm4xxx_write(ak, chip, addr, nval | 0x80); /* need MSB */
-	return 1;
-}
-
-static int snd_akm4xxx_ipga_gain_put(struct snd_kcontrol *kcontrol,
-				     struct snd_ctl_elem_value *ucontrol)
-{
-	return put_ak_ipga(kcontrol, AK_GET_ADDR(kcontrol->private_value),
-			   ucontrol->value.integer.value[0]);
-}
-
-#define snd_akm4xxx_stereo_gain_info	snd_akm4xxx_stereo_volume_info
-
-static int snd_akm4xxx_stereo_gain_get(struct snd_kcontrol *kcontrol,
-				       struct snd_ctl_elem_value *ucontrol)
-{
-	struct snd_akm4xxx *ak = snd_kcontrol_chip(kcontrol);
-	int chip = AK_GET_CHIP(kcontrol->private_value);
-	int addr = AK_GET_ADDR(kcontrol->private_value);
-
-	ucontrol->value.integer.value[0] =
-		snd_akm4xxx_get_ipga(ak, chip, addr);
-	ucontrol->value.integer.value[1] =
-		snd_akm4xxx_get_ipga(ak, chip, addr + 1);
-	return 0;
-}
-
-static int snd_akm4xxx_stereo_gain_put(struct snd_kcontrol *kcontrol,
-				       struct snd_ctl_elem_value *ucontrol)
-{
-	int addr = AK_GET_ADDR(kcontrol->private_value);
-	int change;
-
-	change = put_ak_ipga(kcontrol, addr, ucontrol->value.integer.value[0]);
-	change |= put_ak_ipga(kcontrol, addr + 1,
-			      ucontrol->value.integer.value[1]);
-	return change;
-}
-
 static int snd_akm4xxx_deemphasis_info(struct snd_kcontrol *kcontrol,
 				       struct snd_ctl_elem_info *uinfo)
 {
@@ -702,35 +632,15 @@ static int build_adc_controls(struct snd_akm4xxx *ak)
 			knew.put = snd_akm4xxx_volume_put;
 		}
 		/* register 4 & 5 */
-		knew.private_value =
-			AK_COMPOSE(idx/2, (idx%2) + 4, 0, 127) |
-			AK_VOL_CVT;
-		knew.tlv.p = db_scale_vol_datt;
-		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
-		if (err < 0)
-			return err;
-
-		if (! ak->adc_info || ! ak->adc_info[mixer_ch].gain_name)
-			knew.name = "IPGA Analog Capture Volume";
+		if (ak->type == SND_AK5365)
+			knew.private_value =
+				AK_COMPOSE(idx/2, (idx%2) + 4, 0, 151) |
+				AK_VOL_CVT | AK_IPGA;
 		else
-			knew.name = ak->adc_info[mixer_ch].gain_name;
-		if (num_stereo == 2) {
-			knew.info = snd_akm4xxx_stereo_gain_info;
-			knew.get = snd_akm4xxx_stereo_gain_get;
-			knew.put = snd_akm4xxx_stereo_gain_put;
-		} else {
-			knew.info = snd_akm4xxx_ipga_gain_info;
-			knew.get = snd_akm4xxx_ipga_gain_get;
-			knew.put = snd_akm4xxx_ipga_gain_put;
-		}
-		/* register 4 & 5 */
-		if (ak->type == SND_AK4524)
-			knew.private_value = AK_COMPOSE(idx/2, (idx%2) + 4, 0,
-							24);
-		else /* AK5365 */
-			knew.private_value = AK_COMPOSE(idx/2, (idx%2) + 4, 0,
-							36);
-		knew.tlv.p = db_scale_ipga;
+			knew.private_value =
+				AK_COMPOSE(idx/2, (idx%2) + 4, 0, 163) |
+				AK_VOL_CVT | AK_IPGA;
+		knew.tlv.p = db_scale_vol_datt;
 		err = snd_ctl_add(ak->card, snd_ctl_new1(&knew, ak));
 		if (err < 0)
 			return err;
@@ -811,11 +721,9 @@ int snd_akm4xxx_build_controls(struct snd_akm4xxx *ak)
 	if (err < 0)
 		return err;
 
-	if (ak->type == SND_AK4524 || ak->type == SND_AK5365) {
-		err = build_adc_controls(ak);
-		if (err < 0)
-			return err;
-	}
+	err = build_adc_controls(ak);
+	if (err < 0)
+		return err;
 
 	if (ak->type == SND_AK4355 || ak->type == SND_AK4358)
 		num_emphs = 1;
diff --git a/sound/pci/ice1712/revo.c b/sound/pci/ice1712/revo.c
index c9eefa9bbff..bf98ea34feb 100644
--- a/sound/pci/ice1712/revo.c
+++ b/sound/pci/ice1712/revo.c
@@ -110,7 +110,6 @@ static struct snd_akm4xxx_dac_channel revo51_dac[] = {
 static struct snd_akm4xxx_adc_channel revo51_adc[] = {
 	{
 		.name = "PCM Capture Volume",
-		.gain_name = "PCM Capture Gain Volume",
 		.switch_name = "PCM Capture Switch",
 		.num_channels = 2
 	},
-- 
cgit v1.2.3-70-g09d2


From 4c8bd7eeee4c8f157fb61fb64b57500990b42e0e Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Fri, 22 Sep 2006 22:31:36 -0700
Subject: [KERNEL] Do not truncate to 'int' in ALIGN() macro.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 851aa1bcfc1..2b2ae4fdce8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -31,7 +31,7 @@ extern const char linux_banner[];
 #define STACK_MAGIC	0xdeadbeef
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1))
+#define ALIGN(x,a) (((x)+(a)-1UL)&~((a)-1UL))
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
 #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 
-- 
cgit v1.2.3-70-g09d2


From 2efc80cb8ddc341d81de996920e3b2ad8a12b1f7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 23 Sep 2006 16:45:55 +0100
Subject: [PATCH] #elif that should've been #elif defined

 #elif CONFIG_44x
in ibm4xx.h should've been
 #elif defined(CONFIG_44x)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc/ibm4xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-ppc/ibm4xx.h b/include/asm-ppc/ibm4xx.h
index cf62b69cb69..499c14691c7 100644
--- a/include/asm-ppc/ibm4xx.h
+++ b/include/asm-ppc/ibm4xx.h
@@ -86,7 +86,7 @@ void ppc4xx_init(unsigned long r3, unsigned long r4, unsigned long r5,
 #define PCI_DRAM_OFFSET	0
 #endif
 
-#elif CONFIG_44x
+#elif defined(CONFIG_44x)
 
 #if defined(CONFIG_BAMBOO)
 #include <platforms/4xx/bamboo.h>
-- 
cgit v1.2.3-70-g09d2