From 71bb49596bbf4e5a3328e1704d18604e822ba181 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Query supported CCIDs This provides a data structure to record which CCIDs are locally supported and three accessor functions: - a test function for internal use which is used to validate CCID requests made by the user; - a copy function so that the list can be used for feature-negotiation; - documented getsockopt() support so that the user can query capabilities. The data structure is a table which is filled in at compile-time with the list of available CCIDs (which in turn depends on the Kconfig choices). Using the copy function for cloning the list of supported CCIDs is useful for feature negotiation, since the negotiation is now with the full list of available CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation will not fail if the peer requests to use CCID3 instead of CCID2. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f..f0aeb20fa63 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,10 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait -- cgit v1.2.3-70-g09d2 From 17c30b40ed79e9f3955e884632c8f01e577b204a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Deprecate Ack Ratio sysctl This patch deprecates the Ack Ratio sysctl, since * Ack Ratio is entirely ignored by CCID-3 and CCID-4, * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1); * even if it would work in CCID-2, there is no point for a user to change it: - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2), - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts (since waiting for Acks which will never arrive in this window), - cwnd is not a user-configurable value. The only reasonable place for Ack Ratio is to print it for debugging. It is planned to do this later on, as part of e.g. dccp_probe. With this patch Ack Ratio is now under full control of feature negotiation: * Ack Ratio is resolved as a dependency of the selected CCID; * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to the default of 2, following RFC 4340, 11.3 - "New connections start with Ack Ratio 2 for both endpoints"; * what happens then is part of another patch set, since it concerns the dynamic update of Ack Ratio while the connection is in full flight. Thanks to Tomasz Grobelny for discussion leading up to this patch. 
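As an illustration (not part of the patch series), a minimal userspace sketch of the DCCP_SOCKOPT_AVAILABLE_CCIDS query documented in the first patch above. It assumes the constants are visible through an installed <linux/dccp.h>; the array size of 16 is an arbitrary choice (anything larger than 2 entries satisfies the documented requirement), and the fallback define for SOL_DCCP uses the value from include/linux/socket.h.

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/dccp.h>

#ifndef SOL_DCCP
#define SOL_DCCP 269			/* value from include/linux/socket.h */
#endif

int main(void)
{
	uint8_t ccids[16];		/* "sufficiently large (> 2)" array */
	socklen_t len = sizeof(ccids);
	int i, sk = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (sk < 0) {
		perror("socket");
		return 1;
	}
	if (getsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_AVAILABLE_CCIDS,
		       ccids, &len) < 0) {
		perror("getsockopt(DCCP_SOCKOPT_AVAILABLE_CCIDS)");
		return 1;
	}
	/* len should be reduced to the number of CCID bytes actually copied */
	for (i = 0; i < (int)len; i++)
		printf("CCID-%d is built in\n", ccids[i]);
	return 0;
}

With the usual Kconfig choices this would typically report CCID-2 and CCID-3.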
Signed-off-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 2 -- net/dccp/dccp.h | 1 - net/dccp/minisocks.c | 1 - net/dccp/options.c | 1 - net/dccp/sysctl.c | 7 ------- 6 files changed, 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index f0aeb20fa63..43df4487379 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -125,9 +125,6 @@ send_ndp = 1 send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 Default CCID for the sender-receiver half-connection. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 5a5a89935db..eda389ce04f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ @@ -378,7 +377,6 @@ struct dccp_minisock { __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e656dafb5d9..031ce350d3c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_ack_ratio; extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e487133ae07..ee7f40f8ddf 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 67a171a1268..515ad45013a 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb..f6e54f433e2 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "ack_ratio", - .data = &sysctl_dccp_feat_ack_ratio, - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "send_ackvec", .data = &sysctl_dccp_feat_send_ack_vector, -- 
cgit v1.2.3-70-g09d2 From fade756f18d42694e3acb00e3471ab43002cba16 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Set per-connection CCIDs via socket options With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which overrides the defaults set by the global sysctl variables for TX/RX CCIDs. To make full use of this facility, the remaining patches of this patch set are needed, which track dependencies and activate negotiated feature values. Note on the maximum number of CCIDs that can be registered: ----------------------------------------------------------- The maximum number of CCIDs that can be registered on the socket is constrained by the space in a Confirm/Change feature negotiation option. The space in these in turn depends on the size of header options as defined in RFC 4340, 5.8. Since this is a recurring constant, it has been moved from ackvec.h into linux/dccp.h, clarifying its purpose. Relative to this size, the maximum number of CCID identifiers that can be present in a Confirm option (which always consumes 1 byte more than a Change option, cf. 6.1) is 2 bytes less than the maximum TLV size: one for the CCID-feature-type and one for the selected value. Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 14 ++++++++++++++ include/linux/dccp.h | 5 +++++ net/dccp/ackvec.c | 9 ++++----- net/dccp/ackvec.h | 5 ++--- net/dccp/feat.h | 2 ++ net/dccp/proto.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 43df4487379..610083ff73f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -61,6 +61,20 @@ DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs supported by the endpoint (see include/linux/dccp.h for symbolic constants). The caller needs to provide a sufficiently large (> 2) array of type uint8_t. +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). 
The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eda389ce04f..6a72ff52a8a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -168,6 +168,8 @@ enum { DCCPO_MIN_CCID_SPECIFIC = 128, DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -203,6 +205,9 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad1..01e4d39fa23 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -12,7 +12,6 @@ #include "ackvec.h" #include "dccp.h" -#include #include #include #include @@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; @@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) for (i = 0; i < nr_opts; ++i) { int copylen = len; - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; *to++ = DCCPO_ACK_VECTOR_0; *to++ = copylen + 2; @@ -432,7 +431,7 @@ found: int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - if (len > DCCP_MAX_ACKVEC_OPT_LEN) + if (len > DCCP_SINGLE_OPT_MAXLEN) return -1; /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4ace..4ccee030524 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -11,15 +11,14 @@ * published by the Free Software Foundation. 
*/ +#include #include #include #include #include -/* Read about the ECN nonce to see why it is 253 */ -#define DCCP_MAX_ACKVEC_OPT_LEN 253 /* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) +#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2c92bd18e5f..b53b11717c4 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -22,6 +22,8 @@ /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ #define DCCPF_SEQ_WMIN 32 #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) enum dccp_feat_type { FEAT_AT_RX = 1, /* located at RX side of half-connection */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e29bbf91405..2cd56df44d8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -506,6 +506,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) return rc; } +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + return -EINVAL; + + val = kmalloc(optlen, GFP_KERNEL); + if (val == NULL) + return -ENOMEM; + + if (copy_from_user(val, optval, optlen)) { + kfree(val); + return -EFAULT; + } + + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); + + kfree(val); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -520,6 +550,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) -- cgit v1.2.3-70-g09d2 From 78673e24df27c76ec75565f4024d45c2c74ef148 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Remove obsolete parts of the old CCID interface The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector case, their value equals initially that of the sysctl, but at the end of feature negotiation may be something different. The old interface removed by this patch thus has been replaced by the newer interface to dynamically query the currently loaded CCIDs earlier in this patch set. Also removed the constructors for the TX CCID and the RX CCID, since the switch rx/non-rx is done by the handler in minisocks.c (and the handler is the only place in the code where CCIDs are loaded). 
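As an illustration (not part of the patch series) of the per-connection CCID interface added in the previous patch ("dccp: Set per-connection CCIDs via socket options"): the sketch below registers a CCID preference list on a fresh socket before connect()/listen(), as the documentation hunk requires. The preference list {CCID-3, CCID-2} is an arbitrary example and must match the CCIDs that are actually built in (cf. DCCP_SOCKOPT_AVAILABLE_CCIDS); the SOL_DCCP fallback again uses the value from include/linux/socket.h.

#include <stdint.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/dccp.h>

#ifndef SOL_DCCP
#define SOL_DCCP 269			/* value from include/linux/socket.h */
#endif

int main(void)
{
	/* Preference list; the values must match available CCIDs. */
	uint8_t prefs[] = { DCCPC_CCID3, DCCPC_CCID2 };
	int sk = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (sk < 0) {
		perror("socket");
		return 1;
	}
	/* Must be done before connect()/listen(), as documented above. */
	if (setsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_CCID,
		       prefs, sizeof(prefs)) < 0) {
		perror("setsockopt(DCCP_SOCKOPT_CCID)");
		return 1;
	}
	/* ... bind()/connect() or listen() would follow here ... */
	return 0;
}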
Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 5 +++-- include/linux/dccp.h | 3 --- net/dccp/ccid.c | 14 -------------- net/dccp/ccid.h | 5 ----- net/dccp/feat.c | 12 ------------ net/dccp/minisocks.c | 2 -- 6 files changed, 3 insertions(+), 38 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 610083ff73f..a203d132dbe 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -140,10 +140,11 @@ send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 The initial sequence window (sec. 7.5.2). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6a72ff52a8a..46daea312d9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated @@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index f72ca83df55..330372a1a0b 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -253,20 +253,6 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); -struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 1, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_rx_new); - -struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 0, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_tx_new); - static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 803343aed00..18f69423a70 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -111,11 +111,6 @@ extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, - gfp_t gfp); -extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, - gfp_t gfp); - static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 6c82dea409d..cb2ddd2b894 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1119,18 +1119,6 @@ int dccp_feat_init(struct sock *sk) INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ - /* CCID L */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0, - &dmsk->dccpms_tx_ccid, 1); - if (rc) - 
goto out; - - /* CCID R */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0, - &dmsk->dccpms_rx_ccid, 1); - if (rc) - goto out; - /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, dp->dccps_l_ack_ratio); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 258195972bc..486d61df260 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } -- cgit v1.2.3-70-g09d2 From 68e074bfcef269bc61006c2740d7f89ccbbd93d7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Remove manual influence on NDP Count feature Updating the NDP count feature is handled automatically now: * for CCID-2 it is disabled, since the code does not use NDP counts; * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths. Allowing the user to change NDP values leads to unpredictable and failing behaviour, since it is then possible to disable NDP counts even when they are needed (e.g. in CCID-3). This means that only those user settings are sensible that agree with the values for Send NDP Count implied by the choice of CCID. But those settings are already activated by the feature negotiation (CCID dependency tracking), hence this form of support is redundant. At startup the initialisation of the NDP count feature is with the default value of 0, which is done implicitly by the zeroing-out of the socket when it is allocated. If the choice of CCID or feature negotiation enables NDP count, this will then be updated via the NDP activation handler. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 4 ++-- net/dccp/dccp.h | 1 - net/dccp/feat.c | 2 +- net/dccp/minisocks.c | 1 - net/dccp/options.c | 4 +--- net/dccp/sysctl.c | 7 ------- 7 files changed, 4 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index a203d132dbe..1403745ab40 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). 
diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 46daea312d9..60e94438ead 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; struct list_head dccpms_conf; }; @@ -490,6 +488,7 @@ struct dccp_ackvec; * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) @@ -529,6 +528,7 @@ struct dccp_sock { __u16 dccps_r_ack_ratio; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1baed789bcc..51436c82565 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -99,7 +99,6 @@ extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_feat_send_ack_vector; -extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index cb2ddd2b894..35a57ab3bb1 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -86,7 +86,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { if (!rx) - dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0); + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 486d61df260..9e223257266 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 3a9a22f0ac1..6b0704497e8 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; -int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dmsk->dccpms_send_ndp_count && - dccp_insert_option_ndp(sk, skb)) + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; if 
(DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index f6e54f433e2..587c12f915c 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ndp", - .data = &sysctl_dccp_feat_send_ndp_count, - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3-70-g09d2 From b235dc4abbc1356284bd0dc730efa711f394e0e2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, which is now handled fully dynamically via feature negotiation; i.e. when CCID2 is enabled, Ack Vectors are automatically enabled (as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 3 --- net/dccp/dccp.h | 3 +-- net/dccp/diag.c | 2 +- net/dccp/input.c | 12 +++--------- net/dccp/minisocks.c | 1 - net/dccp/options.c | 7 ++----- net/dccp/proto.c | 3 +-- net/dccp/sysctl.c | 7 ------- 9 files changed, 8 insertions(+), 33 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 1403745ab40..7a3bb1abb83 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - tx_ccid = 2 Default CCID for the sender-receiver half-connection. Depending on the choice of CCID, the Send Ack Vector feature is enabled automatically. 
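As a companion illustration (not part of the patch series): since Ack Vectors are now tied to the negotiated CCID rather than a sysctl, an application that wants to know which CCID a connection ended up using can read it back via DCCP_SOCKOPT_TX_CCID, documented earlier in this series. The helper below is a hedged sketch; note that the getsockopt() argument type here is `int', not uint8_t.

#include <sys/socket.h>
#include <linux/dccp.h>

#ifndef SOL_DCCP
#define SOL_DCCP 269			/* value from include/linux/socket.h */
#endif

/* Return the TX CCID currently in use on socket 'sk', or -1 on error. */
static int get_tx_ccid(int sk)
{
	int ccid;
	socklen_t len = sizeof(ccid);

	if (getsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_TX_CCID, &ccid, &len) < 0)
		return -1;
	return ccid;		/* e.g. 2 when CCID-2 was negotiated */
}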
diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438ead..61734e27abb 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 51436c82565..3fd16e82c00 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk) const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || #ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || #endif inet_csk_ack_scheduled(sk); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f..93aae7c9555 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/input.c b/net/dccp/input.c index 0672b7e1763..5eb443f656c 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 
WARN_ON(sk->sk_send_head == NULL); @@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e223257266..0ebf8ebcf3d 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 6b0704497e8..aca309e1663 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; @@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dmsk->dccpms_send_ack_vector && + } else if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) { return -1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d420790b79..775eaa3d0c4 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -207,7 +207,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -225,7 +224,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 587c12f915c..018e210875e 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, -- cgit v1.2.3-70-g09d2 From 51c7d4fa2675c106a980ddcdbe308b54b5151945 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: 
Implement both feature-local and feature-remote Sequence Window feature This adds full support for local/remote Sequence Window feature, from which the * sequence-number-validity (W) and * acknowledgment-number-validity (W') windows derive as specified in RFC 4340, 7.5.3. Specifically, the following changes are introduced: * integrated new socket fields into dccp_sk; * updated the update_gsr/gss routines with regard to these fields; * updated handler code: the Sequence Window feature is located at the TX side, so the local feature is meant if the handler-rx flag is false; * the initialisation of `rcv_wnd' in reqsk is removed, since - rcv_wnd is not used by the code anywhere; - sequence number checks are not done in the LISTEN state (cf. 7.5.3); - dccp_check_req checks the Ack number validity more rigorously; * the `struct dccp_minisock' became empty and is now removed. Until the handshake completes with activating negotiated values, the local/remote Sequence-Window values are undefined and thus can not reliably be estimated. This issue is addressed in a separate patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 ++- include/linux/dccp.h | 24 ++++-------------------- net/dccp/dccp.h | 16 +++++++--------- net/dccp/feat.c | 13 +++++++++++-- net/dccp/minisocks.c | 11 ----------- net/dccp/proto.c | 2 -- 6 files changed, 24 insertions(+), 45 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 7a3bb1abb83..b132e4a3cf0 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -141,7 +141,8 @@ rx_ccid = 2 Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 - The initial sequence window (sec. 7.5.2). + The initial sequence window (sec. 7.5.2) of the sender. This influences + the local ackno validity and the remote seqno validity windows (7.5.1). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 990e97fa1f0..7a0502ab383 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -363,19 +363,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 -/** - * struct dccp_minisock - Minimal DCCP connection representation - * - * Will be used to pass the state from dccp_request_sock to dccp_sock. 
- * - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - */ -struct dccp_minisock { - __u64 dccpms_sequence_window; -}; - -extern void dccp_minisock_init(struct dccp_minisock *dmsk); - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from @@ -464,13 +451,14 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) @@ -504,12 +492,13 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; + __u64 dccps_l_seq_win:48; + __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct dccp_minisock dccps_minisock; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; @@ -527,11 +516,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } -static inline struct dccp_minisock *dccp_msk(const struct sock *sk) -{ - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3fd16e82c00..6101ecdb85b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -409,23 +409,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); - dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_awh = dp->dccps_gss = seq; - dccp_set_seqno(&dp->dccps_awl, - (dp->dccps_gss - - dccp_msk(sk)->dccpms_sequence_window + 1)); + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ack_pending(const struct sock *sk) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9a493809278..84346599985 
100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -52,8 +52,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { - if (!rx) - dccp_msk(sk)->dccpms_sequence_window = seq_win; + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 0ebf8ebcf3d..0ecb19c5e8c 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); -void dccp_minisock_init(struct dccp_minisock *dmsk) -{ - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; -} - void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - - /* See dccp_v4_conn_request */ - newdmsk->dccpms_sequence_window = req->rcv_wnd; - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); @@ -289,7 +279,6 @@ int dccp_reqsk_init(struct request_sock *req, inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; /* inherit feature negotiation options from listening socket */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 775eaa3d0c4..392a5d822b3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -180,8 +180,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - dccp_minisock_init(&dp->dccps_minisock); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; -- cgit v1.2.3-70-g09d2 From d6da3511d6b558d0b017777b61dc08b8fbc06ea4 Mon Sep 17 00:00:00 2001 From: Tomasz Grobelny Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: dccp: Policy-based packet dequeueing infrastructure This patch adds a generic infrastructure for policy-based dequeueing of TX packets and provides two policies: * a simple FIFO policy (which is the default) and * a priority based policy (set via socket options). Both policies honour the tx_qlen sysctl for the maximum size of the write queue (can be overridden via socket options). The priority policy uses skb->priority internally to assign an u32 priority identifier, using the same ranking as SO_PRIORITY. The skb->priority field is set to 0 when the packet leaves DCCP. The priority is supplied as ancillary data using cmsg(3), the patch also provides the requisite parsing routines. 
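As an illustration (not part of the patch) of the sender-side usage just described: the first helper below selects the priority-based policy via DCCP_SOCKOPT_QPOLICY_ID, and the second attaches a DCCP_SCM_PRIORITY value as ancillary data using the cmsg(3) layout given in the documentation hunk further down. The helper names, buffer handling and error conventions are assumptions made for the example.

#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/dccp.h>

#ifndef SOL_DCCP
#define SOL_DCCP 269			/* value from include/linux/socket.h */
#endif

/* Select the priority policy; only valid before the connection is set up. */
static int use_prio_policy(int sk)
{
	int id = DCCPQ_POLICY_PRIO;

	return setsockopt(sk, SOL_DCCP, DCCP_SOCKOPT_QPOLICY_ID,
			  &id, sizeof(id));
}

/* Send one packet with an attached DCCP_SCM_PRIORITY value. */
static ssize_t send_with_prio(int sk, const void *data, size_t len,
			      uint32_t prio)
{
	union {
		char buf[CMSG_SPACE(sizeof(uint32_t))];
		struct cmsghdr align;	/* force proper cmsg alignment */
	} u;
	struct iovec iov = { .iov_base = (void *)data, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= u.buf,
		.msg_controllen	= sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	memset(u.buf, 0, sizeof(u.buf));
	cmsg->cmsg_level = SOL_DCCP;
	cmsg->cmsg_type	 = DCCP_SCM_PRIORITY;
	cmsg->cmsg_len	 = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

	return sendmsg(sk, &msg, 0);
}

A caller would typically invoke use_prio_policy() right after socket() and before connect(), and then use send_with_prio() in place of a plain send().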
Signed-off-by: Tomasz Grobelny Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 19 ++++++ include/linux/dccp.h | 21 +++++++ net/dccp/Makefile | 2 +- net/dccp/dccp.h | 12 ++++ net/dccp/output.c | 7 +-- net/dccp/proto.c | 67 +++++++++++++++++++- net/dccp/qpolicy.c | 126 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 net/dccp/qpolicy.c (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index b132e4a3cf0..fcfc1253442 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== +DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes +a policy ID as argument and can only be set before the connection (i.e. changes +during an established connection are not supported). Currently, two policies are +defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, +and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an +u32 priority value as ancillary data to sendmsg(), where higher numbers indicate +a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to +be formatted using a cmsg(3) message header filled in as follows: + cmsg->cmsg_level = SOL_DCCP; + cmsg->cmsg_type = DCCP_SCM_PRIORITY; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ + +DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero +value is always interpreted as unbounded queue length. If different from zero, +the interpretation of this parameter depends on the current dequeuing policy +(see above): the "simple" policy will enforce a fixed queue size by returning +EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the +lowest-priority packet first. The default value for this parameter is +initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 
8.1.2); if this socket option is not set, diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eed52bcd35d..010e2d87ed7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -197,6 +197,21 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; +/* DCCP socket control message types for cmsg */ +enum dccp_cmsg_type { + DCCP_SCM_PRIORITY = 1, + DCCP_SCM_QPOLICY_MAX = 0xFFFF, + /* ^-- Up to here reserved exclusively for qpolicy parameters */ + DCCP_SCM_MAX +}; + +/* DCCP priorities for outgoing/queued packets */ +enum dccp_packet_dequeueing_policy { + DCCPQ_POLICY_SIMPLE, + DCCPQ_POLICY_PRIO, + DCCPQ_POLICY_MAX +}; + /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 @@ -210,6 +225,8 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_CCID 13 #define DCCP_SOCKOPT_TX_CCID 14 #define DCCP_SOCKOPT_RX_CCID 15 +#define DCCP_SOCKOPT_QPOLICY_ID 16 +#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -458,6 +475,8 @@ struct dccp_ackvec; * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options + * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy + * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending @@ -500,6 +519,8 @@ struct dccp_sock { struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; + __u8 dccps_qpolicy; + __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index b68440bd7fa..0c1c9af2bf7 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - output.o proto.o timer.o ackvec.o + qpolicy.o output.o proto.o timer.o ackvec.o dccp_ipv4-y := ipv4.o diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 74c90cd2767..ce2dd6f6f34 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -234,6 +234,18 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); +/* + * TX Packet Dequeueing Interface + */ +extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); +extern bool dccp_qpolicy_full(struct sock *sk); +extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); +extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); +extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); + +/* + * TX Packet Output and TX Timers + */ extern void dccp_write_xmit(struct sock *sk); extern void dccp_write_space(struct sock *sk); extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); diff --git a/net/dccp/output.c b/net/dccp/output.c index b1eaf7bcfb1..2532797a800 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -241,7 +241,7 @@ static void dccp_xmit_packet(struct sock *sk) { int err, len; struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + struct sk_buff *skb = dccp_qpolicy_pop(sk); 
if (unlikely(skb == NULL)) return; @@ -344,7 +344,7 @@ void dccp_write_xmit(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_write_queue))) { + while ((skb = dccp_qpolicy_top(sk))) { int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { @@ -358,8 +358,7 @@ void dccp_write_xmit(struct sock *sk) dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); + dccp_qpolicy_drop(sk, skb); dccp_pr_debug("packet discarded due to err=%d\n", rc); } } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8c125ffab1c..b56efdd2a42 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -189,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); @@ -541,6 +542,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; + case DCCP_SOCKOPT_QPOLICY_ID: + if (sk->sk_state != DCCP_CLOSED) + err = -EISCONN; + else if (val < 0 || val >= DCCPQ_POLICY_MAX) + err = -EINVAL; + else + dp->dccps_qpolicy = val; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + if (val < 0) + err = -EINVAL; + else + dp->dccps_tx_qlen = val; + break; default: err = -ENOPROTOOPT; break; @@ -648,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; + case DCCP_SOCKOPT_QPOLICY_ID: + val = dp->dccps_qpolicy; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + val = dp->dccps_tx_qlen; + break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -690,6 +711,43 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif +static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) +{ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + + /* + * Assign an (opaque) qpolicy priority value to skb->priority. + * + * We are overloading this skb field for use with the qpolicy subystem. + * The skb->priority is normally used for the SO_PRIORITY option, which + * is initialised from sk_priority. Since the assignment of sk_priority + * to skb->priority happens later (on layer 3), we overload this field + * for use with queueing priorities as long as the skb is on layer 4. + * The default priority value (if nothing is set) is 0. 
+ */ + skb->priority = 0; + + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_DCCP) + continue; + + switch (cmsg->cmsg_type) { + case DCCP_SCM_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) + return -EINVAL; + skb->priority = *(__u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -705,8 +763,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (sysctl_dccp_tx_qlen && - (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { + if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_release; } @@ -734,7 +791,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - skb_queue_tail(&sk->sk_write_queue, skb); + rc = dccp_msghdr_parse(msg, skb); + if (rc != 0) + goto out_discard; + + dccp_qpolicy_push(sk, skb); dccp_write_xmit(sk); out_release: release_sock(sk); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 00000000000..414696b0d83 --- /dev/null +++ b/net/dccp/qpolicy.c @@ -0,0 +1,126 @@ +/* + * net/dccp/qpolicy.c + * + * Policy-based packet dequeueing interface for DCCP. + * + * Copyright (c) 2008 Tomasz Grobelny + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ +#include "dccp.h" + +/* + * Simple Dequeueing Policy: + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. + */ +static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) +{ + skb_queue_tail(&sk->sk_write_queue, skb); +} + +static bool qpolicy_simple_full(struct sock *sk) +{ + return dccp_sk(sk)->dccps_tx_qlen && + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; +} + +static struct sk_buff *qpolicy_simple_top(struct sock *sk) +{ + return skb_peek(&sk->sk_write_queue); +} + +/* + * Priority-based Dequeueing Policy: + * If tx_qlen is different from 0 and the queue has reached its upper bound + * of tx_qlen elements, replace older packets lowest-priority-first. 
+ */ +static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) +{ + struct sk_buff *skb, *best = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (best == NULL || skb->priority > best->priority) + best = skb; + return best; +} + +static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) +{ + struct sk_buff *skb, *worst = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (worst == NULL || skb->priority < worst->priority) + worst = skb; + return worst; +} + +static bool qpolicy_prio_full(struct sock *sk) +{ + if (qpolicy_simple_full(sk)) + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); + return false; +} + +/** + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface + * @push: add a new @skb to the write queue + * @full: indicates that no more packets will be admitted + * @top: peeks at whatever the queueing policy defines as its `top' + */ +static struct dccp_qpolicy_operations { + void (*push) (struct sock *sk, struct sk_buff *skb); + bool (*full) (struct sock *sk); + struct sk_buff* (*top) (struct sock *sk); + +} qpol_table[DCCPQ_POLICY_MAX] = { + [DCCPQ_POLICY_SIMPLE] = { + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + }, + [DCCPQ_POLICY_PRIO] = { + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + }, +}; + +/* + * Externally visible interface + */ +void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) +{ + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); +} + +bool dccp_qpolicy_full(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); +} + +void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + skb_unlink(skb, &sk->sk_write_queue); + kfree_skb(skb); + } +} + +struct sk_buff *dccp_qpolicy_top(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); +} + +struct sk_buff *dccp_qpolicy_pop(struct sock *sk) +{ + struct sk_buff *skb = dccp_qpolicy_top(sk); + + /* Clear any skb fields that we used internally */ + skb->priority = 0; + + if (skb) + skb_unlink(skb, &sk->sk_write_queue); + return skb; +} -- cgit v1.2.3-70-g09d2 From 410e27a49bb98bc7fa3ff5fc05cc313817b9f253 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 9 Sep 2008 13:27:22 +0200 Subject: This reverts "Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp" as it accentally contained the wrong set of patches. These will be submitted separately. 
Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 54 +- include/linux/dccp.h | 122 ++- include/net/tcp.h | 15 - net/dccp/Kconfig | 3 + net/dccp/Makefile | 5 +- net/dccp/ackvec.c | 619 ++++++------ net/dccp/ackvec.h | 204 ++-- net/dccp/ccid.c | 101 +- net/dccp/ccid.h | 113 +-- net/dccp/ccids/Kconfig | 30 +- net/dccp/ccids/ccid2.c | 622 +++++++----- net/dccp/ccids/ccid2.h | 63 +- net/dccp/ccids/ccid3.c | 762 +++++++++------ net/dccp/ccids/ccid3.h | 153 +-- net/dccp/ccids/lib/loss_interval.c | 30 +- net/dccp/ccids/lib/loss_interval.h | 4 +- net/dccp/ccids/lib/packet_history.c | 282 +++--- net/dccp/ccids/lib/packet_history.h | 78 +- net/dccp/ccids/lib/tfrc.h | 16 - net/dccp/ccids/lib/tfrc_equation.c | 29 +- net/dccp/dccp.h | 104 +- net/dccp/diag.c | 2 +- net/dccp/feat.c | 1805 +++++++++-------------------------- net/dccp/feat.h | 144 +-- net/dccp/input.c | 164 ++-- net/dccp/ipv4.c | 4 +- net/dccp/ipv6.c | 4 +- net/dccp/minisocks.c | 87 +- net/dccp/options.c | 341 ++++--- net/dccp/output.c | 279 ++---- net/dccp/probe.c | 75 +- net/dccp/proto.c | 281 ++---- net/dccp/qpolicy.c | 137 --- net/dccp/sysctl.c | 64 +- net/dccp/timer.c | 42 +- net/ipv4/tcp_input.c | 17 +- 36 files changed, 2884 insertions(+), 3971 deletions(-) delete mode 100644 net/dccp/qpolicy.c (limited to 'Documentation') diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index fcfc1253442..39131a3c78f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,25 +45,6 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows: - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, @@ -76,24 +57,6 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint (see include/linux/dccp.h for symbolic constants). -The caller needs to provide a sufficiently large (> 2) array of type uint8_t. 
- -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferrable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is `int', not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait @@ -152,16 +115,23 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. +send_ndp = 1 + Whether or not to send NDP count options (sec. 7.7.2). + +send_ackvec = 1 + Whether or not to send Ack Vector options (sec. 11.5). + +ack_ratio = 2 + The default Ack Ratio (sec. 11.3) to use. + tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. + Default CCID for the sender-receiver half-connection. rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. + Default CCID for the receiver-sender half-connection. seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). + The initial sequence window (sec. 7.5.2). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 010e2d87ed7..6080449fbec 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,13 +165,9 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ - DCCPO_MAX_RX_CCID_SPECIFIC = 191, - DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ - DCCPO_MAX_TX_CCID_SPECIFIC = 255, + DCCPO_MIN_CCID_SPECIFIC = 128, + DCCPO_MAX_CCID_SPECIFIC = 255, }; -/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ -#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -180,36 +176,27 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum dccp_feature_numbers { +enum { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, + DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, + DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, + DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, - DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 
8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* DCCP socket control message types for cmsg */ -enum dccp_cmsg_type { - DCCP_SCM_PRIORITY = 1, - DCCP_SCM_QPOLICY_MAX = 0xFFFF, - /* ^-- Up to here reserved exclusively for qpolicy parameters */ - DCCP_SCM_MAX -}; - -/* DCCP priorities for outgoing/queued packets */ -enum dccp_packet_dequeueing_policy { - DCCPQ_POLICY_SIMPLE, - DCCPQ_POLICY_PRIO, - DCCPQ_POLICY_MAX +/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ +struct dccp_so_feat { + __u8 dccpsf_feat; + __u8 __user *dccpsf_val; + __u8 dccpsf_len; }; /* DCCP socket options */ @@ -221,12 +208,6 @@ enum dccp_packet_dequeueing_policy { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 -#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 -#define DCCP_SOCKOPT_CCID 13 -#define DCCP_SOCKOPT_TX_CCID 14 -#define DCCP_SOCKOPT_RX_CCID 15 -#define DCCP_SOCKOPT_QPOLICY_ID 16 -#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -374,13 +355,62 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } + +/* initial values for each feature */ +#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 +#define DCCPF_INITIAL_ACK_RATIO 2 +#define DCCPF_INITIAL_CCID DCCPC_CCID2 +#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 +/* FIXME: for now we're default to 1 but it should really be 0 */ +#define DCCPF_INITIAL_SEND_NDP_COUNT 1 + +/** + * struct dccp_minisock - Minimal DCCP connection representation + * + * Will be used to pass the state from dccp_request_sock to dccp_sock. + * + * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) + * @dccpms_ccid - Congestion Control Id (CCID) (section 10) + * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) + * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) + * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) + * @dccpms_pending - List of features being negotiated + * @dccpms_conf - + */ +struct dccp_minisock { + __u64 dccpms_sequence_window; + __u8 dccpms_rx_ccid; + __u8 dccpms_tx_ccid; + __u8 dccpms_send_ack_vector; + __u8 dccpms_send_ndp_count; + __u8 dccpms_ack_ratio; + struct list_head dccpms_pending; + struct list_head dccpms_conf; +}; + +struct dccp_opt_conf { + __u8 *dccpoc_val; + __u8 dccpoc_len; +}; + +struct dccp_opt_pend { + struct list_head dccpop_node; + __u8 dccpop_type; + __u8 dccpop_feat; + __u8 *dccpop_val; + __u8 dccpop_len; + int dccpop_conf; + struct dccp_opt_conf *dccpop_sc; +}; + +extern void dccp_minisock_init(struct dccp_minisock *dmsk); + /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -390,7 +420,6 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; - struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -462,28 +491,21 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - 
feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) + * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) + * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -507,26 +529,19 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; + __u16 dccps_pcslen; + __u16 dccps_pcrlen; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct list_head dccps_featneg; + struct dccp_minisock dccps_minisock; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; @@ -535,6 +550,11 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } +static inline struct dccp_minisock *dccp_msk(const struct sock *sk) +{ + return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; +} + static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 6bc4b8148ca..8983386356a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,21 +782,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) -/* - * Convert RFC3390 larger initial windows into an equivalent number of packets. 
- * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than a multiplier in the relevant range. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 bytes) -{ - return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); -} - extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 206c16ad9c3..7aa2a7acc7e 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -25,6 +25,9 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m +config IP_DCCP_ACKVEC + bool + source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 0c1c9af2bf7..f4f8793aaff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,6 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - qpolicy.o output.o proto.o timer.o ackvec.o +dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o dccp_ipv4-y := ipv4.o @@ -9,6 +8,8 @@ dccp_ipv4-y := ipv4.o obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o +dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o + obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 41819848bdd..1e8be246ad1 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,375 +1,445 @@ /* * net/dccp/ackvec.c * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ + +#include "ackvec.h" #include "dccp.h" + +#include +#include +#include #include +#include #include +#include + static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +static struct dccp_ackvec_record *dccp_ackvec_record_new(void) { - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); + struct dccp_ackvec_record *avr = + kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; + if (avr != NULL) + INIT_LIST_HEAD(&avr->avr_node); + + return avr; } -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) +static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) { - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); + if (unlikely(avr == NULL)) + return; + /* Check if deleting a linked record */ + WARN_ON(!list_empty(&avr->avr_node)); + kmem_cache_free(dccp_ackvec_record_slab, avr); } -void dccp_ackvec_free(struct dccp_ackvec *av) +static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - 
kmem_cache_free(dccp_ackvec_slab, av); + /* + * AVRs are sorted by seqno. Since we are sending them in order, we + * just add the AVR at the head of the list. + * -sorbo. + */ + if (!list_empty(&av->av_records)) { + const struct dccp_ackvec_record *head = + list_entry(av->av_records.next, + struct dccp_ackvec_record, + avr_node); + BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); } + + list_add(&avr->avr_node, &av->av_records); } -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) +int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + /* Figure out how many options do we need to represent the ackvec */ + const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + u16 len = av->av_vec_len + 2 * nr_opts, i; + u32 elapsed_time; + const unsigned char *tail, *from; + unsigned char *to; struct dccp_ackvec_record *avr; + suseconds_t delta; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + delta = ktime_us_delta(ktime_get_real(), av->av_time); + elapsed_time = delta / 10; - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + if (elapsed_time != 0 && + dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) + return -1; + + avr = dccp_ackvec_record_new(); if (avr == NULL) - return -ENOBUFS; + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + len = av->av_vec_len; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; + + for (i = 0; i < nr_opts; ++i) { + int copylen = len; + + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + copylen = DCCP_MAX_ACKVEC_OPT_LEN; + + *to++ = DCCPO_ACK_VECTOR_0; + *to++ = copylen + 2; + + /* Check if buf_head wraps */ + if (from + copylen > tail) { + const u16 tailsize = tail - from; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + copylen -= tailsize; + from = av->av_buf; + } + + memcpy(to, from, copylen); + from += copylen; + to += copylen; + len -= copylen; + } - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. + * From RFC 4340, A.2: + * + * For each acknowledgement it sends, the HC-Receiver will add an + * acknowledgement record. ack_seqno will equal the HC-Receiver + * sequence number it used for the ack packet; ack_ptr will equal + * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will + * equal buf_nonce. 
*/ - list_add(&avr->avr_node, &av->av_records); + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = av->av_buf_nonce; + avr->avr_sent_len = av->av_vec_len; - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", + dccp_ackvec_insert_avr(av, avr); + + dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu\n", + dccp_role(sk), avr->avr_sent_len, (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); + (unsigned long long)avr->avr_ack_ackno); return 0; } -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; + struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); + + if (av != NULL) { + av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; + av->av_buf_ackno = UINT48_MAX + 1; + av->av_buf_nonce = 0; + av->av_time = ktime_set(0, 0); + av->av_vec_len = 0; + INIT_LIST_HEAD(&av->av_records); } - return NULL; + + return av; } -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) +void dccp_ackvec_free(struct dccp_ackvec *av) { - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; + if (unlikely(av == NULL)) + return; + + if (!list_empty(&av->av_records)) { + struct dccp_ackvec_record *avr, *next; + + list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } + } + + kmem_cache_free(dccp_ackvec_slab, av); } -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) +static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, + const u32 index) { - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); + return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; } -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) +static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, + const u32 index) { - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); + return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; } -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received +/* + * If several packets are missing, the HC-Receiver may prefer to enter multiple + * bytes with run length 0, rather than a single byte with a larger run length; + * this simplifies table updates if one of the missing packets arrives. 
*/ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) +static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, + const unsigned int packets, + const unsigned char state) { - u16 ptr = av->av_buf_head; + unsigned int gap; + long new_head; - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; + if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + return -ENOBUFS; - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); + gap = packets - 1; + new_head = av->av_buf_head - packets; - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; + if (new_head < 0) { + if (gap > 0) { + memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + gap + new_head + 1); + gap = -new_head; } + new_head += DCCP_MAX_ACKVEC_LEN; + } - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); + av->av_buf_head = new_head; - } while (ptr != av->av_buf_tail); -} + if (gap > 0) + memset(av->av_buf + av->av_buf_head + 1, + DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); -/* Mark @num entries after buf_head as "Not yet received". */ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); + av->av_buf[av->av_buf_head] = state; + av->av_vec_len += packets; + return 0; } -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received +/* + * Implements the RFC 4340, Appendix A */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) +int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) { - u32 num_cells = num_packets; + /* + * Check at the right places if the buffer is full, if it is, tell the + * caller to start dropping packets till the HC-Sender acks our ACK + * vectors, when we will free up space in av_buf. + * + * We may well decide to do buffer compression, etc, but for now lets + * just drop. + * + * From Appendix A.1.1 (`New Packets'): + * + * Of course, the circular buffer may overflow, either when the + * HC-Sender is sending data at a very high rate, when the + * HC-Receiver's acknowledgements are not reaching the HC-Sender, + * or when the HC-Sender is forgetting to acknowledge those acks + * (so the HC-Receiver is unable to clean up old state). 
In this + * case, the HC-Receiver should either compress the buffer (by + * increasing run lengths when possible), transfer its state to + * a larger buffer, or, as a last resort, drop all received + * packets, without processing them whatsoever, until its buffer + * shrinks again. + */ - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; + /* See if this is the first ackno being inserted */ + if (av->av_vec_len == 0) { + av->av_buf[av->av_buf_head] = state; + av->av_vec_len = 1; + } else if (after48(ackno, av->av_buf_ackno)) { + const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. + * Look if the state of this packet is the same as the + * previous ackno and if so if we can bump the head len. */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + if (delta == 1 && + dccp_ackvec_state(av, av->av_buf_head) == state && + dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) + av->av_buf[av->av_buf_head]++; + else if (dccp_ackvec_set_buf_head_state(av, delta, state)) + return -ENOBUFS; + } else { + /* + * A.1.2. Old Packets + * + * When a packet with Sequence Number S <= buf_ackno + * arrives, the HC-Receiver will scan the table for + * the byte corresponding to S. (Indexing structures + * could reduce the complexity of this scan.) + */ + u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); + u32 index = av->av_buf_head; - lost_packets -= len; + while (1) { + const u8 len = dccp_ackvec_len(av, index); + const u8 av_state = dccp_ackvec_state(av, index); + /* + * valid packets not yet in av_buf have a reserved + * entry, with a len equal to 0. + */ + if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && + len == 0 && delta == 0) { /* Found our + reserved seat! 
*/ + dccp_pr_debug("Found %llu reserved seat!\n", + (unsigned long long)ackno); + av->av_buf[index] = state; + goto out; + } + /* len == 0 means one packet */ + if (delta < len + 1) + goto out_duplicate; + + delta -= len + 1; + if (++index == DCCP_MAX_ACKVEC_LEN) + index = 0; } } - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; + av->av_buf_ackno = ackno; + av->av_time = ktime_get_real(); +out: + return 0; - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); +out_duplicate: + /* Duplicate packet */ + dccp_pr_debug("Received a dup or already considered lost " + "packet: %llu\n", (unsigned long long)ackno); + return -EILSEQ; } -/** - * dccp_ackvec_input - Register incoming packet in the buffer - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +static void dccp_ackvec_throw_record(struct dccp_ackvec *av, + struct dccp_ackvec_record *avr) { - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; + struct dccp_ackvec_record *next; - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; + /* sort out vector length */ + if (av->av_buf_head <= avr->avr_ack_ptr) + av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; + else + av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_buf_head + avr->avr_ack_ptr; - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { + /* free records */ + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del_init(&avr->avr_node); + dccp_ackvec_record_delete(avr); + } +} - *current_head += 1; - av->av_buf_ackno = seqno; +void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } + /* + * If we traverse backwards, it should be faster when we have large + * windows. We will be receiving ACKs for stuff we sent a while back + * -sorbo. + */ + list_for_each_entry_reverse(avr, &av->av_records, avr_node) { + if (ackno == avr->avr_ack_seqno) { + dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " + "ack_ackno=%llu, ACKED!\n", + dccp_role(sk), 1, + (unsigned long long)avr->avr_ack_seqno, + (unsigned long long)avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } else if (avr->avr_ack_seqno > ackno) + break; /* old news */ } } -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. 
- */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) - { - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; +static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, + struct sock *sk, u64 *ackno, + const unsigned char len, + const unsigned char *vector) +{ + unsigned char i; + struct dccp_ackvec_record *avr; - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) + /* Check if we actually sent an ACK vector */ + if (list_empty(&av->av_records)) return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + i = len; /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. + * XXX + * I think it might be more efficient to work backwards. See comment on + * rcv_ackno. -sorbo. */ - if (runlen_now > eff_runlen) { + avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); + while (i--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl; - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + dccp_set_seqno(&ackno_end_rl, *ackno - rl); - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. + * If our AVR sequence number is greater than the ack, go + * forward in the AVR list until it is not so. */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. - */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + list_for_each_entry_from(avr, &av->av_records, avr_node) { + if (!after48(avr->avr_ack_seqno, *ackno)) + goto found; + } + /* End of the av_records list, not found, exit */ + break; +found: + if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { + const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { + dccp_pr_debug("%s ACK vector 0, len=%d, " + "ack_seqno=%llu, ack_ackno=%llu, " + "ACKED!\n", + dccp_role(sk), len, + (unsigned long long) + avr->avr_ack_seqno, + (unsigned long long) + avr->avr_ack_ackno); + dccp_ackvec_throw_record(av, avr); + break; + } + /* + * If it wasn't received, continue scanning... we might + * find another one. 
+ */ + } -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); + dccp_set_seqno(ackno, ackno_end_rl - 1); + ++vector; } } -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) +int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; + if (len > DCCP_MAX_ACKVEC_OPT_LEN) + return -1; - list_add_tail(&new->node, head); + /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ + dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, + ackno, len, value); return 0; } -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { @@ -379,9 +449,10 @@ int __init dccp_ackvec_init(void) if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = + kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 6cdca79a99f..bcb64fb4ace 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -3,134 +3,156 @@ /* * net/dccp/ackvec.h * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK + * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include +#include #include #include -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. 
- */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F +/* Read about the ECN nonce to see why it is 253 */ +#define DCCP_MAX_ACKVEC_OPT_LEN 253 +/* We can spread an ack vector across multiple options */ +#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_RECEIVED 0 +#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) +#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} +#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ +#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ -/** struct dccp_ackvec - Ack Vector main data structure +/** struct dccp_ackvec - ack vector + * + * This data structure is the one defined in RFC 4340, Appendix A. * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. + * @av_buf_head - circular buffer head + * @av_buf_tail - circular buffer tail + * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the + * buffer (i.e. %av_buf_head) + * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked + * by the buffer with State 0 * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + * Additionally, the HC-Receiver must keep some information about the + * Ack Vectors it has recently sent. For each packet sent carrying an + * Ack Vector, it remembers four variables: + * + * @av_records - list of dccp_ackvec_record + * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. + * + * @av_time - the time in usecs + * @av_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; + u64 av_buf_ackno; struct list_head av_records; + ktime_t av_time; + u16 av_buf_head; + u16 av_vec_len; + u8 av_buf_nonce; + u8 av_ack_nonce; + u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; -/** struct dccp_ackvec_record - Records information about sent Ack Vectors +/** struct dccp_ackvec_record - ack vector record * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. + * ACK vector record as defined in Appendix A of spec. 
* - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent + * The list is sorted by avr_ack_seqno * - * The list as a whole is sorted in descending order by @avr_ack_seqno. + * @avr_node - node in av_records + * @avr_ack_seqno - sequence number of the packet this record was sent on + * @avr_ack_ackno - sequence number being acknowledged + * @avr_ack_ptr - pointer into av_buf where this record starts + * @avr_ack_nonce - av_ack_nonce at the time this record was sent + * @avr_sent_len - lenght of the record in av_buf */ struct dccp_ackvec_record { struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; + u64 avr_ack_seqno; + u64 avr_ack_ackno; u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; + u16 avr_sent_len; + u8 avr_ack_nonce; }; -extern int dccp_ackvec_init(void); +struct sock; +struct sk_buff; + +#ifdef CONFIG_IP_DCCP_ACKVEC +extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); extern void dccp_ackvec_free(struct dccp_ackvec *av); -extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); +extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state); + +extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno); +extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + u64 *ackno, const u8 opt, + const u8 *value, const u8 len); -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) +extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return av->av_vec_len; +} +#else /* CONFIG_IP_DCCP_ACKVEC */ +static inline int dccp_ackvec_init(void) { - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; + return 0; } -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * This structure is used by CCIDs to access Ack Vectors in a received skb. 
- */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; +static inline void dccp_ackvec_exit(void) +{ +} + +static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +{ + return NULL; +} + +static inline void dccp_ackvec_free(struct dccp_ackvec *av) +{ +} + +static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, + const u64 ackno, const u8 state) +{ + return -1; +} -extern int dccp_ackvec_parsed_add(struct list_head *head, - u8 *vec, u8 len, u8 nonce); -extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); +static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, + struct sock *sk, const u64 ackno) +{ +} + +static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, + const u64 *ackno, const u8 opt, + const u8 *value, const u8 len) +{ + return -1; +} + +static inline int dccp_insert_option_ackvec(const struct sock *sk, + const struct sk_buff *skb) +{ + return -1; +} + +static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +{ + return 0; +} +#endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index e3fb52b4f5c..4809753d12a 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,13 +13,6 @@ #include "ccid.h" -static u8 builtin_ccids[] = { - DCCPC_CCID2, /* CCID2 is supported by default */ -#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) - DCCPC_CCID3, -#endif -}; - static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -93,47 +86,6 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - u8 i, j, found; - - for (i = 0, found = 0; i < array_len; i++, found = 0) { - for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) - found = (ccid_array[i] == builtin_ccids[j]); - if (!found) - return false; - } - return true; -} - -/** - * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - *array_len = ARRAY_SIZE(builtin_ccids); - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - if (len < sizeof(builtin_ccids)) - return -EINVAL; - - if (put_user(sizeof(builtin_ccids), optlen) || - copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) - return -EFAULT; - return 0; -} - int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; @@ -196,41 +148,22 @@ int ccid_unregister(struct ccid_operations *ccid_ops) EXPORT_SYMBOL_GPL(ccid_unregister); -/** - * ccid_request_module - Pre-load CCID module for later use - * This should be called only from process context (e.g. during connection - * setup) and is necessary for later calls to ccid_new (typically in software - * interrupt), so that it has the modules available when they are needed. 
- */ -static int ccid_request_module(u8 id) -{ - if (!in_atomic()) { - ccids_read_lock(); - if (ccids[id] == NULL) { - ccids_read_unlock(); - return request_module("net-dccp-ccid-%d", id); - } - ccids_read_unlock(); - } - return 0; -} - -int ccid_request_modules(u8 const *ccid_array, u8 array_len) -{ -#ifdef CONFIG_KMOD - while (array_len--) - if (ccid_request_module(ccid_array[array_len])) - return -1; -#endif - return 0; -} - struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) { struct ccid_operations *ccid_ops; struct ccid *ccid = NULL; ccids_read_lock(); +#ifdef CONFIG_KMOD + if (ccids[id] == NULL) { + /* We only try to load if in process context */ + ccids_read_unlock(); + if (gfp & GFP_ATOMIC) + goto out; + request_module("net-dccp-ccid-%d", id); + ccids_read_lock(); + } +#endif ccid_ops = ccids[id]; if (ccid_ops == NULL) goto out_unlock; @@ -272,6 +205,20 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); +struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 1, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_rx_new); + +struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) +{ + return ccid_new(id, sk, 0, gfp); +} + +EXPORT_SYMBOL_GPL(ccid_hc_tx_new); + static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index d27054ba215..fdeae7b5731 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -60,18 +60,22 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_rx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); + int (*ccid_hc_tx_parse_options)(struct sock *sk, + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); + int more, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -99,78 +103,31 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } -extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); -extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} +extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, + gfp_t gfp); +extern struct ccid 
*ccid_hc_tx_new(unsigned char id, struct sock *sk, + gfp_t gfp); extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { + int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; + rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return rc; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) + int more, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, @@ -187,31 +144,27 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, + value); + return rc; } -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) + unsigned char option, + unsigned char len, u16 idx, + unsigned char* value) { - if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); + int rc = 0; + if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) + rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); + return 
rc; } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index fb168be2cb4..12275943eab 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,8 +1,10 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" + depends on EXPERIMENTAL config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like)" + tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" def_tristate IP_DCCP + select IP_DCCP_ACKVEC ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, Multiplicative Decrease (AIMD) congestion control with behavior @@ -34,7 +36,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly)" + tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" def_tristate IP_DCCP select IP_DCCP_TFRC_LIB ---help--- @@ -62,9 +64,9 @@ config IP_DCCP_CCID3 If in doubt, say M. -if IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" + depends on IP_DCCP_CCID3 ---help--- Enable CCID3-specific debugging messages. @@ -74,29 +76,10 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -choice - prompt "Select method for measuring the packet size s" - default IP_DCCP_CCID3_MEASURE_S_AS_MPS - -config IP_DCCP_CCID3_MEASURE_S_AS_MPS - bool "Always use MPS in place of s" - ---help--- - This use is recommended as it is consistent with the initialisation - of X and suggested when s varies (rfc3448bis, (1) in section 4.1). -config IP_DCCP_CCID3_MEASURE_S_AS_AVG - bool "Use moving average" - ---help--- - An alternative way of tracking s, also supported by rfc3448bis. - This used to be the default for CCID-3 in previous kernels. -config IP_DCCP_CCID3_MEASURE_S_AS_MAX - bool "Track the maximum payload length" - ---help--- - An experimental method based on tracking the maximum packet size. -endchoice - config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 + depends on IP_DCCP_CCID3 && EXPERIMENTAL ---help--- Use higher lower bound for nofeedback timer expiration. @@ -123,7 +106,6 @@ config IP_DCCP_CCID3_RTO The purpose of the nofeedback timer is to slow DCCP down when there is serious network congestion: experimenting with larger values should therefore not be performed on WANs. -endif # IP_DCCP_CCID3 config IP_DCCP_TFRC_LIB tristate diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa713227c66..9a430734530 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ -#include "../feat.h" + #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -34,8 +34,51 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) 
DCCP_PR_DEBUG(ccid2_debug, format, ##a) + +static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) +{ + int len = 0; + int pipe = 0; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; + + /* there is data in the chain */ + if (seqp != hctx->ccid2hctx_seqt) { + seqp = seqp->ccid2s_prev; + len++; + if (!seqp->ccid2s_acked) + pipe++; + + while (seqp != hctx->ccid2hctx_seqt) { + struct ccid2_seq *prev = seqp->ccid2s_prev; + + len++; + if (!prev->ccid2s_acked) + pipe++; + + /* packets are sent sequentially */ + BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, + prev->ccid2s_seq ) >= 0); + BUG_ON(time_before(seqp->ccid2s_sent, + prev->ccid2s_sent)); + + seqp = prev; + } + } + + BUG_ON(pipe != hctx->ccid2hctx_pipe); + ccid2_pr_debug("len of chain=%d\n", len); + + do { + seqp = seqp->ccid2s_prev; + len++; + } while (seqp != hctx->ccid2hctx_seqh); + + ccid2_pr_debug("total len=%d\n", len); + BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); +} #else #define ccid2_pr_debug(format, a...) +#define ccid2_hc_tx_check_sanity(hctx) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) @@ -44,7 +87,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) + if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / + sizeof(struct ccid2_seq*))) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -60,35 +104,38 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->seqbufc == 0) - hctx->seqh = hctx->seqt = seqp; + if (hctx->ccid2hctx_seqbufc == 0) + hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->seqh; + hctx->ccid2hctx_seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hctx->ccid2hctx_seqh; - hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; + hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->seqbuf[hctx->seqbufc] = seqp; - hctx->seqbufc++; + hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; + hctx->ccid2hctx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + return 0; + + return 1; /* XXX CCID should dequeue when ready instead of polling */ } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -100,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > DCCPF_ACK_RATIO_MAX) - val = DCCPF_ACK_RATIO_MAX; + if (val > 0xFFFF) /* RFC 4340, 11.3 */ + val = 0xFFFF; if (val == 
dp->dccps_l_ack_ratio) return; @@ -110,77 +157,99 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } +static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) +{ + ccid2_pr_debug("change SRTT to %ld\n", val); + hctx->ccid2hctx_srtt = val; +} + +static void ccid2_start_rto_timer(struct sock *sk); + static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); + long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); + ccid2_hc_tx_check_sanity(hctx); + /* back-off timer */ - hctx->rto <<= 1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; + hctx->ccid2hctx_rto <<= 1; + + s = hctx->ccid2hctx_rto / HZ; + if (s > 60) + hctx->ccid2hctx_rto = 60 * HZ; + + ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - hctx->ssthresh = hctx->cwnd / 2; - if (hctx->ssthresh < 2) - hctx->ssthresh = 2; - hctx->cwnd = 1; - hctx->pipe = 0; + hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; + if (hctx->ccid2hctx_ssthresh < 2) + hctx->ccid2hctx_ssthresh = 2; + hctx->ccid2hctx_cwnd = 1; + hctx->ccid2hctx_pipe = 0; /* clear state about stuff we sent */ - hctx->seqt = hctx->seqh; - hctx->packets_acked = 0; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; + hctx->ccid2hctx_packets_acked = 0; /* clear ack ratio state. */ - hctx->rpseq = 0; - hctx->rpdupack = -1; + hctx->ccid2hctx_rpseq = 0; + hctx->ccid2hctx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - /* restart backed-off timer */ - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + ccid2_hc_tx_check_sanity(hctx); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid2_start_rto_timer(struct sock *sk) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); + + BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); + sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, + jiffies + hctx->ccid2hctx_rto); +} + +static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - hctx->pipe++; + hctx->ccid2hctx_pipe++; - hctx->seqh->ccid2s_seq = dp->dccps_gss; - hctx->seqh->ccid2s_acked = 0; - hctx->seqh->ccid2s_sent = jiffies; + hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; + hctx->ccid2hctx_seqh->ccid2s_acked = 0; + hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; - next = hctx->seqh->ccid2s_next; + next = hctx->ccid2hctx_seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->seqt) { + if (next == hctx->ccid2hctx_seqt) { if (ccid2_hc_tx_alloc_seq(hctx)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->seqh->ccid2s_next; - BUG_ON(next == hctx->seqt); + next = hctx->ccid2hctx_seqh->ccid2s_next; + BUG_ON(next == hctx->ccid2hctx_seqt); } - hctx->seqh = next; + hctx->ccid2hctx_seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); + 
ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, + hctx->ccid2hctx_pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -203,12 +272,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->arsent++; + hctx->ccid2hctx_arsent++; /* We had an ack loss in this window... */ - if (hctx->ackloss) { - if (hctx->arsent >= hctx->cwnd) { - hctx->arsent = 0; - hctx->ackloss = 0; + if (hctx->ccid2hctx_ackloss) { + if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_arsent = 0; + hctx->ccid2hctx_ackloss = 0; } } else { /* No acks lost up to now... */ @@ -218,28 +287,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->cwnd * hctx->cwnd / denom; + denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; - if (hctx->arsent >= denom) { + if (hctx->ccid2hctx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->arsent = 0; + hctx->ccid2hctx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->arsent = 0; /* or maybe set it to cwnd*/ + hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif /* setup RTO timer */ - if (!timer_pending(&hctx->rtotimer)) - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); + if (!timer_pending(&hctx->ccid2hctx_rtotimer)) + ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->seqt; + struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; - while (seqp != hctx->seqh) { + while (seqp != hctx->ccid2hctx_seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); @@ -247,158 +316,205 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); + ccid2_hc_tx_check_sanity(hctx); #endif } -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. +/* XXX Lame code duplication! + * returns -1 if none was found. + * else returns the next offset to use in the function call. */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, + unsigned char **vec, unsigned char *veclen) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - long m = mrtt ? 
: 1; - - if (hctx->srtt == 0) { - /* First measurement m */ - hctx->srtt = m << 3; - hctx->mdev = m << 1; - - hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); - hctx->rttvar = hctx->mdev_max; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hctx->srtt >> 3); - hctx->srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hctx->mdev >> 2); + const struct dccp_hdr *dh = dccp_hdr(skb); + unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); + unsigned char *opt_ptr; + const unsigned char *opt_end = (unsigned char *)dh + + (dh->dccph_doff * 4); + unsigned char opt, len; + unsigned char *value; + + BUG_ON(offset < 0); + options += offset; + opt_ptr = options; + if (opt_ptr >= opt_end) + return -1; + + while (opt_ptr != opt_end) { + opt = *opt_ptr++; + len = 0; + value = NULL; + + /* Check if this isn't a single byte option */ + if (opt > DCCPO_MAX_RESERVED) { + if (opt_ptr == opt_end) + goto out_invalid_option; + + len = *opt_ptr++; + if (len < 3) + goto out_invalid_option; /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). + * Remove the type and len fields, leaving + * just the value size */ - if (m > 0) - m >>= 3; - } else { - m -= (hctx->mdev >> 2); - } - hctx->mdev += m; + len -= 2; + value = opt_ptr; + opt_ptr += len; - if (hctx->mdev > hctx->mdev_max) { - hctx->mdev_max = hctx->mdev; - if (hctx->mdev_max > hctx->rttvar) - hctx->rttvar = hctx->mdev_max; + if (opt_ptr > opt_end) + goto out_invalid_option; } - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe, AWL is probably - * too low as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { - if (hctx->mdev_max < hctx->rttvar) - hctx->rttvar -= (hctx->rttvar - - hctx->mdev_max) >> 2; - hctx->rtt_seq = dccp_sk(sk)->dccps_gss; - hctx->mdev_max = TCP_RTO_MIN; + switch (opt) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + *vec = value; + *veclen = len; + return offset + (opt_ptr - options); } } - /* - * Set RTO from SRTT and RTTVAR - * Clock granularity is ignored since the minimum error for RTTVAR is - * clamped to 50msec (corresponding to HZ=20). This leads to a minimum - * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP - * does not retransmit data, DCCP does not require TCP's recommended - * minimum timeout of one second". - */ - hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; + return -1; - if (hctx->rto > DCCP_RTO_MAX) - hctx->rto = DCCP_RTO_MAX; +out_invalid_option: + DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); + return -1; } -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->cwnd < hctx->ssthresh) { - if (*maxincr > 0 && ++hctx->packets_acked == 2) { - hctx->cwnd += 1; - *maxincr -= 1; - hctx->packets_acked = 0; - } - } else if (++hctx->packets_acked >= hctx->cwnd) { - hctx->cwnd += 1; - hctx->packets_acked = 0; - } - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). 
- * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps - need to be resolved at some time. - */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); + ccid2_pr_debug("deleted RTO timer\n"); } -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +static inline void ccid2_new_ack(struct sock *sk, + struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { + if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { + hctx->ccid2hctx_cwnd += 1; + *maxincr -= 1; + hctx->ccid2hctx_packets_acked = 0; + } + } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { + hctx->ccid2hctx_cwnd += 1; + hctx->ccid2hctx_packets_acked = 0; } - hctx->last_cong = jiffies; + /* update RTO */ + if (hctx->ccid2hctx_srtt == -1 || + time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { + unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; + int s; + + /* first measurement */ + if (hctx->ccid2hctx_srtt == -1) { + ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", + r, jiffies, + (unsigned long long)seqp->ccid2s_seq); + ccid2_change_srtt(hctx, r); + hctx->ccid2hctx_rttvar = r >> 1; + } else { + /* RTTVAR */ + long tmp = hctx->ccid2hctx_srtt - r; + long srtt; + + if (tmp < 0) + tmp *= -1; + + tmp >>= 2; + hctx->ccid2hctx_rttvar *= 3; + hctx->ccid2hctx_rttvar >>= 2; + hctx->ccid2hctx_rttvar += tmp; + + /* SRTT */ + srtt = hctx->ccid2hctx_srtt; + srtt *= 7; + srtt >>= 3; + tmp = r >> 3; + srtt += tmp; + ccid2_change_srtt(hctx, srtt); + } + s = hctx->ccid2hctx_rttvar << 2; + /* clock granularity is 1 when based on jiffies */ + if (!s) + s = 1; + hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; + + /* must be at least a second */ + s = hctx->ccid2hctx_rto / HZ; + /* DCCP doesn't require this [but I like it cuz my code sux] */ +#if 1 + if (s < 1) + hctx->ccid2hctx_rto = HZ; +#endif + /* max 60 seconds */ + if (s > 60) + hctx->ccid2hctx_rto = HZ * 60; - hctx->cwnd = hctx->cwnd / 2 ? 
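/* GNU "?:" shorthand: the new cwnd is cwnd/2 when that is non-zero, otherwise 1 (e.g. cwnd 5 -> 2, cwnd 1 -> 1), so the window is halved but never drops below one packet */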
: 1U; - hctx->ssthresh = max(hctx->cwnd, 2U); + hctx->ccid2hctx_lastrtt = jiffies; - /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) - ccid2_change_l_ack_ratio(sk, hctx->cwnd); + ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", + hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, + hctx->ccid2hctx_rto, HZ, r); + } + + /* we got a new ack, so re-start RTO timer */ + ccid2_hc_tx_kill_rto_timer(sk); + ccid2_start_rto_timer(sk); } -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); + if (hctx->ccid2hctx_pipe == 0) + DCCP_BUG("pipe == 0"); + else + hctx->ccid2hctx_pipe--; + + if (hctx->ccid2hctx_pipe == 0) + ccid2_hc_tx_kill_rto_timer(sk); +} + +static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); + return; } - return 0; + + hctx->ccid2hctx_last_cong = jiffies; + + hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; + hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + + /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) + ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); - struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; + unsigned char *vector; + unsigned char veclen; + int offset = 0; int done = 0; unsigned int maxincr = 0; + ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -407,21 +523,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. 
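 * (Roughly: the first packet seeds rpseq; strictly consecutive arrivals advance it, while any later, non-consecutive seqno counts as a reverse-path "dupack". Once NUMDUPACK such gaps have accumulated, the counters are reset and the local Ack Ratio is doubled via ccid2_change_l_ack_ratio(), e.g. from 2 to 4, so that the peer acknowledges less frequently.)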
*/ /* need to bootstrap */ - if (hctx->rpdupack == -1) { - hctx->rpdupack = 0; - hctx->rpseq = seqno; + if (hctx->ccid2hctx_rpdupack == -1) { + hctx->ccid2hctx_rpdupack = 0; + hctx->ccid2hctx_rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) - hctx->rpseq = seqno; + if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) + hctx->ccid2hctx_rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->rpseq)) { - hctx->rpdupack++; + else if (after48(seqno, hctx->ccid2hctx_rpseq)) { + hctx->ccid2hctx_rpdupack++; /* check if we got enough dupacks */ - if (hctx->rpdupack >= NUMDUPACK) { - hctx->rpdupack = -1; /* XXX lame */ - hctx->rpseq = 0; + if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { + hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ + hctx->ccid2hctx_rpseq = 0; ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } @@ -429,22 +545,27 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) + /* still didn't send out new data packets */ + if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) return; - /* still didn't send out new data packets */ - if (hctx->seqh == hctx->seqt) - goto done; + switch (DCCP_SKB_CB(skb)->dccpd_type) { + case DCCP_PKT_ACK: + case DCCP_PKT_DATAACK: + break; + default: + return; + } ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->high_ack)) - hctx->high_ack = ackno; + if (after48(ackno, hctx->ccid2hctx_high_ack)) + hctx->ccid2hctx_high_ack = ackno; - seqp = hctx->seqt; + seqp = hctx->ccid2hctx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -454,26 +575,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->cwnd < hctx->ssthresh) + if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - list_for_each_entry(avp, &hctx->av_chunks, node) { + while ((offset = ccid2_ackvector(sk, skb, offset, + &vector, &veclen)) != -1) { /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); + while (veclen--) { + const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + u64 ackno_end_rl = SUB48(ackno, rl); - ccid2_pr_debug("ackvec %llu |%u,%u|\n", + ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); + (unsigned long long)ackno_end_rl); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. 
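 * (Each Ack Vector cell carries a state and a run length rl, covering rl + 1 consecutive sequence numbers ending at the current ackno; once a run has been processed, ackno is stepped past it via SUB48(ackno_end_rl, 1) and the walk continues with the next cell.)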
*/ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -486,24 +607,26 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); + const u8 state = *vector & + DCCP_ACKVEC_STATE_MASK; /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && + if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) + if (state == + DCCP_ACKVEC_STATE_ECN_MARKED) { ccid2_congestion_event(sk, seqp); - else + } else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) { + if (seqp == hctx->ccid2hctx_seqt) { done = 1; break; } @@ -513,6 +636,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); + vector++; } if (done) break; @@ -521,11 +645,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->seqt; - while (before48(seqp->ccid2s_seq, hctx->high_ack)) { + seqp = hctx->ccid2hctx_seqt; + while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->seqh) { - seqp = hctx->seqh->ccid2s_prev; + if (seqp == hctx->ccid2hctx_seqh) { + seqp = hctx->ccid2hctx_seqh->ccid2s_prev; break; } } @@ -536,7 +660,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } @@ -557,34 +681,25 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. 
*/ ccid2_congestion_event(sk, seqp); - hctx->pipe--; + ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->seqt) + if (seqp == hctx->ccid2hctx_seqt) break; seqp = seqp->ccid2s_prev; } - hctx->seqt = last_acked; + hctx->ccid2hctx_seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->seqt != hctx->seqh) { - if (!hctx->seqt->ccid2s_acked) + while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { + if (!hctx->ccid2hctx_seqt->ccid2s_acked) break; - hctx->seqt = hctx->seqt->ccid2s_next; + hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; } - /* restart RTO timer if not all outstanding data has been acked */ - if (hctx->pipe == 0) - sk_stop_timer(sk, &hctx->rtotimer); - else - sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) - tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - dccp_ackvec_parsed_cleanup(&hctx->av_chunks); + ccid2_hc_tx_check_sanity(hctx); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -594,13 +709,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ssthresh = ~0U; + hctx->ccid2hctx_ssthresh = ~0U; - /* Use larger initial windows (RFC 3390, rfc2581bis) */ - hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); + /* + * RFC 4341, 5: "The cwnd parameter is initialized to at most four + * packets for new connections, following the rules from [RFC3390]". + * We need to convert the bytes of RFC3390 into the packets of RFC 4341. + */ + hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); /* Make sure that Ack Ratio is enabled and within bounds. 
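 * For example, an MSS of 1460 bytes gives cwnd = clamp(4380/1460, 2, 4) = 3 packets and thus max_ratio = DIV_ROUND_UP(3, 2) = 2, while a small MSS of 536 gives 4380/536 = 8, clamped to the 4-packet ceiling of RFC 3390.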
*/ - max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); + max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; @@ -608,11 +727,15 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->rto = DCCP_TIMEOUT_INIT; - hctx->rpdupack = -1; - hctx->last_cong = jiffies; - setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - INIT_LIST_HEAD(&hctx->av_chunks); + hctx->ccid2hctx_rto = 3 * HZ; + ccid2_change_srtt(hctx, -1); + hctx->ccid2hctx_rttvar = -1; + hctx->ccid2hctx_rpdupack = -1; + hctx->ccid2hctx_last_cong = jiffies; + setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, + (unsigned long)sk); + + ccid2_hc_tx_check_sanity(hctx); return 0; } @@ -621,11 +744,11 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); int i; - sk_stop_timer(sk, &hctx->rtotimer); + ccid2_hc_tx_kill_rto_timer(sk); - for (i = 0; i < hctx->seqbufc; i++) - kfree(hctx->seqbuf[i]); - hctx->seqbufc = 0; + for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) + kfree(hctx->ccid2hctx_seqbuf[i]); + hctx->ccid2hctx_seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -636,28 +759,27 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: - hcrx->data++; - if (hcrx->data >= dp->dccps_r_ack_ratio) { + hcrx->ccid2hcrx_data++; + if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { dccp_send_ack(sk); - hcrx->data = 0; + hcrx->ccid2hcrx_data = 0; } break; } } static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_owner = THIS_MODULE, + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 8b7a2dee2f6..2c94ca02901 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,49 +42,34 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @srtt: smoothed RTT estimate, scaled by 2^3 - * @mdev: smoothed RTT variation, scaled by 2^2 - * @mdev_max: maximum of @mdev during one flight - * @rttvar: moving average/maximum of @mdev_max - * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @rtt_seq: to decay RTTVAR at most once per flight - * @rpseq: last consecutive seqno - * @rpdupack: 
dupacks since rpseq - * @av_chunks: list of Ack Vectors received on current skb - */ + * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) + * @ccid2hctx_lastrtt -time RTT was last measured + * @ccid2hctx_rpseq - last consecutive seqno + * @ccid2hctx_rpdupack - dupacks since rpseq +*/ struct ccid2_hc_tx_sock { - u32 cwnd; - u32 ssthresh; - u32 pipe; - u32 packets_acked; - struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; - int seqbufc; - struct ccid2_seq *seqh; - struct ccid2_seq *seqt; - /* RTT measurement: variables/principles are the same as in TCP */ - u32 srtt, - mdev, - mdev_max, - rttvar, - rto; - u64 rtt_seq:48; - struct timer_list rtotimer; - u64 rpseq; - int rpdupack; - unsigned long last_cong; - u64 high_ack; - struct list_head av_chunks; + u32 ccid2hctx_cwnd; + u32 ccid2hctx_ssthresh; + u32 ccid2hctx_pipe; + u32 ccid2hctx_packets_acked; + struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; + int ccid2hctx_seqbufc; + struct ccid2_seq *ccid2hctx_seqh; + struct ccid2_seq *ccid2hctx_seqt; + long ccid2hctx_rto; + long ccid2hctx_srtt; + long ccid2hctx_rttvar; + unsigned long ccid2hctx_lastrtt; + struct timer_list ccid2hctx_rtotimer; + u64 ccid2hctx_rpseq; + int ccid2hctx_rpdupack; + unsigned long ccid2hctx_last_cong; + u64 ccid2hctx_high_ack; }; -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) -{ - return (hctx->pipe >= hctx->cwnd); -} - struct ccid2_hc_rx_sock { - int data; + int ccid2hcrx_data; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 06cfdad84a6..3b8bd7ca676 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,41 +49,75 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ -/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ -static int do_osc_prev = true; +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) +{ + static char *ccid3_state_names[] = { + [TFRC_SSTATE_NO_SENT] = "NO_SENT", + [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", + [TFRC_SSTATE_FBACK] = "FBACK", + [TFRC_SSTATE_TERM] = "TERM", + }; + + return ccid3_state_names[state]; +} +#endif + +static void ccid3_hc_tx_set_state(struct sock *sk, + enum ccid3_hc_tx_states state) +{ + struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_tx_state_name(oldstate), + ccid3_tx_state_name(state)); + WARN_ON(state == oldstate); + hctx->ccid3hctx_state = state; +} /* * Compute the initial sending rate X_init in the manner of RFC 3390: * - * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT + * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT * + * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis + * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. 
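 * As a worked example, with s = 1460 bytes and RTT = 100 ms: w_init = min(4 * 1460, max(2 * 1460, 4380)) = 4380 bytes, so X_init = 4380 / 0.1 s = 43800 bytes/sec, held internally as 43800 << 6 due to the 2^6 scaling.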
*/ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const u32 mps = dccp_sk(sk)->dccps_mss_cache, - w_init = clamp(4380U, 2 * mps, 4 * mps); + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + const __u32 w_init = clamp_t(__u32, 4380U, + 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); - return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); + return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); } -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X - * This respects the granularity of X (64 * bytes/second) and enforces the - * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. +/* + * Recalculate t_ipi and delta (should be called whenever X changes) */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { - if (unlikely(hctx->x <= hctx->s)) - hctx->x = hctx->s; - hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); + /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ + hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_x); + + /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ + hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, + TFRC_OPSYS_HALF_TIME_GRAN); + + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", + hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, + hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count); + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); - return delta / hctx->rtt; + return delta / hctx->ccid3hctx_rtt; } /** @@ -99,8 +133,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - u64 min_rate = 2 * hctx->x_recv; - const u64 old_x = hctx->x; + __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; + const __u64 old_x = hctx->ccid3hctx_x; ktime_t now = stamp ? 
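/* use the caller-supplied timestamp when one is given, otherwise sample the wall clock */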
*stamp : ktime_get_real(); /* @@ -111,44 +145,50 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hctx->x_recv); + min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); } - if (hctx->p > 0) { + if (hctx->ccid3hctx_p > 0) { - hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); + hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, + min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); - } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { + } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) + - (s64)hctx->ccid3hctx_rtt >= 0) { - hctx->x = min(2 * hctx->x, min_rate); - hctx->x = max(hctx->x, - scaled_div(((u64)hctx->s) << 6, hctx->rtt)); - hctx->t_ld = now; + hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, + scaled_div(((__u64)hctx->ccid3hctx_s) << 6, + hctx->ccid3hctx_rtt)); + hctx->ccid3hctx_t_ld = now; } - if (hctx->x != old_x) { + if (hctx->ccid3hctx_x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned)(old_x >> 6), - (unsigned)(hctx->x >> 6), hctx->x_calc, - (unsigned)(hctx->x_recv >> 6)); + (unsigned)(hctx->ccid3hctx_x >> 6), + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6)); ccid3_update_send_interval(hctx); } } /* - * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) - * @new_len: DCCP payload size in bytes (not used by all methods) + * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) + * @len: DCCP packet payload size in bytes */ -static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) +static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { -#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) - return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); -#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) - return max(ccid3_hc_tx_sk(sk)->s, new_len); -#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ - return dccp_sk(sk)->dccps_mss_cache; -#endif + const u16 old_s = hctx->ccid3hctx_s; + + hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); + + if (hctx->ccid3hctx_s != old_s) + ccid3_update_send_interval(hctx); } /* @@ -158,13 +198,13 @@ static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->t_last_win_count), - quarter_rtts = (4 * delta) / hctx->rtt; + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; if (quarter_rtts > 0) { - hctx->t_last_win_count = now; - hctx->last_win_count += min(quarter_rtts, 5U); - hctx->last_win_count &= 0xF; /* mod 16 */ + hctx->ccid3hctx_t_last_win_count = now; + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); + hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ } } @@ -181,26 +221,25 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto restart_timer; } - ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, - hctx->feedback ? 
"" : "out"); + ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, + ccid3_tx_state_name(hctx->ccid3hctx_state)); - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) goto out; - /* Reset feedback state to "no feedback received" */ - hctx->feedback = false; - /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. */ - if (hctx->t_rto == 0 || hctx->p == 0) { + if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ + hctx->ccid3hctx_p == 0) { /* halve send rate directly */ - hctx->x /= 2; + hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + TFRC_T_MBI); ccid3_update_send_interval(hctx); - } else { /* * Modify the cached value of X_recv @@ -212,41 +251,44 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->p && !hctx->x_calc); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - if (hctx->x_calc > (hctx->x_recv >> 5)) - hctx->x_recv /= 2; + if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) + hctx->ccid3hctx_x_recv = + max(hctx->ccid3hctx_x_recv / 2, + (((__u64)hctx->ccid3hctx_s) << 6) / + (2 * TFRC_T_MBI)); else { - hctx->x_recv = hctx->x_calc; - hctx->x_recv <<= 4; + hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; + hctx->ccid3hctx_x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hctx->x); + (unsigned long long)hctx->ccid3hctx_x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ + if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); restart_timer: - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @skb: next packet candidate to send on @sk - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. 
+/* + * returns + * > 0: delay (in msecs) that should pass before actually sending + * = 0: can send immediately + * < 0: error condition; do not send packet */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -263,14 +305,18 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - if (hctx->s == 0) { - sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + + switch (hctx->ccid3hctx_state) { + case TFRC_SSTATE_NO_SENT: + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hctx->last_win_count = 0; - hctx->t_last_win_count = now; + hctx->ccid3hctx_last_win_count = 0; + hctx->ccid3hctx_t_last_win_count = now; /* Set t_0 for initial packet */ - hctx->t_nom = now; + hctx->ccid3hctx_t_nom = now; + + hctx->ccid3hctx_s = skb->len; /* * Use initial RTT sample when available: recommended by erratum @@ -279,9 +325,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hctx->rtt = dp->dccps_syn_rtt; - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; } else { /* * Sender does not have RTT sample: @@ -289,20 +335,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ - hctx->rtt = DCCP_FALLBACK_RTT; - hctx->x = dp->dccps_mss_cache; - hctx->x <<= 6; + hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; + hctx->ccid3hctx_x = hctx->ccid3hctx_s; + hctx->ccid3hctx_x <<= 6; } - - /* Compute t_ipi = s / X */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); - /* Seed value for Oscillation Prevention (sec. 4.5) */ - hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); - - } else { - delay = ktime_us_delta(hctx->t_nom, now); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + break; + case TFRC_SSTATE_NO_FBACK: + case TFRC_SSTATE_FBACK: + delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] @@ -312,80 +355,99 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; + if (delay - (s64)hctx->ccid3hctx_delta >= 1000) + return (u32)delay / 1000L; ccid3_hc_tx_update_win_count(hctx, now); + break; + case TFRC_SSTATE_TERM: + DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); + return -EINVAL; } /* prepare to send now (add options etc.) 
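 * (In the NO_FBACK/FBACK states this point is only reached when t_nom lies less than delta + 1000 usec in the future; otherwise the branch above has already returned the delay, in whole milliseconds, that the caller should wait.)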
*/ dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; /* set the nominal send time for the next following packet */ - hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); - return CCID_PACKET_SEND_AT_ONCE; + hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); + return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, + unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - /* Changes to s will become effective the next time X is computed */ - hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); + ccid3_hc_tx_update_s(hctx, len); - if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) + if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; + struct ccid3_options_received *opt_recv; ktime_t now; unsigned long t_nfb; - u32 r_sample; + u32 pinv, r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) + /* ... 
and only in the established state */ + if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && + hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + return; + + opt_recv = &hctx->ccid3hctx_options_received; + now = ktime_get_real(); + + /* Estimate RTT from history if ACK number is valid */ + r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + DCCP_SKB_CB(skb)->dccpd_ack_seq, now); + if (r_sample == 0) { + DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, + dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), + (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); + } - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); + /* Update receive rate in units of 64 * bytes/second */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + hctx->ccid3hctx_x_recv <<= 6; + /* Update loss event rate (which is scaled by 1e6) */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + hctx->ccid3hctx_p = 0; + else /* can not exceed 100% */ + hctx->ccid3hctx_p = scaled_div(1, pinv); + /* + * Validate new RTT sample and update moving average + */ + r_sample = dccp_sample_rtt(sk, r_sample); + hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (!hctx->feedback) { - hctx->feedback = true; + if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - if (hctx->t_rto == 0) { + if (hctx->ccid3hctx_t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->x = rfc3390_initial_rate(sk); - hctx->t_ld = now; + hctx->ccid3hctx_x = rfc3390_initial_rate(sk); + hctx->ccid3hctx_t_ld = now; ccid3_update_send_interval(hctx); goto done_computing_x; - } else if (hctx->p == 0) { + } else if (hctx->ccid3hctx_p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ @@ -394,52 +456,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->p > 0) - hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); + if (hctx->ccid3hctx_p > 0) + hctx->ccid3hctx_x_calc = + tfrc_calc_x(hctx->ccid3hctx_s, + hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hctx->rtt, r_sample, - hctx->s, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6)); - /* - * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to - * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This - * can be useful if few connections share a link, avoiding that buffer - * fill levels (RTT) oscillate as a result of frequent adjustments to X. - * A useful presentation with background information is in - * Joerg Widmer, "Equation-Based Congestion Control", - * MSc Thesis, University of Mannheim, Germany, 2000 - * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). - */ - if (do_osc_prev) { - r_sample = tfrc_scaled_sqrt(r_sample); - /* - * The modulation can work in both ways: increase/decrease t_ipi - * according to long-term increases/decreases of the RTT. 
The - * former is a useful measure, since it works against queue - * build-up. The latter temporarily increases the sending rate, - * so that buffers fill up more quickly. This in turn causes - * the RTT to increase, so that either later reduction becomes - * necessary or the RTT stays at a very high level. Decreasing - * t_ipi is therefore not supported. - * Furthermore, during the initial slow-start phase the RTT - * naturally increases, where using the algorithm would cause - * delays. Hence it is disabled during the initial slow-start. - */ - if (r_sample > hctx->r_sqmean && hctx->p > 0) - hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, - hctx->r_sqmean); - hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); - /* update R_sqmean _after_ computing the modulation factor */ - hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); - } + dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + (unsigned)(hctx->ccid3hctx_x_recv >> 6), + (unsigned)(hctx->ccid3hctx_x >> 6)); /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->no_feedback_timer); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible @@ -453,66 +488,95 @@ done_computing_x: * This can help avoid triggering the nofeedback timer too * often ('spinning') on LANs with small RTTs. */ - hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, + (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); + dccp_role(sk), + sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->no_feedback_timer, + sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) +static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, + unsigned char len, u16 idx, + unsigned char *value) { + int rc = 0; + const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); + struct ccid3_options_received *opt_recv; __be32 opt_val; - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. 
RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); + opt_recv = &hctx->ccid3hctx_options_received; - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hctx->x_recv = opt_val; - hctx->x_recv <<= 6; + if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { + opt_recv->ccid3or_seqno = dp->dccps_gsr; + opt_recv->ccid3or_loss_event_rate = ~0; + opt_recv->ccid3or_loss_intervals_idx = 0; + opt_recv->ccid3or_loss_intervals_len = 0; + opt_recv->ccid3or_receive_rate = 0; + } - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); + switch (option) { + case TFRC_OPT_LOSS_EVENT_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_LOSS_EVENT_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; } else { - /* Update the fixpoint Loss Event Rate fraction */ - hctx->p = tfrc_invert_loss_event_rate(opt_val); - + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); + dccp_role(sk), sk, + opt_recv->ccid3or_loss_event_rate); } + break; + case TFRC_OPT_LOSS_INTERVALS: + opt_recv->ccid3or_loss_intervals_idx = idx; + opt_recv->ccid3or_loss_intervals_len = len; + ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", + dccp_role(sk), sk, + opt_recv->ccid3or_loss_intervals_idx, + opt_recv->ccid3or_loss_intervals_len); + break; + case TFRC_OPT_RECEIVE_RATE: + if (unlikely(len != 4)) { + DCCP_WARN("%s(%p), invalid len %d " + "for TFRC_OPT_RECEIVE_RATE\n", + dccp_role(sk), sk, len); + rc = -EINVAL; + } else { + opt_val = get_unaligned((__be32 *)value); + opt_recv->ccid3or_receive_rate = ntohl(opt_val); + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, + opt_recv->ccid3or_receive_rate); + } + break; } - return 0; + + return rc; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->hist = NULL; - setup_timer(&hctx->no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; + hctx->ccid3hctx_hist = NULL; + setup_timer(&hctx->ccid3hctx_no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); + return 0; } @@ -520,36 +584,42 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->hist); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); + sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + + tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; + struct ccid3_hc_tx_sock *hctx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hctx = ccid3_hc_tx_sk(sk); + info->tcpi_rto = hctx->ccid3hctx_t_rto; + info->tcpi_rtt = hctx->ccid3hctx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; + const struct ccid3_hc_tx_sock *hctx; const void *val; + /* Listen socks doesn't 
have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) + if (len < sizeof(hctx->ccid3hctx_tfrc)) return -EINVAL; - tfrc.tfrctx_x = hctx->x; - tfrc.tfrctx_x_recv = hctx->x_recv; - tfrc.tfrctx_x_calc = hctx->x_calc; - tfrc.tfrctx_rtt = hctx->rtt; - tfrc.tfrctx_p = hctx->p; - tfrc.tfrctx_rto = hctx->t_rto; - tfrc.tfrctx_ipi = hctx->t_ipi; - len = sizeof(tfrc); - val = &tfrc; + len = sizeof(hctx->ccid3hctx_tfrc); + val = &hctx->ccid3hctx_tfrc; break; default: return -ENOPROTOOPT; @@ -564,82 +634,112 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ + +/* CCID3 feedback types */ +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE +}; + +#ifdef CONFIG_IP_DCCP_CCID3_DEBUG +static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) +{ + static char *ccid3_rx_state_names[] = { + [TFRC_RSTATE_NO_DATA] = "NO_DATA", + [TFRC_RSTATE_DATA] = "DATA", + [TFRC_RSTATE_TERM] = "TERM", + }; + + return ccid3_rx_state_names[state]; +} +#endif + +static void ccid3_hc_rx_set_state(struct sock *sk, + enum ccid3_hc_rx_states state) +{ + struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + + ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", + dccp_role(sk), sk, ccid3_rx_state_name(oldstate), + ccid3_rx_state_name(state)); + WARN_ON(state == oldstate); + hcrx->ccid3hcrx_state = state; +} + static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + ktime_t now; + s64 delta = 0; + + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + return; + + now = ktime_get_real(); switch (fbtype) { case CCID3_FBACK_INITIAL: - hcrx->x_recv = 0; - hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ + hcrx->ccid3hcrx_x_recv = 0; + hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { - /* - * rfc3448bis-06, 6.3.1: First packet(s) lost or marked - * FIXME: in rfc3448bis the receiver returns X_recv=0 - * here as it normally would in the first feedback packet. - * However this is not possible yet, since the code still - * uses RFC 3448, i.e. - * If (p > 0) - * Calculate X_calc using the TCP throughput equation. - * X = max(min(X_calc, 2*X_recv), s/t_mbi); - * would bring X down to s/t_mbi. That is why we return - * X_recv according to rfc3448bis-06 for the moment. - */ - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist); - - hcrx->x_recv = scaled_div32(s, 2 * rtt); - break; - } /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * always check whether at least RTT time units were covered. + * need to reuse the previous value of X_recv. However, when + * X_recv was 0 (due to early loss), this would kill X down to + * s/t_mbi (i.e. one packet in 64 seconds). + * To avoid such drastic reduction, we approximate X_recv as + * the number of bytes since last feedback. + * This is a safe fallback, since X is bounded above by X_calc. 
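 * As a rough illustration: with s = 1460 bytes and t_mbi = 64 seconds, the s/t_mbi floor would be only ~23 bytes/sec, whereas bytes_recv/delta reflects the rate actually observed since the last feedback packet.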
*/ - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - break; + if (hcrx->ccid3hcrx_x_recv > 0) + break; + /* fall through */ case CCID3_FBACK_PERIODIC: - /* - * Step (2) of rfc3448bis-06, 6.2: - * - if no data packets have been received, just restart timer - * - if data packets have been received, re-compute X_recv - */ - if (hcrx->hist.bytes_recvd == 0) - goto prepare_for_next_time; - hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + if (delta <= 0) + DCCP_BUG("delta (%ld) <= 0", (long)delta); + else + hcrx->ccid3hcrx_x_recv = + scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); break; default: return; } - ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, + hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); - dccp_sk(sk)->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); + hcrx->ccid3hcrx_tstamp_last_feedback = now; + hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->ccid3hcrx_bytes_recv = 0; -prepare_for_next_time: - tfrc_rx_hist_restart_byte_counter(&hcrx->hist); - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->feedback = fbtype; + dp->dccps_hc_rx_insert_options = 1; + dccp_send_ack(sk); } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; + hcrx = ccid3_hc_rx_sk(sk); + if (dccp_packet_without_ack(skb)) return 0; - x_recv = htonl(hcrx->x_recv); - pinv = htonl(hcrx->p_inverse); + x_recv = htonl(hcrx->ccid3hcrx_x_recv); + pinv = htonl(hcrx->ccid3hcrx_pinv); if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || @@ -662,95 +762,171 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; + u32 x_recv, p, delta; u64 fval; - /* - * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p - * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p - * is about 20.64%. This yields an interval length of 4.84 (rounded up). - */ - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) - return 5; + if (hcrx->ccid3hcrx_rtt == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + } - x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); - if (x_recv == 0) - goto failed; + delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) { /* would also trigger divide-by-zero */ + DCCP_WARN("X_recv==0\n"); + if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + DCCP_BUG("stored value of X_recv is zero"); + return ~0U; + } + } - fval = scaled_div32(scaled_div(s, rtt), x_recv); + fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - if (p > 0) - return scaled_div(1, p); -failed: - return UINT_MAX; + return p == 0 ? 
~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); + if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + do_feedback = CCID3_FBACK_INITIAL; + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); + hcrx->ccid3hcrx_s = payload; + /* + * Not necessary to update ccid3hcrx_bytes_recv here, + * since X_recv = 0 for the first feedback packet (cf. + * RFC 3448, 6.3) -- gerrit + */ + } + goto update_records; + } + + if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + return; /* done receiving */ + + if (is_data_packet) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + /* + * Update moving-average of s and the sum of received payload bytes + */ + hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); + hcrx->ccid3hcrx_bytes_recv += payload; + } + /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); + if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + + if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) + return; /* done receiving */ + /* - * Feedback for first non-empty data packet (RFC 3448, 6.3) + * Handle data packets: RTT sampling and monitoring p */ - else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); + if (unlikely(!is_data_packet)) + goto update_records; + + if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + /* + * Empty loss history: no loss so far, hence p stays 0. + * Sample RTT values, since an RTT estimate is required for the + * computation of p when the first loss occurs; RFC 3448, 6.3.1. + */ + if (sample != 0) + hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); + + } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + /* + * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean + * has decreased (resp. p has increased), send feedback now. 
+ */ + do_feedback = CCID3_FBACK_PARAM_CHANGE; + } + /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) - ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + do_feedback = CCID3_FBACK_PERIODIC; + +update_records: + tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); + +done_receiving: + if (do_feedback) + ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - tfrc_lh_init(&hcrx->li_hist); - return tfrc_rx_hist_init(&hcrx->hist, sk); + hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); + return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - tfrc_rx_hist_purge(&hcrx->hist); - tfrc_lh_cleanup(&hcrx->li_hist); + ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); + + tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); + tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { + const struct ccid3_hc_rx_sock *hcrx; + + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return; + + hcrx = ccid3_hc_rx_sk(sk); + info->tcpi_ca_state = hcrx->ccid3hcrx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); + info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + const struct ccid3_hc_rx_sock *hcrx; struct tfrc_rx_info rx_info; const void *val; + /* Listen socks doesn't have a private CCID block */ + if (sk->sk_state == DCCP_LISTEN) + return -EINVAL; + + hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; - rx_info.tfrcrx_x_recv = hcrx->x_recv; - rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); + rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; + rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; + rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : + scaled_div(1, hcrx->ccid3hcrx_pinv); len = sizeof(rx_info); val = &rx_info; break; @@ -786,9 +962,6 @@ static struct ccid_operations ccid3 = { .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; -module_param(do_osc_prev, bool, 0644); -MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); - #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); @@ -796,19 +969,6 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { - struct timespec tp; - - /* - * Without a fine-grained clock resolution, RTTs/X_recv are not sampled - * correctly and feedback is sent either too early or too late. 
- */ - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { - printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" - " resolution - check your clocksource.\n", __func__, - tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); - return -ESOCKTNOSUPPORT; - } return ccid_register(&ccid3); } module_init(ccid3_module_init); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index af6e1bf937d..49ca32bd7e7 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,22 +47,11 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ -#define TFRC_T_MBI (64 * USEC_PER_SEC) +/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ +#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) -/* - * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. -#endif +/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ +#define TFRC_T_MBI 64 enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, @@ -70,43 +59,62 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; +struct ccid3_options_received { + u64 ccid3or_seqno:48, + ccid3or_loss_intervals_idx:16; + u16 ccid3or_loss_intervals_len; + u32 ccid3or_loss_event_rate; + u32 ccid3or_receive_rate; +}; + +/* TFRC sender states */ +enum ccid3_hc_tx_states { + TFRC_SSTATE_NO_SENT = 1, + TFRC_SSTATE_NO_FBACK, + TFRC_SSTATE_FBACK, + TFRC_SSTATE_TERM, +}; + /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * - * @x - Current sending rate in 64 * bytes per second - * @x_recv - Receive rate in 64 * bytes per second - * @x_calc - Calculated rate in bytes per second - * @rtt - Estimate of current round trip time in usecs - * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) - * @p - Current loss event rate (0-1) scaled by 1000000 - * @s - Packet size in bytes - * @t_rto - Nofeedback Timer setting in usecs - * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @feedback - Whether feedback has been received or not - * @last_win_count - Last window counter sent - * @t_last_win_count - Timestamp of earliest packet with - * last_win_count value sent - * @no_feedback_timer - Handle to no feedback timer - * @t_ld - Time last doubled during slow start - * @t_nom - Nominal send time of next packet - * @hist - Packet history + * @ccid3hctx_x - Current sending rate in 64 * bytes per second + * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second + * @ccid3hctx_x_calc - Calculated rate in bytes per second + * @ccid3hctx_rtt - Estimate of current round trip time in usecs + * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 + * @ccid3hctx_s - Packet size in bytes + * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs + * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states + * @ccid3hctx_last_win_count - Last window counter 
sent + * @ccid3hctx_t_last_win_count - Timestamp of earliest packet + * with last_win_count value sent + * @ccid3hctx_no_feedback_timer - Handle to no feedback timer + * @ccid3hctx_t_ld - Time last doubled during slow start + * @ccid3hctx_t_nom - Nominal send time of next packet + * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs + * @ccid3hctx_hist - Packet history + * @ccid3hctx_options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - u64 x; - u64 x_recv; - u32 x_calc; - u32 rtt; - u16 r_sqmean; - u32 p; - u32 t_rto; - u32 t_ipi; - u16 s; - bool feedback:1; - u8 last_win_count; - ktime_t t_last_win_count; - struct timer_list no_feedback_timer; - ktime_t t_ld; - ktime_t t_nom; - struct tfrc_tx_hist_entry *hist; + struct tfrc_tx_info ccid3hctx_tfrc; +#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x +#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv +#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc +#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt +#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p +#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto +#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi + u16 ccid3hctx_s; + enum ccid3_hc_tx_states ccid3hctx_state:8; + u8 ccid3hctx_last_win_count; + ktime_t ccid3hctx_t_last_win_count; + struct timer_list ccid3hctx_no_feedback_timer; + ktime_t ccid3hctx_t_ld; + ktime_t ccid3hctx_t_nom; + u32 ccid3hctx_delta; + struct tfrc_tx_hist_entry *ccid3hctx_hist; + struct ccid3_options_received ccid3hctx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) @@ -116,32 +124,41 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) return hctx; } - -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE +/* TFRC receiver states */ +enum ccid3_hc_rx_states { + TFRC_RSTATE_NO_DATA = 1, + TFRC_RSTATE_DATA, + TFRC_RSTATE_TERM = 127, }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @last_counter - Tracks window counter (RFC 4342, 8.1) - * @feedback - The type of the feedback last sent - * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @tstamp_last_feedback - Time at which last feedback was sent - * @hist - Packet history (loss detection + RTT sampling) - * @li_hist - Loss Interval database - * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) + * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) + * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) + * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) + * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states + * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes + * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @ccid3hcrx_rtt - Receiver estimate of RTT + * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent + * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent + * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) + * @ccid3hcrx_li_hist - Loss Interval database + * @ccid3hcrx_s - Received packet size in bytes + * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 
8.5) */ struct ccid3_hc_rx_sock { - u8 last_counter:4; - enum ccid3_fback_type feedback:4; - u32 x_recv; - ktime_t tstamp_last_feedback; - struct tfrc_rx_hist hist; - struct tfrc_loss_hist li_hist; -#define p_inverse li_hist.i_mean + u8 ccid3hcrx_last_counter:4; + enum ccid3_hc_rx_states ccid3hcrx_state:8; + u32 ccid3hcrx_bytes_recv; + u32 ccid3hcrx_x_recv; + u32 ccid3hcrx_rtt; + ktime_t ccid3hcrx_tstamp_last_feedback; + struct tfrc_rx_hist ccid3hcrx_hist; + struct tfrc_loss_hist ccid3hcrx_li_hist; + u16 ccid3hcrx_s; +#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b1ae8f8259e..5b3ce0688c5 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -86,26 +86,21 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * This updates I_mean as the sequence numbers increase. As a consequence, the - * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) - * decreases, and thus there is no need to send renewed feedback. + * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev */ -void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); + u32 old_i_mean = lh->i_mean; s64 len; if (cur == NULL) /* not initialised */ - return; - - /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ - if (!dccp_data_packet(skb)) - return; + return 0; len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return; + return 0; if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) /* @@ -119,11 +114,14 @@ void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_is_closed = 1; if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return; + return 0; cur->li_length = len; tfrc_lh_calc_i_mean(lh); + + return (lh->i_mean < old_i_mean); } +EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, @@ -140,18 +138,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @sk: Used by @calc_first_li in caller-specific way (subtyping) * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
*/ -bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return false; + return 0; new = tfrc_lh_demand_next(lh); if (unlikely(new == NULL)) { DCCP_CRIT("Cannot allocate/add loss record."); - return false; + return 0; } new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; @@ -169,7 +167,7 @@ bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, tfrc_lh_calc_i_mean(lh); } - return true; + return 1; } EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index d08a226db43..246018a3b26 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -67,9 +67,9 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) struct tfrc_rx_hist; -extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, +extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); -extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index cce9f03bda3..6cc108afdc3 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -40,6 +40,18 @@ #include "packet_history.h" #include "../../dccp.h" +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + /* * Transmitter History Routines */ @@ -61,6 +73,15 @@ void tfrc_tx_packet_history_exit(void) } } +static struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + + return head; +} + int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -90,6 +111,25 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); +u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, + const ktime_t now) +{ + u32 rtt = 0; + struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); + + if (packet != NULL) { + rtt = ktime_us_delta(now, packet->stamp); + /* + * Garbage-collect older (irrelevant) entries: + */ + tfrc_tx_hist_purge(&packet->next); + } + + return rtt; +} +EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); + + /* * Receiver History Routines */ @@ -151,31 +191,14 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); - -static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - struct tfrc_rx_hist_entry *tmp = h->ring[a]; - - h->ring[a] = h->ring[b]; - h->ring[b] = tmp; -} - static void 
tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), - tfrc_rx_hist_index(h, b)); -} + const u8 idx_a = tfrc_rx_hist_index(h, a), + idx_b = tfrc_rx_hist_index(h, b); + struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; -/** - * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling - * This is called after loss detection has finished, when the history entry - * with the index of `loss_count' holds the highest-received sequence number. - * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). - */ -static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) -{ - __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); - h->loss_count = h->loss_start = 0; + h->ring[idx_a] = h->ring[idx_b]; + h->ring[idx_b] = tmp; } /* @@ -192,8 +215,10 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; - if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ + if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ h->loss_count = 1; + tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); + } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) @@ -215,7 +240,8 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_count = 0; + h->loss_start = tfrc_rx_hist_index(h, 1); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); @@ -268,7 +294,8 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 2); + h->loss_count = 0; } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); @@ -312,7 +339,8 @@ static void __three_after_loss(struct tfrc_rx_hist *h) if (dccp_loss_free(s2, s3, n3)) { /* no gap between S2 and S3: entire hole is filled */ - tfrc_rx_hist_resume_rtt_sampling(h); + h->loss_start = tfrc_rx_hist_index(h, 3); + h->loss_count = 0; } else { /* gap between S2 and S3 */ h->loss_start = tfrc_rx_hist_index(h, 2); @@ -326,13 +354,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) } /** - * tfrc_rx_congestion_event - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * tfrc_rx_handle_loss - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @calc_first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. 
@@ -340,20 +368,15 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * records accordingly, the caller should not perform any more RX history * operations when loss_count is greater than 0 after calling this function. */ -bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *), struct sock *sk) +int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { - bool new_event = false; - - if (tfrc_rx_hist_duplicate(h, skb)) - return 0; + int is_new_loss = 0; if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); - tfrc_rx_hist_sample_rtt(h, skb); - tfrc_rx_hist_add_packet(h, skb, ndp); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -362,57 +385,34 @@ bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, /* * Update Loss Interval database and recycle RX records */ - new_event = tfrc_lh_interval_add(lh, h, first_li, sk); + is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); __three_after_loss(h); } - - /* - * Update moving-average of `s' and the sum of received payload bytes. - */ - if (dccp_data_packet(skb)) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - - h->packet_size = tfrc_ewma(h->packet_size, payload, 9); - h->bytes_recvd += payload; - } - - /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ - if (!new_event) - tfrc_lh_update_i_mean(lh, skb); - - return new_event; + return is_new_loss; } -EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); +EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); -/* Compute the sending rate X_recv measured between feedback intervals */ -u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) +int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { - u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; - s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); - - WARN_ON(delta <= 0); - /* - * Ensure that the sampling interval for X_recv is at least one RTT, - * by extending the sampling interval backwards in time, over the last - * R_(m-1) seconds, as per rfc3448bis-06, 6.2. - * To reduce noise (e.g. when the RTT changes often), this is only - * done when delta is smaller than RTT/2. - */ - if (last_x_recv > 0 && delta < last_rtt/2) { - tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", - (long)delta, (unsigned)last_rtt); + int i; - delta = (bytes ? 
delta : 0) + last_rtt; - bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) + goto out_free; } - if (unlikely(bytes == 0)) { - DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); - return last_x_recv; + h->loss_count = h->loss_start = 0; + return 0; + +out_free: + while (i-- != 0) { + kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); + h->ring[i] = NULL; } - return scaled_div32(bytes, delta); + return -ENOBUFS; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); +EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { @@ -426,81 +426,73 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); -static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +/** + * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) { - int i; - - memset(h, 0, sizeof(*h)); - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) { - tfrc_rx_hist_purge(h); - return -ENOBUFS; - } - } - return 0; + return h->ring[0]; } -int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) +/** + * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry + */ +static inline struct tfrc_rx_hist_entry * + tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) { - if (tfrc_rx_hist_alloc(h)) - return -ENOBUFS; - /* - * Initialise first entry with GSR to start loss detection as early as - * possible. Code using this must not use any other fields. The entry - * will be overwritten once the CCID updates its received packets. - */ - tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; - return 0; + return h->ring[h->rtt_sample_prev]; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss - * is pending and uses the following history entries (via rtt_sample_prev): - * - h->ring[0] contains the most recent history entry prior to @skb; - * - h->ring[1] is an unused `dummy' entry when the current difference is 0; + * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able + * to compute a sample with given data - calling function should check this. */ -void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - struct tfrc_rx_hist_entry *last = h->ring[0]; - u32 sample, delta_v; - - /* - * When not to sample: - * - on non-data packets - * (RFC 4342, 8.1: CCVal only fully defined for data packets); - * - when no data packets have been received yet - * (FIXME: using sampled packet size as indicator here); - * - as long as there are gaps in the sequence space (pending loss). 
- */ - if (!dccp_data_packet(skb) || h->packet_size == 0 || - tfrc_rx_hist_loss_pending(h)) - return; + u32 sample = 0, + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + + if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ + if (h->rtt_sample_prev == 2) { /* previous candidate stored */ + sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + if (sample) + sample = 4 / sample * + ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); + else /* + * FIXME: This condition is in principle not + * possible but occurs when CCID is used for + * two-way data traffic. I have tried to trace + * it, but the cause does not seem to be here. + */ + DCCP_BUG("please report to dccp@vger.kernel.org" + " => prev = %u, last = %u", + tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + } else if (delta_v < 1) { + h->rtt_sample_prev = 1; + goto keep_ref_for_next_time; + } - h->rtt_sample_prev = 0; /* reset previous candidate */ + } else if (delta_v == 4) /* optimal match */ + sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); + else { /* suboptimal match */ + h->rtt_sample_prev = 2; + goto keep_ref_for_next_time; + } - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); - if (delta_v == 0) { /* less than RTT/4 difference */ - h->rtt_sample_prev = 1; - return; + if (unlikely(sample > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %u too large, using max\n", sample); + sample = DCCP_SANE_RTT_MAX; } - sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); - if (delta_v <= 4) /* between RTT/4 and RTT */ - sample *= 4 / delta_v; - else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) - /* - * Optimisation: CCVal difference is greater than 1 RTT, yet the - * sample is less than the local RTT estimate; which means that - * the RTT estimate is too high. - * To avoid noise, it is not done if the sample is below RTT/2. 
- */ - return; + h->rtt_sample_prev = 0; /* use current entry as next reference */ +keep_ref_for_next_time: - /* Use a lower weight than usual to increase responsiveness */ - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); + return sample; } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 555e65cd73a..461cc91cce8 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,28 +40,12 @@ #include #include "tfrc.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} +struct tfrc_tx_hist_entry; extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); +extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, + const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) @@ -91,22 +75,12 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - * @rtt_estimate: Receiver RTT estimate - * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) - * @bytes_recvd: Number of bytes received since @bytes_start - * @bytes_start: Start time for counting @bytes_recvd */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; - /* Receiver RTT sampling */ #define rtt_sample_prev loss_start - u32 rtt_estimate; - /* Receiver sampling of application payload lengths */ - u32 packet_size, - bytes_recvd; - ktime_t bytes_start; }; /** @@ -150,50 +124,20 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) return h->loss_count > 0; } -/* - * Accessor functions to retrieve parameters sampled by the RX history - */ -static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) -{ - if (h->packet_size == 0) { - DCCP_WARN("No sample for s, using fallback\n"); - return TCP_MIN_RCVMSS; - } - return h->packet_size; - -} -static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) -{ - if (h->rtt_estimate == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - return DCCP_FALLBACK_RTT; - } - return h->rtt_estimate; -} - -static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) -{ - h->bytes_recvd = 0; - h->bytes_start = ktime_get_real(); -} - -extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); - - extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); struct tfrc_loss_hist; -extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), - struct sock *sk); -extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, - const struct sk_buff 
*skb); -extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); +extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); +extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); +extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ede12f53de5..ed9857527ac 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -47,21 +47,6 @@ static inline u32 scaled_div32(u64 a, u64 b) return result; } -/** - * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 - * Uses scaling to improve accuracy of the integer approximation of sqrt(). The - * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for - * clamped RTT samples (dccp_sample_rtt). - * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the - * scaling factor is neutralised. For this purpose, it avoids returning zero. - */ -static inline u16 tfrc_scaled_sqrt(const u32 sample) -{ - const unsigned long non_zero_sample = sample ? : 1; - - return int_sqrt(non_zero_sample << 10); -} - /** * tfrc_ewma - Exponentially weighted moving average * @weight: Weight to be used as damping factor, in units of 1/10 @@ -73,7 +58,6 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 38239c4d5e1..2f20a29cffe 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -632,16 +632,8 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - /* - * In the congestion-avoidance phase p decays towards 0 - * when there are no further losses, so this case is - * natural. Truncating to p_min = 0.01% means that the - * maximum achievable throughput is limited to about - * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. - * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. - */ - tfrc_pr_debug("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); + DCCP_WARN("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; @@ -666,6 +658,7 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) result = scaled_div(s, R); return scaled_div32(result, f); } + EXPORT_SYMBOL_GPL(tfrc_calc_x); /** @@ -700,19 +693,5 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } -EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. 
- */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} -EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); +EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 5281190aa19..b4bc6e095a0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -42,11 +42,9 @@ extern int dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) #endif extern struct inet_hashinfo dccp_hashinfo; @@ -63,14 +61,11 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ @@ -86,13 +81,10 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); */ #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) -/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 
13.3) */ -#define DCCP_TIME_RESOLUTION 10 - /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ -#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) +#define DCCP_SANE_RTT_MIN 100 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) @@ -103,6 +95,12 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; +extern int sysctl_dccp_feat_sequence_window; +extern int sysctl_dccp_feat_rx_ccid; +extern int sysctl_dccp_feat_tx_ccid; +extern int sysctl_dccp_feat_ack_ratio; +extern int sysctl_dccp_feat_send_ack_vector; +extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -237,22 +235,8 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -/* - * TX Packet Dequeueing Interface - */ -extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -extern bool dccp_qpolicy_full(struct sock *sk); -extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); -extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -extern void dccp_write_xmit(struct sock *sk); +extern void dccp_write_xmit(struct sock *sk, int block); extern void dccp_write_space(struct sock *sk); -extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) @@ -268,8 +252,7 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); +extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); @@ -334,14 +317,7 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); - -static inline u32 dccp_sane_rtt(long usec_sample) -{ - if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) - DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); - return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); -} -extern u32 dccp_sample_rtt(struct sock *sk, long delta); +extern u32 dccp_sample_rtt(struct sock *sk, long delta); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) @@ -435,62 +411,36 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); + const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. 
In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). - * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); + dccp_set_seqno(&dp->dccps_swl, + dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); + dccp_set_seqno(&dp->dccps_swh, + dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); + dp->dccps_awh = dp->dccps_gss = seq; + dccp_set_seqno(&dp->dccps_awl, + (dp->dccps_gss - + dccp_msk(sk)->dccpms_sequence_window + 1)); } static inline int dccp_ack_pending(const struct sock *sk) { - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); + const struct dccp_sock *dp = dccp_sk(sk); + return dp->dccps_timestamp_echo != 0 || +#ifdef CONFIG_IP_DCCP_ACKVEC + (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || +#endif + inet_csk_ack_scheduled(sk); } -extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -extern int dccp_feat_finalise_settings(struct dccp_sock *dp); -extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -extern void dccp_feat_list_purge(struct list_head *fn_list); - extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 93aae7c9555..d8a3509b26f 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dp->dccps_hc_rx_ackvec != NULL) + if (dccp_msk(sk)->dccpms_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f94c7c9d1a7..933a0ecf8d4 100644 --- 
a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1,19 +1,11 @@ /* * net/dccp/feat.c * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 The University of Aberdeen, Scotland, UK - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * + * An implementation of the DCCP protocol + * Andrea Bittau * * ASSUMPTIONS * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -23,1510 +15,635 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + #include + #include "ccid.h" #include "feat.h" -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; +#define DCCP_FEAT_SP_NOAGREE (-123) -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. - */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) +int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp) { - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); + struct dccp_opt_pend *opt; - if (new_ccid == NULL) - return -ENOMEM; + dccp_feat_debug(type, feature, *val); - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; + if (len > 3) { + DCCP_WARN("invalid length %d\n", len); + return -EINVAL; + } + /* XXX add further sanity checks */ + + /* check if that feature is already being negotiated */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* ok we found a negotiation for this option already */ + if (opt->dccpop_feat == feature && opt->dccpop_type == type) { + dccp_pr_debug("Replacing old\n"); + /* replace */ + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); + opt->dccpop_val = val; + opt->dccpop_len = len; + opt->dccpop_conf = 0; + return 0; + } } - return 0; -} -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); + /* negotiation for a new feature */ + opt = kmalloc(sizeof(*opt), gfp); + if (opt == NULL) + return -ENOMEM; - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} + opt->dccpop_type = type; + opt->dccpop_feat = feature; + opt->dccpop_len = len; + opt->dccpop_val = val; + opt->dccpop_conf = 0; + opt->dccpop_sc = NULL; -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ -#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ - /* - * FIXME: This is required until several 
problems in the CCID-2 code are - * resolved. The CCID-2 code currently does not cope well; using dynamic - * Ack Ratios greater than 1 caused instabilities. These were manifest - * in hangups and long RTO timeouts (1...3 seconds). Until this has been - * stabilised, it is safer not to activate dynamic Ack Ratio changes. - */ - dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", - rx ? "RX" : "TX", (u16)ratio); - ratio = 1; -#endif - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; + BUG_ON(opt->dccpop_val == NULL); + + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); return 0; } -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +EXPORT_SYMBOL_GPL(dccp_feat_change); + +static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + /* figure out if we are changing our CCID or the peer's */ + const int rx = type == DCCPO_CHANGE_R; + const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; + struct ccid *new_ccid; + + /* Check if nothing is being changed. */ + if (ccid_nr == new_ccid_nr) + return 0; + + new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); + if (new_ccid == NULL) + return -ENOMEM; if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + dp->dccps_hc_rx_ccid = new_ccid; + dmsk->dccpms_rx_ccid = new_ccid_nr; + } else { + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = new_ccid; + dmsk->dccpms_tx_ccid = new_ccid_nr; } - return 0; -} -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. 
- */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) { - struct dccp_sock *dp = dccp_sk(sk); + dccp_feat_debug(type, feat, val); - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); + switch (feat) { + case DCCPF_CCID: + return dccp_feat_update_ccid(sk, type, val); + default: + dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", + dccp_feat_typename(type), feat); + break; } return 0; } -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. | Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) +static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, + u8 *rpref, u8 rlen) { - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; + struct dccp_sock *dp = dccp_sk(sk); + u8 *spref, slen, *res = NULL; + int i, j, rc, agree = 1; + BUG_ON(rpref == NULL); + + /* check if we are the black sheep */ + if (dp->dccps_role == DCCP_ROLE_CLIENT) { + spref = rpref; + slen = rlen; + rpref = opt->dccpop_val; + rlen = opt->dccpop_len; + } else { + spref = opt->dccpop_val; + slen = opt->dccpop_len; + } /* - * Other features: add cases for new feature types here after adding - * them to the above table. 
+ * Now we have server preference list in spref and client preference in + * rpref */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} + BUG_ON(spref == NULL); + BUG_ON(rpref == NULL); -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); + /* FIXME sanity check vals */ - return idx < 0 ? : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", - "UNSTABLE", "STABLE" }; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; + /* Are values in any order? XXX Lame "algorithm" here */ + for (i = 0; i < slen; i++) { + for (j = 0; j < rlen; j++) { + if (spref[i] == rpref[j]) { + res = &spref[i]; + break; + } + } + if (res) + break; } - return NULL; -} -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} + /* we didn't agree on anything */ + if (res == NULL) { + /* confirm previous value */ + switch (opt->dccpop_feat) { + case DCCPF_CCID: + /* XXX did i get this right? =P */ + if (opt->dccpop_type == DCCPO_CHANGE_L) + res = &dccp_msk(sk)->dccpms_tx_ccid; + else + res = &dccp_msk(sk)->dccpms_rx_ccid; + break; -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? 
"(Confirm pending)" : ""); -} + default: + DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); + /* XXX implement res */ + return -EFAULT; + } -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif + dccp_pr_debug("Don't agree... reconfirming %d\n", *res); + agree = 0; /* this is used for mandatory options... */ + } -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; + /* need to put result and our preference list */ + rlen = 1 + opt->dccpop_len; + rpref = kmalloc(rlen, GFP_ATOMIC); + if (rpref == NULL) + return -ENOMEM; - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; + *rpref = *res; + memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; + /* put it in the "confirm queue" */ + if (opt->dccpop_sc == NULL) { + opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); + if (opt->dccpop_sc == NULL) { + kfree(rpref); + return -ENOMEM; } } else { - val = fval->nn; + /* recycle the confirm slot */ + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + dccp_pr_debug("recycling confirm slot\n"); + } + memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); + + opt->dccpop_sc->dccpoc_val = rpref; + opt->dccpop_sc->dccpoc_len = rlen; + + /* update the option on our side [we are about to send the confirm] */ + rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); + if (rc) { + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + return rc; } - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? "" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * For general use this function is preferable over __dccp_feat_activate(). 
- */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} + dccp_pr_debug("Will confirm %d\n", *rpref); -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOBUFS; - } + /* say we want to change to X but we just got a confirm X, suppress our + * change + */ + if (!opt->dccpop_conf) { + if (*opt->dccpop_val == *res) + opt->dccpop_conf = 1; + dccp_pr_debug("won't ask for change of same feature\n"); } - return 0; -} -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); + return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ } -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) +static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt; + int rc = 1; + u8 t; - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; + /* + * We received a CHANGE. We gotta match it against our own preference + * list. If we got a CHANGE_R it means it's a change for us, so we need + * to compare our CHANGE_L list. + */ + if (type == DCCPO_CHANGE_L) + t = DCCPO_CHANGE_R; + else + t = DCCPO_CHANGE_L; - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} + /* find our preference list for this feature */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_type != t || opt->dccpop_feat != feature) + continue; -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); + /* find the winner from the two preference lists */ + rc = dccp_feat_reconcile(sk, opt, val, len); + break; } -} -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; + /* We didn't deal with the change. This can happen if we have no + * preference list for the feature. In fact, it just shouldn't + * happen---if we understand a feature, we should have a preference list + * with at least the default value. 
+ */ + BUG_ON(rc == 1); - list_for_each_entry(entry, fn_list, node) - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - return NULL; + return rc; } -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * This is the only constructor and serves to ensure the above invariants. - */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) +static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + u8 *copy; + int rc; - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); + /* NN features must be Change L (sec. 6.3.2) */ + if (type != DCCPO_CHANGE_L) { + dccp_pr_debug("received %s for NN feature %d\n", + dccp_feat_typename(type), feature); + return -EFAULT; } - return entry; -} -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @needs_mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* XXX sanity check opt val */ - if (new == NULL) + /* copy option so we can confirm it */ + opt = kzalloc(sizeof(*opt), GFP_ATOMIC); + if (opt == NULL) return -ENOMEM; - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = 0; - new->empty_confirm = 0; - new->val = *fval; - new->needs_mandatory = mandatory; + copy = kmemdup(val, len, GFP_ATOMIC); + if (copy == NULL) { + kfree(opt); + return -ENOMEM; + } - return 0; -} + opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ + opt->dccpop_feat = feature; + opt->dccpop_val = copy; + opt->dccpop_len = len; -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * Returns 0 on success, a Reset code for further processing otherwise. 
- */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + /* change feature */ + rc = dccp_feat_update(sk, type, feature, *val); + if (rc) { + kfree(opt->dccpop_val); + kfree(opt); + return rc; + } - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; + dccp_feat_debug(type, feature, *copy); - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = 1; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = 0; + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); return 0; } -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) +static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, + u8 type, u8 feature) { - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} + /* XXX check if other confirms for that are queued and recycle slot */ + struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); + if (opt == NULL) { + /* XXX what do we do? Ignoring should be fine. It's a change + * after all =P + */ + return; } - return 0; -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} + switch (type) { + case DCCPO_CHANGE_L: + opt->dccpop_type = DCCPO_CONFIRM_R; + break; + case DCCPO_CHANGE_R: + opt->dccpop_type = DCCPO_CONFIRM_L; + break; + default: + DCCP_WARN("invalid type %d\n", type); + kfree(opt); + return; + } + opt->dccpop_feat = feature; + opt->dccpop_val = NULL; + opt->dccpop_len = 0; -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. 
- */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} + /* change feature */ + dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ + list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); } -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) +static void dccp_feat_flush_confirm(struct sock *sk) { - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} + struct dccp_minisock *dmsk = dccp_msk(sk); + /* Check if there is anything to confirm in the first place */ + int yes = !list_empty(&dmsk->dccpms_conf); -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} + if (!yes) { + struct dccp_opt_pend *opt; -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (opt->dccpop_conf) { + yes = 1; + break; } } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - /* - * Enter CHANGING after transmitting the Change option (6.6.2). 
- */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * Note that NN features are local by definition (RFC 4340, 6.3.2). - */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; + if (!yes) + return; - return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); + /* OK there is something to confirm... */ + /* XXX check if packet is in flight? Send delayed ack?? */ + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); } -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ -int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { - /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_NN) - return -EINVAL; - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); -} + int rc; -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * This function is used to communicate NN updates out-of-band. 
The difference - * to feature negotiation during connection setup is that values are activated - * immediately after validation, i.e. we don't wait for the Confirm: either the - * value is accepted by the peer (and then the waiting is futile), or it is not - * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values - * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). - */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; + dccp_feat_debug(type, feature, *val); - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; + /* figure out if it's SP or NN feature */ + switch (feature) { + /* deal with SP features */ + case DCCPF_CCID: + rc = dccp_feat_sp(sk, type, feature, val, len); + break; - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; + /* deal with NN features */ + case DCCPF_ACK_RATIO: + rc = dccp_feat_nn(sk, type, feature, val, len); + break; - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", - (unsigned long long)nn_val, - (unsigned long long)entry->val.nn, - dccp_feat_sname[entry->state]); - return 0; + /* XXX implement other features */ + default: + dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", + dccp_feat_typename(type), feature); + rc = -EFAULT; + break; } - if (dccp_feat_activate(sk, feat, 1, &fval)) - return -EADV; - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. + /* check if there were problems changing features */ + if (rc) { + /* If we don't agree on SP, we sent a confirm for old value. 
+ * However we propagate rc to caller in case option was + * mandatory */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. - */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; + if (rc != DCCP_FEAT_SP_NOAGREE) + dccp_feat_empty_confirm(dccp_msk(sk), type, feature); } -} -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. 
- */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; + /* generate the confirm [if required] */ + dccp_feat_flush_confirm(sk); - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; + return rc; } -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. - */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} +EXPORT_SYMBOL_GPL(dccp_feat_change_recv); -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len) { - u8 c, s; + u8 t; + struct dccp_opt_pend *opt; + struct dccp_minisock *dmsk = dccp_msk(sk); + int found = 0; + int all_confirmed = 1; - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} + dccp_feat_debug(type, feature, *val); -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; + /* locate our change request */ + switch (type) { + case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; + case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; + default: DCCP_WARN("invalid type %d\n", type); + return 1; - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; } - return does_occur; -} + /* XXX sanity check feature value */ -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fval: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). 
- */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + if (!opt->dccpop_conf && opt->dccpop_type == t && + opt->dccpop_feat == feature) { + found = 1; + dccp_pr_debug("feature %d found\n", opt->dccpop_feat); - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } + /* XXX do sanity check */ - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; + opt->dccpop_conf = 1; - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} + /* We got a confirmation---change the option */ + dccp_feat_update(sk, opt->dccpop_type, + opt->dccpop_feat, *val); -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. - */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; + /* XXX check the return value of dccp_feat_update */ + break; + } - return dccp_feat_push_confirm(fn, feat, local, &fval); + if (!opt->dccpop_conf) + all_confirmed = 0; } - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) + /* fix re-transmit timer */ + /* XXX gotta make sure that no option negotiation occurs during + * connection shutdown. Consider that the CLOSEREQ is sent and timer is + * on. if all options are confirmed it might kill timer which should + * remain alive until close is received. */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. 
- */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - return dccp_feat_push_confirm(fn, feat, local, &fval); - - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; + if (all_confirmed) { + dccp_pr_debug("clear feat negotiation timer %p\n", sk); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = 0; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = 1; - } - entry->needs_confirm = 1; - entry->needs_mandatory = 0; - entry->state = FEAT_STABLE; + if (!found) + dccp_pr_debug("%s(%d, ...) never requested\n", + dccp_feat_typename(type), feature); return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; } -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. 
- */ - dccp_feat_list_pop(entry); - return 0; - } +EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; +void dccp_feat_clean(struct dccp_minisock *dmsk) +{ + struct dccp_opt_pend *opt, *next; - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, + dccpop_node) { + BUG_ON(opt->dccpop_val == NULL); + kfree(opt->dccpop_val); - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } + if (opt->dccpop_sc != NULL) { + BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + } - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; + kfree(opt); } + INIT_LIST_HEAD(&dmsk->dccpms_pending); - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + BUG_ON(opt == NULL); + if (opt->dccpop_val != NULL) + kfree(opt->dccpop_val); + kfree(opt); } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; + INIT_LIST_HEAD(&dmsk->dccpms_conf); } -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. - * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). +EXPORT_SYMBOL_GPL(dccp_feat_clean); + +/* this is to be called only when a listening sock creates its child. It is + * assumed by the function---the confirm is not duplicated, but rather it is + * "passed on". 
*/ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) +int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; + struct dccp_minisock *olddmsk = dccp_msk(oldsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); + struct dccp_opt_pend *opt; + int rc = 0; - dccp_feat_print_opt(opt, feat, val, len, mandatory); + INIT_LIST_HEAD(&newdmsk->dccpms_pending); + INIT_LIST_HEAD(&newdmsk->dccpms_conf); - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; + list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { + struct dccp_opt_pend *newopt; + /* copy the value of the option */ + u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; + if (val == NULL) + goto out_clean; - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - if (fval.nn != entry->val.nn) { - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto fast_path_failed; + newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); + if (newopt == NULL) { + kfree(val); + goto out_clean; } - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); + /* insert the option */ + newopt->dccpop_val = val; + list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; + /* XXX what happens with backlogs and multiple connections at + * once... + */ + /* the master socket no longer needs to worry about confirms */ + opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ + + /* reset state for a new socket */ + opt->dccpop_conf = 0; } - return 0; -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); + /* XXX not doing anything about the conf queue */ + +out: + return rc; -fast_path_failed: - return mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; +out_clean: + dccp_feat_clean(newdmsk); + rc = -ENOMEM; + goto out; } -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) +EXPORT_SYMBOL_GPL(dccp_feat_clone); + +static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; + int rc = -ENOMEM; + u8 *copy = kmemdup(val, len, GFP_KERNEL); - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; /* fall through */ - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection - * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); + if (copy != NULL) { + rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); + if (rc) + kfree(copy); } - return 0; /* ignore FN options in all other states */ + return rc; } -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) +int dccp_feat_init(struct dccp_minisock *dmsk) { - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." 
*/ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + /* CCID L */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, + &dmsk->dccpms_tx_ccid, 1); if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) - return -ENOBUFS; - - /* Pre-load all CCID modules that are going to be advertised */ - rc = -EUNATCH; - if (ccid_request_modules(tx.val, tx.len)) - goto free_ccid_lists; - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + /* CCID R */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, + &dmsk->dccpms_rx_ccid, 1); if (rc) - goto free_ccid_lists; + goto out; - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); + /* Ack ratio */ + rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, + &dmsk->dccpms_ack_ratio, 1); +out: return rc; } -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. - */ - if (cur->empty_confirm) - continue; +EXPORT_SYMBOL_GPL(dccp_feat_init); - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; +#ifdef CONFIG_IP_DCCP_DEBUG +const char *dccp_feat_typename(const u8 type) +{ + switch(type) { + case DCCPO_CHANGE_L: return("ChangeL"); + case DCCPO_CONFIRM_L: return("ConfirmL"); + case DCCPO_CHANGE_R: return("ChangeR"); + case DCCPO_CONFIRM_R: return("ConfirmR"); + /* the following case must not appear in feature negotation */ + default: dccp_pr_debug("unknown type %d [BUG!]\n", type); } + return NULL; +} - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). 
- */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } +EXPORT_SYMBOL_GPL(dccp_feat_typename); - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); +const char *dccp_feat_name(const u8 feat) +{ + static const char *feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; - dccp_pr_debug("Activation OK\n"); - return 0; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; + return feature_names[feat]; } + +EXPORT_SYMBOL_GPL(dccp_feat_name); +#endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2217066e22d..e272222c7ac 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -3,134 +3,38 @@ /* * net/dccp/feat.h * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker + * An implementation of the DCCP protocol * Copyright (c) 2005 Andrea Bittau * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
*/ + #include #include "dccp.h" -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; +#ifdef CONFIG_IP_DCCP_DEBUG +extern const char *dccp_feat_typename(const u8 type); +extern const char *dccp_feat_name(const u8 feat); -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @feat_num: one of %dccp_feature_numbers - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - u8 feat_num; - dccp_feat_val val; - enum dccp_feat_state state:8; - bool needs_mandatory:1, - needs_confirm:1, - empty_confirm:1, - is_local:1; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) { - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? 
DCCPO_CHANGE_L : DCCPO_CHANGE_R; + dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), + dccp_feat_name(feat), feat, val); } +#else +#define dccp_feat_debug(type, feat, val) +#endif /* CONFIG_IP_DCCP_DEBUG */ + +extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, + u8 *val, u8 len, gfp_t gfp); +extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, + u8 *val, u8 len); +extern void dccp_feat_clean(struct dccp_minisock *dmsk); +extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_init(struct dccp_minisock *dmsk); -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -extern int dccp_feat_init(struct sock *sk); -extern void dccp_feat_initialise_sysctls(void); -extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); -extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. 
- */ -#define DCCP_OPTVAL_MAXLEN 6 - -extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); - -extern int dccp_insert_option_mandatory(struct sk_buff *skb); -extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index df0e6714aa1..779d0ed9ae9 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -159,15 +159,13 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) +static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; + struct dccp_sock *dp = dccp_sk(sk); - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); + if (dccp_msk(sk)->dccpms_send_ack_vector) + dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_ack_seq); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -366,13 +364,22 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { + struct dccp_sock *dp = dccp_sk(sk); + if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - dccp_handle_ackvec_processing(sk, skb); + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -414,33 +421,40 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ if (dccp_parse_options(sk, NULL, skb)) - return 1; + goto out_invalid_packet; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_update_gsr(sk, dp->dccps_isr); /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. 
+ * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + * + * AWL was adjusted in dccp_v4_connect -acme */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; + dccp_set_seqno(&dp->dccps_swl, + max48(dp->dccps_swl, dp->dccps_isr)); dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -461,15 +475,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. - */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -504,16 +509,6 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -595,6 +590,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; + + /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) @@ -603,35 +600,29 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; } - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; + if (sk->sk_state != DCCP_REQUESTING) { + if (dccp_check_seqno(sk, skb)) + goto discard; - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } + /* + * Step 8: Process options and mark acknowledgeable + */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; + if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + dccp_deliver_input_to_ccids(sk, skb); + } /* * Step 9: Process Reset @@ -640,22 
+631,44 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return - */ + */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && + dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { + case DCCP_CLOSED: + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; + case DCCP_REQUESTING: + /* FIXME: do congestion control initialization */ + queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; @@ -663,12 +676,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, __kfree_skb(skb); return 0; - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - /* fall through */ case DCCP_RESPOND: + case DCCP_PARTOPEN: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; @@ -707,7 +716,16 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - return dccp_sane_rtt(delta); + if (unlikely(delta <= 0)) { + DCCP_WARN("unusable RTT sample %ld, using min\n", delta); + return DCCP_SANE_RTT_MIN; + } + if (unlikely(delta > DCCP_SANE_RTT_MAX)) { + DCCP_WARN("RTT sample %ld too large, using max\n", delta); + return DCCP_SANE_RTT_MAX; + } + + return delta; } EXPORT_SYMBOL_GPL(dccp_sample_rtt); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b623f6b2548..882c5c4de69 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -545,7 +545,6 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->opt); } @@ -596,8 +595,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ad6212e0043..5e1ee0da2c4 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -302,7 +302,6 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); if (inet6_rsk(req)->pktopts != NULL) kfree_skb(inet6_rsk(req)->pktopts); } @@ -425,8 +424,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; + dccp_reqsk_init(req, skb); dreq = 
dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index f4d9c8f60ed..b2804e2d1b8 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,6 +42,16 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); +void dccp_minisock_init(struct dccp_minisock *dmsk) +{ + dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; + dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; + dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; + dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; + dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; + dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; +} + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -102,9 +112,10 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); + const struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); + struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -114,32 +125,65 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - INIT_LIST_HEAD(&newdp->dccps_featneg); + if (dccp_feat_clone(sk, newsk)) + goto out_free; + + if (newdmsk->dccpms_send_ack_vector) { + newdp->dccps_hc_rx_ackvec = + dccp_ackvec_alloc(GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) + goto out_free; + } + + newdp->dccps_hc_rx_ccid = + ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, + newsk, GFP_ATOMIC); + newdp->dccps_hc_tx_ccid = + ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, + newsk, GFP_ATOMIC); + if (unlikely(newdp->dccps_hc_rx_ccid == NULL || + newdp->dccps_hc_tx_ccid == NULL)) { + dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); + ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); + ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); +out_free: + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + /* * Step 3: Process LISTEN state * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; + + /* See dccp_v4_conn_request */ + newdmsk->dccpms_sequence_window = req->rcv_wnd; + + newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; + dccp_update_gss(newsk, dreq->dreq_iss); + + newdp->dccps_isr = dreq->dreq_isr; + dccp_update_gsr(newsk, dreq->dreq_isr); /* - * Activate features: initialise CCIDs, sequence windows etc. + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. 
*/ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } + dccp_set_seqno(&newdp->dccps_swl, + max48(newdp->dccps_swl, newdp->dccps_isr)); + dccp_set_seqno(&newdp->dccps_awl, + max48(newdp->dccps_awl, newdp->dccps_iss)); + dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); @@ -260,17 +304,14 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) +void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; + req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c index e5a32979d7d..0809b63cb05 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,20 +23,23 @@ #include "dccp.h" #include "feat.h" -u64 dccp_decode_value_var(const u8 *bf, const u8 len) +int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; +int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; +int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; +int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; +int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; + +static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) { - u64 value = 0; + u32 value = 0; - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; if (len > 3) - value += ((u64)*bf++) << 24; + value += *bf++ << 24; if (len > 2) - value += ((u64)*bf++) << 16; + value += *bf++ << 16; if (len > 1) - value += ((u64)*bf++) << 8; + value += *bf++ << 8; if (len > 0) value += *bf; @@ -54,6 +57,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; + u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -95,11 +99,18 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* + * CCID-Specific Options (from RFC 4340, sec. 10.3): + * + * Option numbers 128 through 191 are for options sent from the + * HC-Sender to the HC-Receiver; option numbers 192 through 255 + * are for options sent from the HC-Receiver to the HC-Sender. + * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. 
+ * */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || + if (dreq != NULL && (opt >= 128 || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -120,13 +131,43 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ + case DCCPO_CHANGE_L: + /* fall through */ + case DCCPO_CHANGE_R: + if (pkt_type == DCCP_PKT_DATA) break; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; + if (len < 2) + goto out_invalid_option; + rc = dccp_feat_change_recv(sk, opt, *value, value + 1, + len - 1); + /* + * When there is a change error, change_recv is + * responsible for dealing with it. i.e. reply with an + * empty confirm. + * If the change was mandatory, then we need to die. + */ + if (rc && mandatory) + goto out_invalid_option; + break; + case DCCPO_CONFIRM_L: + /* fall through */ + case DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) + break; + if (len < 2) /* FIXME this disallows empty confirm */ + goto out_invalid_option; + if (dccp_feat_confirm_recv(sk, opt, *value, + value + 1, len - 1)) + goto out_invalid_option; + break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + + if (dccp_msk(sk)->dccpms_send_ack_vector && + dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) + goto out_invalid_option; break; case DCCPO_TIMESTAMP: if (len != 4) @@ -154,8 +195,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) @@ -212,25 +251,23 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: + case 128 ... 191: { + const u16 idx = value - options; + if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - * Fall through. - */ - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: + case 192 ... 
255: { + const u16 idx = value - options; + if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) + opt, len, idx, + value) != 0) goto out_invalid_option; + } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " @@ -252,10 +289,8 @@ out_nonsensical_length: out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; + DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; @@ -264,12 +299,9 @@ out_featneg_failed: EXPORT_SYMBOL_GPL(dccp_parse_options); -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) +static void dccp_encode_value_var(const u32 value, unsigned char *to, + const unsigned int len) { - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) @@ -429,140 +461,92 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; + u8 *to; - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); return -1; } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; + DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. 
- */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} + to = skb_push(skb, len + 3); + *to++ = type; + *to++ = len + 3; + *to++ = feat; -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; + if (len) + memcpy(to, val, len); - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *skb_push(skb, 1) = DCCPO_MANDATORY; + dccp_pr_debug("%s(%s (%d), ...), length %d\n", + dccp_feat_typename(type), + dccp_feat_name(feat), feat, len); return 0; } -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. 
- */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) +static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) { - u8 tot_len, *to; + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); + struct dccp_opt_pend *opt, *next; + int change = 0; + + /* confirm any options [NN opts] */ + list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + /* fear empty confirms */ + if (opt->dccpop_val) + kfree(opt->dccpop_val); + kfree(opt); + } + INIT_LIST_HEAD(&dmsk->dccpms_conf); + + /* see which features we need to send */ + list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { + /* see if we need to send any confirm */ + if (opt->dccpop_sc) { + dccp_insert_feat_opt(skb, opt->dccpop_type + 1, + opt->dccpop_feat, + opt->dccpop_sc->dccpoc_val, + opt->dccpop_sc->dccpoc_len); + + BUG_ON(!opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc->dccpoc_val); + kfree(opt->dccpop_sc); + opt->dccpop_sc = NULL; + } - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; + /* any option not confirmed, re-send it */ + if (!opt->dccpop_conf) { + dccp_insert_feat_opt(skb, opt->dccpop_type, + opt->dccpop_feat, opt->dccpop_val, + opt->dccpop_len); + change++; + } } - if (unlikely(val == NULL || len == 0)) - len = repeat_first = 0; - tot_len = 3 + repeat_first + len; + /* Retransmit timer. + * If this is the master listening sock, we don't set a timer on it. It + * should be fine because if the dude doesn't receive our RESPONSE + * [which will contain the CHANGE] he will send another REQUEST which + * will "retrnasmit" the change. + */ + if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { + dccp_pr_debug("reset feat negotiation timer %p\n", sk); - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; + /* XXX don't reset the timer on re-transmissions. I.e. reset it + * only when sending new stuff i guess. Currently the timer + * never backs off because on re-transmission it just resets it! + */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); return 0; } @@ -581,30 +565,19 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) + if (dmsk->dccpms_send_ndp_count && + dccp_insert_option_ndp(sk, skb)) return -1; - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) + if (!dccp_packet_without_ack(skb)) { + if (dmsk->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + dccp_insert_option_ackvec(sk, skb)) return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used in CCID 3 initialisation. 
- */ - if (dccp_insert_option_timestamp(sk, skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } } if (dp->dccps_hc_rx_insert_options) { @@ -613,6 +586,21 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } + /* Feature negotiation */ + /* Data packets can't do feat negotiation */ + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && + DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && + dccp_insert_options_feat(sk, skb)) + return -1; + + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && + dccp_insert_option_timestamp(sk, skb)) + return -1; + if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; @@ -625,9 +613,6 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; diff --git a/net/dccp/output.c b/net/dccp/output.c index 2532797a800..d06945c7d3d 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -26,13 +26,11 @@ static inline void dccp_event_ack_sent(struct sock *sk) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) { skb_set_owner_w(skb, sk); WARN_ON(sk->sk_send_head); sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); } /* @@ -163,27 +161,21 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) + * FIXME: this should come from the CCID infrastructure, where, say, + * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets + * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED + * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to + * make it a multiple of 4 */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? 
DCCPAV_MIN_OPTLEN : 0), 4); + + cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -208,158 +200,95 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Await CCID send permission + * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet * @sk: socket to wait for - * @delay: timeout in jiffies - * This is used by CCIDs which need to delay the send time in process context. + * @skb: current skb to pass on for waiting + * @delay: sleep timeout in milliseconds (> 0) + * This function is called by default when the socket is closed, and + * when a non-zero linger time is set on the socket. For consistency */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) +static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) { + struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); + unsigned long jiffdelay; + int rc; - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk->sk_sleep, &wait); + do { + dccp_pr_debug("delayed send by %d msec\n", delay); + jiffdelay = msecs_to_jiffies(delay); - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); - if (unlikely(skb == NULL)) - return; - len = skb->len; + sk->sk_write_pending++; + release_sock(sk); + schedule_timeout(jiffdelay); + lock_sock(sk); + sk->sk_write_pending--; - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } + if (sk->sk_err) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. 
- */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + } while ((delay = rc) > 0); +out: + finish_wait(sk->sk_sleep, &wait); + return rc; + +do_error: + rc = -EPIPE; + goto out; +do_interrupted: + rc = -EINTR; + goto out; } -/** - * dccp_flush_write_queue - Drain queue at end of connection - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) +void dccp_write_xmit(struct sock *sk, int block) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); + while ((skb = skb_peek(&sk->sk_write_queue))) { + int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + if (err > 0) { + if (!block) { + sk_reset_timer(sk, &dp->dccps_xmit_timer, + msecs_to_jiffies(err)+jiffies); + break; + } else + err = dccp_wait_for_ccid(sk, skb, err); + if (err && err != -EINTR) + DCCP_BUG("err=%d after dccp_wait_for_ccid", err); } - } -} -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; + skb_dequeue(&sk->sk_write_queue); + if (err == 0) { + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int len = skb->len; - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); + if (sk->sk_state == DCCP_PARTOPEN) { + /* See 8.1.5. 
Handshake Completion */ + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + dcb->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) + dcb->dccpd_type = DCCP_PKT_DATAACK; + else + dcb->dccpd_type = DCCP_PKT_DATA; + + err = dccp_transmit_skb(sk, skb); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + if (err) + DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", + err); + } else { + dccp_pr_debug("packet discarded due to err=%d\n", err); + kfree_skb(skb); } } } @@ -410,12 +339,10 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; + if (dccp_insert_options_rsk(dreq, skb)) { + kfree_skb(skb); + return NULL; + } /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); @@ -436,9 +363,6 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; -response_failed: - kfree_skb(skb); - return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); @@ -523,9 +447,8 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -int dccp_connect(struct sock *sk) +static inline void dccp_connect_init(struct sock *sk) { - struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -535,13 +458,19 @@ int dccp_connect(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ dp->dccps_gar = dp->dccps_iss; + icsk->icsk_retransmits = 0; +} + +int dccp_connect(struct sock *sk) +{ + struct sk_buff *skb; + struct inet_connection_sock *icsk = inet_csk(sk); + + dccp_connect_init(sk); + skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) return -ENOBUFS; @@ -551,11 +480,11 @@ int dccp_connect(struct sock *sk) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; @@ -642,12 +571,6 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. 
- */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - dccp_transmit_skb(sk, skb); } @@ -676,7 +599,9 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - skb = dccp_skb_entail(sk, skb); + dccp_write_xmit(sk, 1); + dccp_skb_entail(sk, skb); + dccp_transmit_skb(sk, skb_clone(skb, prio)); /* * Retransmission timer for active-close: RFC 4340, 8.3 requires * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ @@ -689,6 +614,6 @@ void dccp_send_close(struct sock *sk, const int active) */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); + } else + dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/probe.c b/net/dccp/probe.c index eaa59d82ab0..81368a7f537 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -46,54 +46,75 @@ static struct { struct kfifo *fifo; spinlock_t lock; wait_queue_head_t wait; - ktime_t start; + struct timespec tstart; } dccpw; -static void jdccp_write_xmit(struct sock *sk) +static void printl(const char *fmt, ...) { - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hctx = NULL; - struct timespec tv; - char buf[256]; - int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); + va_list args; + int len; + struct timespec now; + char tbuf[256]; - if (ccid == DCCPC_CCID3) - hctx = ccid3_hc_tx_sk(sk); + va_start(args, fmt); + getnstimeofday(&now); - if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { + now = timespec_sub(now, dccpw.tstart); - tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); - len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_nsec / NSEC_PER_USEC); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + kfifo_put(dccpw.fifo, tbuf, len); + wake_up(&dccpw.wait); +} + +static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct dccp_minisock *dmsk = dccp_msk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ccid3_hc_tx_sock *hctx; + + if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) + hctx = ccid3_hc_tx_sk(sk); + else + hctx = NULL; + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { if (hctx) - len += sprintf(buf + len, " %d %d %d %u %u %u %d", - hctx->s, hctx->rtt, hctx->p, hctx->x_calc, - (unsigned)(hctx->x_recv >> 6), - (unsigned)(hctx->x >> 6), hctx->t_ipi); - - len += sprintf(buf + len, "\n"); - kfifo_put(dccpw.fifo, buf, len); - wake_up(&dccpw.wait); + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " + "%llu %llu %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size, + hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, + hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, + hctx->ccid3hctx_x_recv >> 6, + hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); + else + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), size); } jprobe_return(); + return 0; } static struct jprobe dccp_send_probe = { .kp = { - .symbol_name = "dccp_write_xmit", + .symbol_name = "dccp_sendmsg", }, - .entry = jdccp_write_xmit, + .entry = jdccp_sendmsg, }; static int dccpprobe_open(struct inode *inode, struct file 
*file) { kfifo_reset(dccpw.fifo); - dccpw.start = ktime_get(); + getnstimeofday(&dccpw.tstart); return 0; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ecf3be961e1..d0bd3481976 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -67,9 +67,6 @@ void dccp_set_state(struct sock *sk, const int state) case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: @@ -178,25 +175,63 @@ EXPORT_SYMBOL_GPL(dccp_state_name); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + dccp_minisock_init(&dp->dccps_minisock); + icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = TCP_MIN_RCVMSS; + dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; + dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; dccp_init_xmit_timers(sk); - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); + /* + * FIXME: We're hardcoding the CCID, and doing this at this point makes + * the listening (master) sock get CCID control blocks, which is not + * necessary, but for now, to not mess with the test userspace apps, + * lets leave it here, later the real solution is to do this in a + * setsockopt(CCIDs-I-want/accept). 
-acme + */ + if (likely(ctl_sock_initialized)) { + int rc = dccp_feat_init(dmsk); + + if (rc) + return rc; + + if (dmsk->dccpms_send_ack_vector) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } + dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, + sk, GFP_KERNEL); + dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, + sk, GFP_KERNEL); + if (unlikely(dp->dccps_hc_rx_ccid == NULL || + dp->dccps_hc_tx_ccid == NULL)) { + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + if (dmsk->dccpms_send_ack_vector) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + return -ENOMEM; + } + } else { + /* control socket doesn't need feat nego */ + INIT_LIST_HEAD(&dmsk->dccpms_pending); + INIT_LIST_HEAD(&dmsk->dccpms_conf); + } + return 0; } @@ -205,6 +240,7 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -222,7 +258,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dp->dccps_hc_rx_ackvec != NULL) { + if (dmsk->dccpms_send_ack_vector) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } @@ -231,7 +267,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); + dccp_feat_clean(dmsk); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); @@ -241,9 +277,6 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) - return -EPROTO; return inet_csk_listen_start(sk, backlog); } @@ -433,70 +466,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +/* byte 1 is feature. the rest is the preference list */ +static int dccp_setsockopt_change(struct sock *sk, int type, + struct dccp_so_feat __user *optval) { - u8 *list, len; - int i, rc; + struct dccp_so_feat opt; + u8 *val; + int rc; - if (cscov < 0 || cscov > 15) - return -EINVAL; + if (copy_from_user(&opt, optval, sizeof(opt))) + return -EFAULT; /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. + * rfc4340: 6.1. 
Change Options */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - char __user *optval, int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + if (opt.dccpsf_len < 1) return -EINVAL; - val = kmalloc(optlen, GFP_KERNEL); - if (val == NULL) + val = kmalloc(opt.dccpsf_len, GFP_KERNEL); + if (!val) return -ENOMEM; - if (copy_from_user(val, optval, optlen)) { - kfree(val); - return -EFAULT; + if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { + rc = -EFAULT; + goto out_free_val; } - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, + val, opt.dccpsf_len, GFP_KERNEL); + if (rc) + goto out_free_val; - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); +out: + return rc; +out_free_val: kfree(val); - return rc; + goto out; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, @@ -505,21 +510,7 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) + if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -530,38 +521,53 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + err = 0; + break; + case DCCP_SOCKOPT_CHANGE_L: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, + (struct dccp_so_feat __user *) + optval); + break; + case DCCP_SOCKOPT_CHANGE_R: + if (optlen != sizeof(struct dccp_so_feat)) + err = -EINVAL; + else + err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, + (struct dccp_so_feat __user *) + optval); + break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) + case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 
9.2 */ + if (val < 0 || val > 15) err = -EINVAL; else - dp->dccps_qpolicy = val; + dp->dccps_pcslen = val; break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) + case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ + if (val < 0 || val > 15) err = -EINVAL; - else - dp->dccps_tx_qlen = val; + else { + dp->dccps_pcrlen = val; + /* FIXME: add feature negotiation, + * ChangeL(MinimumChecksumCoverage, val) */ + } break; default: err = -ENOPROTOOPT; break; } - release_sock(sk); + release_sock(sk); return err; } @@ -642,18 +648,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; @@ -663,12 +657,6 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -711,47 +699,6 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { - - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -767,7 +714,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (dccp_qpolicy_full(sk)) { + if (sysctl_dccp_tx_qlen && + (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { rc = -EAGAIN; goto out_release; } @@ -795,12 +743,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - dccp_write_xmit(sk); + skb_queue_tail(&sk->sk_write_queue, skb); + dccp_write_xmit(sk,0); out_release: release_sock(sk); return rc ? 
: len; @@ -1023,22 +967,9 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 27383f88c75..00000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License v2 - * as published by the Free Software Foundation. - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. 
- */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - */ -static struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; - -} qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - /* Clear any skb fields that we used internally */ - skb->priority = 0; - - if (skb) - skb_unlink(skb, &sk->sk_write_queue); - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index a5a1856234e..21295993fdb 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,72 +18,76 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif -/* Boundary values */ -static int zero = 0, - u8_max = 0xFF; -static unsigned long seqw_min = 32; - static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), + .data = &sysctl_dccp_feat_sequence_window, + .maxlen = sizeof(sysctl_dccp_feat_sequence_window), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ + .proc_handler = proc_dointvec, }, { .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), + .data = &sysctl_dccp_feat_rx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. 
*/ + .proc_handler = proc_dointvec, }, { .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), + .data = &sysctl_dccp_feat_tx_ccid, + .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ack_ratio", + .data = &sysctl_dccp_feat_ack_ratio, + .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ackvec", + .data = &sysctl_dccp_feat_send_ack_vector, + .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "send_ndp", + .data = &sysctl_dccp_feat_send_ndp_count, + .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, /* RFC 4340, 10. */ + .proc_handler = proc_dointvec, }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &u8_max, + .proc_handler = proc_dointvec, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_dointvec, }, { .procname = "sync_ratelimit", diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 16359e29e7f..54b3c7e9e01 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -87,6 +87,17 @@ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + /* retransmit timer is used for feature negotiation throughout + * connection. In this case, no packet is re-transmitted, but rather an + * ack is generated and pending changes are placed into its options. + */ + if (sk->sk_send_head == NULL) { + dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); + if (sk->sk_state == DCCP_OPEN) + dccp_send_ack(sk); + goto backoff; + } + /* * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. @@ -115,6 +126,7 @@ static void dccp_retransmit_timer(struct sock *sk) return; } +backoff: icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); @@ -237,35 +249,32 @@ out: sock_put(sk); } -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * See the comments above %ccid_dequeueing_decision for supported modes. 
- */ -static void dccp_write_xmitlet(unsigned long data) +/* Transmit-delay timer: used by the CCIDs to delay actual send time */ +static void dccp_write_xmit_timer(unsigned long data) { struct sock *sk = (struct sock *)data; + struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); + sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); else - dccp_write_xmit(sk); + dccp_write_xmit(sk, 0); bh_unlock_sock(sk); + sock_put(sk); } -static void dccp_write_xmit_timer(unsigned long data) +static void dccp_init_write_xmit_timer(struct sock *sk) { - dccp_write_xmitlet(data); - sock_put((struct sock *)data); + struct dccp_sock *dp = dccp_sk(sk); + + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); } void dccp_init_xmit_timers(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_init_write_xmit_timer(sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } @@ -281,7 +290,8 @@ u32 dccp_timestamp(void) { s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - return div_u64(delta, DCCP_TIME_RESOLUTION); + do_div(delta, 10); + return delta; } EXPORT_SYMBOL_GPL(dccp_timestamp); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9da9f19ece8..f79a5160729 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -811,12 +811,25 @@ void tcp_update_metrics(struct sock *sk) } } +/* Numbers are taken from RFC3390. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than use a multiplier in the relevant range. + */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + if (!cwnd) { + if (tp->mss_cache > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache > 1095) ? 3 : 4; + } return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -- cgit v1.2.3-70-g09d2 From 92651940ab00dbe64722e908f70d816713d677b7 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2008 16:29:34 -0700 Subject: pkt_sched: Add multiqueue scheduler support This patch is intended to add a qdisc to support the new tx multiqueue architecture by providing a band for each hardware queue. By doing this it is possible to support a different qdisc per physical hardware queue. This qdisc uses the skb->queue_mapping to select which band to place the traffic onto. It then uses a round robin w/ a check to see if the subqueue is stopped to determine which band to dequeue the packet from. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- Documentation/networking/multiqueue.txt | 47 +++- include/linux/pkt_sched.h | 7 + net/sched/Kconfig | 9 + net/sched/Makefile | 1 + net/sched/sch_multiq.c | 467 ++++++++++++++++++++++++++++++++ 5 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 net/sched/sch_multiq.c (limited to 'Documentation') diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt index d391ea63114..5787ee6eca4 100644 --- a/Documentation/networking/multiqueue.txt +++ b/Documentation/networking/multiqueue.txt @@ -24,4 +24,49 @@ netif_{start|stop|wake}_subqueue() functions to manage each queue while the device is still operational. netdev->queue_lock is still used when the device comes online or when it's completely shut down (unregister_netdev(), etc.). -Author: Peter P. Waskiewicz Jr. + +Section 2: Qdisc support for multiqueue devices + +----------------------------------------------- + +Currently two qdiscs support multiqueue devices. The first is the default +pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue. A new +round-robin qdisc, sch_multiq also supports multiple hardware queues. The +qdisc is responsible for classifying the skb's and then directing the skb's to +bands and queues based on the value in skb->queue_mapping. Use this field in +the base driver to determine which queue to send the skb to. + +sch_multiq has been added for hardware that wishes to avoid unnecessary +requeuing. It will cycle though the bands and verify that the hardware queue +associated with the band is not stopped prior to dequeuing a packet. + +On qdisc load, the number of bands is based on the number of queues on the +hardware. Once the association is made, any skb with skb->queue_mapping set, +will be queued to the band associated with the hardware queue. + + +Section 3: Brief howto using MULTIQ for multiqueue devices +--------------------------------------------------------------- + +The userspace command 'tc,' part of the iproute2 package, is used to configure +qdiscs. To add the MULTIQ qdisc to your network device, assuming the device +is called eth0, run the following command: + +# tc qdisc add dev eth0 root handle 1: multiq + +The qdisc will allocate the number of bands to equal the number of queues that +the device reports, and bring the qdisc online. Assuming eth0 has 4 Tx +queues, the band mapping would look like: + +band 0 => queue 0 +band 1 => queue 1 +band 2 => queue 2 +band 3 => queue 3 + +Traffic will begin flowing through each queue if your base device has either +the default simple_tx_hash or a custom netdev->select_queue() defined. + +The behavior of tc filters remains the same. + +Author: Alexander Duyck +Original Author: Peter P. Waskiewicz Jr. diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index e5de421ac7b..5d921fa91a5 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -123,6 +123,13 @@ struct tc_prio_qopt __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ }; +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + /* TBF section */ struct tc_tbf_qopt diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9437b27ff84..efaa7a75e7f 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -106,6 +106,15 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. 
+config NET_SCH_MULTIQ + tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)" + ---help--- + Say Y here if you want to use an n-band queue packet scheduler + to support devices that have multiple hardware transmit queues. + + To compile this code as a module, choose M here: the + module will be called sch_multiq. + config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 1d2b0f7df84..3d9b953f7f6 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c new file mode 100644 index 00000000000..49a8b67ed3b --- /dev/null +++ b/net/sched/sch_multiq.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Author: Alexander Duyck + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +struct multiq_sched_data { + u16 bands; + u16 max_bands; + u16 curband; + struct tcf_proto *filter_list; + struct Qdisc **queues; +}; + + +static struct Qdisc * +multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + u32 band; + struct tcf_result res; + int err; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + err = tc_classify(skb, q->filter_list, &res); +#ifdef CONFIG_NET_CLS_ACT + switch (err) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + band = skb_get_queue_mapping(skb); + + if (band >= q->bands) + return q->queues[0]; + + return q->queues[band]; +} + +static int +multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc_enqueue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->bstats.bytes += qdisc_pkt_len(skb); + sch->bstats.packets++; + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + + +static int +multiq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = multiq_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + ret = qdisc->ops->requeue(skb, qdisc); + if (ret == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->qstats.requeues++; + return NET_XMIT_SUCCESS; + } + if (net_xmit_drop_count(ret)) + sch->qstats.drops++; + return ret; +} + + +static struct sk_buff *multiq_dequeue(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + struct sk_buff *skb; + int band; + + for (band = 0; band < q->bands; band++) { + /* cycle through bands to ensure fairness */ + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + + /* Check that target subqueue is available before + * pulling an skb to avoid excessive requeues + */ + if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } + } + } + return NULL; + +} + +static unsigned int multiq_drop(struct Qdisc *sch) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands-1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop) { + len = qdisc->ops->drop(qdisc); + if (len != 0) { + sch->q.qlen--; + return len; + } + } + } + return 0; +} + + +static void +multiq_reset(struct Qdisc *sch) +{ + u16 band; + struct multiq_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; + q->curband = 0; +} + +static void +multiq_destroy(struct Qdisc *sch) +{ + int band; + struct multiq_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(&q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); + + kfree(q->queues); +} + +static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + 
struct tc_multiq_qopt *qopt; + int i; + + if (!netif_is_multiqueue(qdisc_dev(sch))) + return -EINVAL; + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + + qopt->bands = qdisc_dev(sch)->real_num_tx_queues; + + sch_tree_lock(sch); + q->bands = qopt->bands; + for (i = q->bands; i < q->max_bands; i++) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(qdisc_dev(sch), + sch->dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, + i + 1)); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[i], child); + + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, + child->q.qlen); + qdisc_destroy(child); + } + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int multiq_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int i; + + q->queues = NULL; + + if (opt == NULL) + return -EINVAL; + + q->max_bands = qdisc_dev(sch)->num_tx_queues; + + q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); + if (!q->queues) + return -ENOBUFS; + for (i = 0; i < q->max_bands; i++) + q->queues[i] = &noop_qdisc; + + return multiq_tune(sch, opt); +} + +static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_multiq_qopt opt; + + opt.bands = q->bands; + opt.max_bands = q->max_bands; + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +multiq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return multiq_get(sch, classid); +} + + +static void multiq_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent, + struct nlattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int multiq_delete(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > 
q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl-1]) + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct multiq_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static const struct Qdisc_class_ops multiq_class_ops = { + .graft = multiq_graft, + .leaf = multiq_leaf, + .get = multiq_get, + .put = multiq_put, + .change = multiq_change, + .delete = multiq_delete, + .walk = multiq_walk, + .tcf_chain = multiq_find_tcf, + .bind_tcf = multiq_bind, + .unbind_tcf = multiq_put, + .dump = multiq_dump_class, + .dump_stats = multiq_dump_class_stats, +}; + +static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { + .next = NULL, + .cl_ops = &multiq_class_ops, + .id = "multiq", + .priv_size = sizeof(struct multiq_sched_data), + .enqueue = multiq_enqueue, + .dequeue = multiq_dequeue, + .requeue = multiq_requeue, + .drop = multiq_drop, + .init = multiq_init, + .reset = multiq_reset, + .destroy = multiq_destroy, + .change = multiq_tune, + .dump = multiq_dump, + .owner = THIS_MODULE, +}; + +static int __init multiq_module_init(void) +{ + return register_qdisc(&multiq_qdisc_ops); +} + +static void __exit multiq_module_exit(void) +{ + unregister_qdisc(&multiq_qdisc_ops); +} + +module_init(multiq_module_init) +module_exit(multiq_module_exit) + +MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From ca9b0e27e072be4cef2f5f0cbc0b0fd94eae3520 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2008 16:30:20 -0700 Subject: pkt_action: add new action skbedit This new action will have the ability to change the priority and/or queue_mapping fields on an sk_buff. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- Documentation/networking/multiqueue.txt | 9 +- include/linux/tc_act/Kbuild | 1 + include/linux/tc_act/tc_skbedit.h | 44 +++++++ include/net/tc_act/tc_skbedit.h | 34 ++++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/act_skbedit.c | 203 ++++++++++++++++++++++++++++++++ 7 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 include/linux/tc_act/tc_skbedit.h create mode 100644 include/net/tc_act/tc_skbedit.h create mode 100644 net/sched/act_skbedit.c (limited to 'Documentation') diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt index 5787ee6eca4..10113ffa807 100644 --- a/Documentation/networking/multiqueue.txt +++ b/Documentation/networking/multiqueue.txt @@ -66,7 +66,14 @@ band 3 => queue 3 Traffic will begin flowing through each queue if your base device has either the default simple_tx_hash or a custom netdev->select_queue() defined. -The behavior of tc filters remains the same. +The behavior of tc filters remains the same. 
However a new tc action, +skbedit, has been added. Assuming you wanted to route all traffic to a +specific host, for example 192.168.0.3, though a specific queue you could use +this action and establish a filter such as: + +tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \ + match ip dst 192.168.0.3 \ + action skbedit queue_mapping 3 Author: Alexander Duyck Original Author: Peter P. Waskiewicz Jr. diff --git a/include/linux/tc_act/Kbuild b/include/linux/tc_act/Kbuild index 6dac0d7365c..76990937f4c 100644 --- a/include/linux/tc_act/Kbuild +++ b/include/linux/tc_act/Kbuild @@ -3,3 +3,4 @@ header-y += tc_ipt.h header-y += tc_mirred.h header-y += tc_pedit.h header-y += tc_nat.h +header-y += tc_skbedit.h diff --git a/include/linux/tc_act/tc_skbedit.h b/include/linux/tc_act/tc_skbedit.h new file mode 100644 index 00000000000..a14e461a7af --- /dev/null +++ b/include/linux/tc_act/tc_skbedit.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Alexander Duyck + */ + +#ifndef __LINUX_TC_SKBEDIT_H +#define __LINUX_TC_SKBEDIT_H + +#include + +#define TCA_ACT_SKBEDIT 11 + +#define SKBEDIT_F_PRIORITY 0x1 +#define SKBEDIT_F_QUEUE_MAPPING 0x2 + +struct tc_skbedit { + tc_gen; +}; + +enum { + TCA_SKBEDIT_UNSPEC, + TCA_SKBEDIT_TM, + TCA_SKBEDIT_PARMS, + TCA_SKBEDIT_PRIORITY, + TCA_SKBEDIT_QUEUE_MAPPING, + __TCA_SKBEDIT_MAX +}; +#define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) + +#endif diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h new file mode 100644 index 00000000000..6abb3ed3ebf --- /dev/null +++ b/include/net/tc_act/tc_skbedit.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Author: Alexander Duyck + */ + +#ifndef __NET_TC_SKBEDIT_H +#define __NET_TC_SKBEDIT_H + +#include + +struct tcf_skbedit { + struct tcf_common common; + u32 flags; + u32 priority; + u16 queue_mapping; +}; +#define to_skbedit(pc) \ + container_of(pc, struct tcf_skbedit, common) + +#endif /* __NET_TC_SKBEDIT_H */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index efaa7a75e7f..6767e54155d 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -485,6 +485,17 @@ config NET_ACT_SIMP To compile this code as a module, choose M here: the module will be called simple. +config NET_ACT_SKBEDIT + tristate "SKB Editing" + depends on NET_CLS_ACT + ---help--- + Say Y here to change skb priority or queue_mapping settings. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called skbedit. + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 3d9b953f7f6..e60c9925b26 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o +obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c new file mode 100644 index 00000000000..fe9777e77f3 --- /dev/null +++ b/net/sched/act_skbedit.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2008, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Author: Alexander Duyck + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SKBEDIT_TAB_MASK 15 +static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1]; +static u32 skbedit_idx_gen; +static DEFINE_RWLOCK(skbedit_lock); + +static struct tcf_hashinfo skbedit_hash_info = { + .htab = tcf_skbedit_ht, + .hmask = SKBEDIT_TAB_MASK, + .lock = &skbedit_lock, +}; + +static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbedit *d = a->priv; + + spin_lock(&d->tcf_lock); + d->tcf_tm.lastuse = jiffies; + d->tcf_bstats.bytes += qdisc_pkt_len(skb); + d->tcf_bstats.packets++; + + if (d->flags & SKBEDIT_F_PRIORITY) + skb->priority = d->priority; + if (d->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > d->queue_mapping) + skb_set_queue_mapping(skb, d->queue_mapping); + + spin_unlock(&d->tcf_lock); + return d->tcf_action; +} + +static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { + [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, + [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) }, +}; + +static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; + struct tc_skbedit *parm; + struct tcf_skbedit *d; + struct tcf_common *pc; + u32 flags = 0, *priority = NULL; + u16 *queue_mapping = NULL; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + if (err < 0) + return err; + + if (tb[TCA_SKBEDIT_PARMS] == NULL) + return -EINVAL; + + if (tb[TCA_SKBEDIT_PRIORITY] != NULL) { + flags |= SKBEDIT_F_PRIORITY; + priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]); + } + + if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + flags |= SKBEDIT_F_QUEUE_MAPPING; + queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); + } + if (!flags) + return -EINVAL; + + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, + &skbedit_idx_gen, &skbedit_hash_info); + if (unlikely(!pc)) + return -ENOMEM; + + d = to_skbedit(pc); + ret = ACT_P_CREATED; + } else { + d = to_skbedit(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &skbedit_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&d->tcf_lock); + + d->flags = flags; + if (flags & SKBEDIT_F_PRIORITY) + d->priority = *priority; + if (flags & SKBEDIT_F_QUEUE_MAPPING) + d->queue_mapping = *queue_mapping; + d->tcf_action = parm->action; + + spin_unlock_bh(&d->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &skbedit_hash_info); + return ret; +} + +static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbedit *d = a->priv; + + if (d) + return tcf_hash_release(&d->common, bind, &skbedit_hash_info); + return 0; +} + +static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbedit *d = a->priv; + struct tc_skbedit opt; + struct tcf_t t; + + opt.index = d->tcf_index; + opt.refcnt = d->tcf_refcnt - ref; + opt.bindcnt = d->tcf_bindcnt - bind; + opt.action = d->tcf_action; + NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt); + if (d->flags & SKBEDIT_F_PRIORITY) + NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority), + &d->priority); + if 
(d->flags & SKBEDIT_F_QUEUE_MAPPING) + NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING, + sizeof(d->queue_mapping), &d->queue_mapping); + t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(d->tcf_tm.expires); + NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t); + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_skbedit_ops = { + .kind = "skbedit", + .hinfo = &skbedit_hash_info, + .type = TCA_ACT_SKBEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_skbedit, + .dump = tcf_skbedit_dump, + .cleanup = tcf_skbedit_cleanup, + .init = tcf_skbedit_init, + .walk = tcf_generic_walker, +}; + +MODULE_AUTHOR("Alexander Duyck, "); +MODULE_DESCRIPTION("SKB Editing"); +MODULE_LICENSE("GPL"); + +static int __init skbedit_init_module(void) +{ + return tcf_register_action(&act_skbedit_ops); +} + +static void __exit skbedit_cleanup_module(void) +{ + tcf_unregister_action(&act_skbedit_ops); +} + +module_init(skbedit_init_module); +module_exit(skbedit_cleanup_module); -- cgit v1.2.3-70-g09d2 From 67333bb5679325db310bb612c1de3e6e47bb0043 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2008 17:56:50 -0700 Subject: skbedit: Fix a typo in the documentation Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- Documentation/networking/multiqueue.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt index 10113ffa807..8c2b06b77f6 100644 --- a/Documentation/networking/multiqueue.txt +++ b/Documentation/networking/multiqueue.txt @@ -68,7 +68,7 @@ the default simple_tx_hash or a custom netdev->select_queue() defined. The behavior of tc filters remains the same. However a new tc action, skbedit, has been added. Assuming you wanted to route all traffic to a -specific host, for example 192.168.0.3, though a specific queue you could use +specific host, for example 192.168.0.3, through a specific queue you could use this action and establish a filter such as: tc filter add dev eth0 parent 1: protocol ip prio 1 u32 \ -- cgit v1.2.3-70-g09d2 From f07d1501292b3b0d3276ee0e537005526a45e242 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 12 Sep 2008 17:57:23 -0700 Subject: multiq: Further multiqueue cleanup This patch resolves a few issues found with multiq including wording suggestions and a problem seen in the allocation of queues. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- Documentation/networking/multiqueue.txt | 14 +++++++------- net/sched/sch_multiq.c | 13 +++++++++---- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/multiqueue.txt b/Documentation/networking/multiqueue.txt index 8c2b06b77f6..4caa0e314cc 100644 --- a/Documentation/networking/multiqueue.txt +++ b/Documentation/networking/multiqueue.txt @@ -29,15 +29,15 @@ Section 2: Qdisc support for multiqueue devices ----------------------------------------------- -Currently two qdiscs support multiqueue devices. The first is the default -pfifo_fast qdisc. This qdisc supports one qdisc per hardware queue. A new -round-robin qdisc, sch_multiq also supports multiple hardware queues. The +Currently two qdiscs are optimized for multiqueue devices. The first is the +default pfifo_fast qdisc. 
This qdisc supports one qdisc per hardware queue. +A new round-robin qdisc, sch_multiq also supports multiple hardware queues. The qdisc is responsible for classifying the skb's and then directing the skb's to bands and queues based on the value in skb->queue_mapping. Use this field in the base driver to determine which queue to send the skb to. -sch_multiq has been added for hardware that wishes to avoid unnecessary -requeuing. It will cycle though the bands and verify that the hardware queue +sch_multiq has been added for hardware that wishes to avoid head-of-line +blocking. It will cycle though the bands and verify that the hardware queue associated with the band is not stopped prior to dequeuing a packet. On qdisc load, the number of bands is based on the number of queues on the @@ -63,8 +63,8 @@ band 1 => queue 1 band 2 => queue 2 band 3 => queue 3 -Traffic will begin flowing through each queue if your base device has either -the default simple_tx_hash or a custom netdev->select_queue() defined. +Traffic will begin flowing through each queue based on either the simple_tx_hash +function or based on netdev->select_queue() if you have it defined. The behavior of tc filters remains the same. However a new tc action, skbedit, has been added. Assuming you wanted to route all traffic to a diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 49a8b67ed3b..5d9cd68e91d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -214,8 +214,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { - struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); - if (child != &noop_qdisc) { + if (q->queues[i] != &noop_qdisc) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); qdisc_tree_decrease_qlen(child, child->q.qlen); qdisc_destroy(child); } @@ -250,7 +250,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) static int multiq_init(struct Qdisc *sch, struct nlattr *opt) { struct multiq_sched_data *q = qdisc_priv(sch); - int i; + int i, err; q->queues = NULL; @@ -265,7 +265,12 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - return multiq_tune(sch, opt); + err = multiq_tune(sch,opt); + + if (err) + kfree(q->queues); + + return err; } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From b2e1b30290539b344cbaff0d9da38012e03aa347 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 9 Sep 2008 23:19:48 -0700 Subject: cfg80211: Add new wireless regulatory infrastructure This adds the new wireless regulatory infrastructure. The main motiviation behind this was to centralize regulatory code as each driver was implementing their own regulatory solution, and to replace the initial centralized code we have where: * only 3 regulatory domains are supported: US, JP and EU * regulatory domains can only be changed through module parameter * all rules were built statically in the kernel We now have support for regulatory domains for many countries and regulatory domains are now queried through a userspace agent through udev allowing distributions to update regulatory rules without updating the kernel. Each driver can regulatory_hint() a regulatory domain based on either their EEPROM mapped regulatory domain value to a respective ISO/IEC 3166-1 country code or pass an internally built regulatory domain. 
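As a rough sketch of the alpha2 case (the zd_reg2alpha2() helper and mac->regdomain are taken from the zd1211rw example in the new Documentation/networking/regulatory.txt added further below, so this only restates that flow), a driver hints its EEPROM-derived country after registering its wiphy:

	r = zd_reg2alpha2(mac->regdomain, alpha2);
	if (!r)
		regulatory_hint(hw->wiphy, alpha2, NULL);
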
We also add support to let the user set the regulatory domain through userspace in case of faulty EEPROMs to further help compliance. Support for world roaming will be added soon for cards capable of this. For more information see: http://wireless.kernel.org/en/developers/Regulatory/CRDA For now we leave an option to enable the old module parameter, ieee80211_regdom, and to build the 3 old regdomains statically (US, JP and EU). This option is CONFIG_WIRELESS_OLD_REGULATORY. These old static definitions and the module parameter is being scheduled for removal for 2.6.29. Note that if you use this you won't make use of a world regulatory domain as its pointless. If you leave this option enabled and if CRDA is present and you use US or JP we will try to ask CRDA to update us a regulatory domain for us. Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- Documentation/feature-removal-schedule.txt | 18 + Documentation/networking/regulatory.txt | 194 +++++++ include/linux/nl80211.h | 96 +++- include/net/cfg80211.h | 60 +++ include/net/mac80211.h | 2 + include/net/wireless.h | 58 +++ net/mac80211/cfg.c | 7 + net/wireless/Kconfig | 32 ++ net/wireless/core.c | 162 +++++- net/wireless/core.h | 2 +- net/wireless/nl80211.c | 151 ++++++ net/wireless/reg.c | 805 +++++++++++++++++++++++++---- net/wireless/reg.h | 44 ++ 13 files changed, 1513 insertions(+), 118 deletions(-) create mode 100644 Documentation/networking/regulatory.txt create mode 100644 net/wireless/reg.h (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index eb1a47b9742..c93fcdec246 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -6,6 +6,24 @@ be removed from this file. --------------------------- +What: old static regulatory information and ieee80211_regdom module parameter +When: 2.6.29 +Why: The old regulatory infrastructure has been replaced with a new one + which does not require statically defined regulatory domains. We do + not want to keep static regulatory domains in the kernel due to the + the dynamic nature of regulatory law and localization. We kept around + the old static definitions for the regulatory domains of: + * US + * JP + * EU + and used by default the US when CONFIG_WIRELESS_OLD_REGULATORY was + set. We also kept around the ieee80211_regdom module parameter in case + some applications were relying on it. Changing regulatory domains + can now be done instead by using nl80211, as is done with iw. +Who: Luis R. Rodriguez + +--------------------------- + What: dev->power.power_state When: July 2007 Why: Broken design for runtime control over driver power states, confusing diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt new file mode 100644 index 00000000000..a96989a8ff3 --- /dev/null +++ b/Documentation/networking/regulatory.txt @@ -0,0 +1,194 @@ +Linux wireless regulatory documentation +--------------------------------------- + +This document gives a brief review over how the Linux wireless +regulatory infrastructure works. 
+ +More up to date information can be obtained at the project's web page: + +http://wireless.kernel.org/en/developers/Regulatory + +Keeping regulatory domains in userspace +--------------------------------------- + +Due to the dynamic nature of regulatory domains we keep them +in userspace and provide a framework for userspace to upload +to the kernel one regulatory domain to be used as the central +core regulatory domain all wireless devices should adhere to. + +How to get regulatory domains to the kernel +------------------------------------------- + +Userspace gets a regulatory domain in the kernel by having +a userspace agent build it and send it via nl80211. Only +expected regulatory domains will be respected by the kernel. + +A currently available userspace agent which can accomplish this +is CRDA - central regulatory domain agent. Its documented here: + +http://wireless.kernel.org/en/developers/Regulatory/CRDA + +Essentially the kernel will send a udev event when it knows +it needs a new regulatory domain. A udev rule can be put in place +to trigger crda to send the respective regulatory domain for a +specific ISO/IEC 3166 alpha2. + +Below is an example udev rule which can be used: + +# Example file, should be put in /etc/udev/rules.d/regulatory.rules +KERNEL=="regulatory*", ACTION=="change", SUBSYSTEM=="platform", RUN+="/sbin/crda" + +The alpha2 is passed as an environment variable under the variable COUNTRY. + +Who asks for regulatory domains? +-------------------------------- + +* Users + +Users can use iw: + +http://wireless.kernel.org/en/users/Documentation/iw + +An example: + + # set regulatory domain to "Costa Rica" + iw reg set CR + +This will request the kernel to set the regulatory domain to +the specificied alpha2. The kernel in turn will then ask userspace +to provide a regulatory domain for the alpha2 specified by the user +by sending a uevent. + +* Wireless subsystems for Country Information elements + +The kernel will send a uevent to inform userspace a new +regulatory domain is required. More on this to be added +as its integration is added. + +* Drivers + +If drivers determine they need a specific regulatory domain +set they can inform the wireless core using regulatory_hint(). +They have two options -- they either provide an alpha2 so that +crda can provide back a regulatory domain for that country or +they can build their own regulatory domain based on internal +custom knowledge so the wireless core can respect it. + +*Most* drivers will rely on the first mechanism of providing a +regulatory hint with an alpha2. For these drivers there is an additional +check that can be used to ensure compliance based on custom EEPROM +regulatory data. This additional check can be used by drivers by +registering on its struct wiphy a reg_notifier() callback. This notifier +is called when the core's regulatory domain has been changed. The driver +can use this to review the changes made and also review who made them +(driver, user, country IE) and determine what to allow based on its +internal EEPROM data. Devices drivers wishing to be capable of world +roaming should use this callback. More on world roaming will be +added to this document when its support is enabled. + +Device drivers who provide their own built regulatory domain +do not need a callback as the channels registered by them are +the only ones that will be allowed and therefore *additional* +cannels cannot be enabled. 
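+Example sketch - registering a reg_notifier() callback:
+--------------------------------------------------------
+
+The following is only a sketch of the reg_notifier() callback mentioned
+above; the "mydriver" names are placeholders and the actual comparison
+against EEPROM data is left to the driver:
+
+static int mydriver_reg_notifier(struct wiphy *wiphy,
+				 enum reg_set_by setby)
+{
+	/* Review the channels now enabled on the wiphy against the
+	 * driver's own EEPROM regulatory data; setby indicates who
+	 * requested the regulatory domain change. */
+	return 0;
+}
+
+and, before the wiphy is registered:
+
+	wiphy->reg_notifier = mydriver_reg_notifier;
+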
+ +Example code - drivers hinting an alpha2: +------------------------------------------ + +This example comes from the zd1211rw device driver. You can start +by having a mapping of your device's EEPROM country/regulatory +domain value to to a specific alpha2 as follows: + +static struct zd_reg_alpha2_map reg_alpha2_map[] = { + { ZD_REGDOMAIN_FCC, "US" }, + { ZD_REGDOMAIN_IC, "CA" }, + { ZD_REGDOMAIN_ETSI, "DE" }, /* Generic ETSI, use most restrictive */ + { ZD_REGDOMAIN_JAPAN, "JP" }, + { ZD_REGDOMAIN_JAPAN_ADD, "JP" }, + { ZD_REGDOMAIN_SPAIN, "ES" }, + { ZD_REGDOMAIN_FRANCE, "FR" }, + +Then you can define a routine to map your read EEPROM value to an alpha2, +as follows: + +static int zd_reg2alpha2(u8 regdomain, char *alpha2) +{ + unsigned int i; + struct zd_reg_alpha2_map *reg_map; + for (i = 0; i < ARRAY_SIZE(reg_alpha2_map); i++) { + reg_map = ®_alpha2_map[i]; + if (regdomain == reg_map->reg) { + alpha2[0] = reg_map->alpha2[0]; + alpha2[1] = reg_map->alpha2[1]; + return 0; + } + } + return 1; +} + +Lastly, you can then hint to the core of your discovered alpha2, if a match +was found. You need to do this after you have registered your wiphy. You +are expected to do this during initialization. + + r = zd_reg2alpha2(mac->regdomain, alpha2); + if (!r) + regulatory_hint(hw->wiphy, alpha2, NULL); + +Example code - drivers providing a built in regulatory domain: +-------------------------------------------------------------- + +If you have regulatory information you can obtain from your +driver and you *need* to use this we let you build a regulatory domain +structure and pass it to the wireless core. To do this you should +kmalloc() a structure big enough to hold your regulatory domain +structure and you should then fill it with your data. Finally you simply +call regulatory_hint() with the regulatory domain structure in it. + +Bellow is a simple example, with a regulatory domain cached using the stack. +Your implementation may vary (read EEPROM cache instead, for example). + +Example cache of some regulatory domain + +struct ieee80211_regdomain mydriver_jp_regdom = { + .n_reg_rules = 3, + .alpha2 = "JP", + //.alpha2 = "99", /* If I have no alpha2 to map it to */ + .reg_rules = { + /* IEEE 802.11b/g, channels 1..14 */ + REG_RULE(2412-20, 2484+20, 40, 6, 20, 0), + /* IEEE 802.11a, channels 34..48 */ + REG_RULE(5170-20, 5240+20, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN), + /* IEEE 802.11a, channels 52..64 */ + REG_RULE(5260-20, 5320+20, 40, 6, 20, + NL80211_RRF_NO_IBSS | + NL80211_RRF_DFS), + } +}; + +Then in some part of your code after your wiphy has been registered: + + int r; + struct ieee80211_regdomain *rd; + int size_of_regd; + int num_rules = mydriver_jp_regdom.n_reg_rules; + unsigned int i; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + (num_rules * sizeof(struct ieee80211_reg_rule)); + + rd = kzalloc(size_of_regd, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + memcpy(rd, &mydriver_jp_regdom, sizeof(struct ieee80211_regdomain)); + + for (i=0; i < num_rules; i++) { + memcpy(&rd->reg_rules[i], &mydriver_jp_regdom.reg_rules[i], + sizeof(struct ieee80211_reg_rule)); + } + r = regulatory_hint(hw->wiphy, NULL, rd); + if (r) { + kfree(rd); + return r; + } + diff --git a/include/linux/nl80211.h b/include/linux/nl80211.h index 5e51f4e7600..9bad65400fb 100644 --- a/include/linux/nl80211.h +++ b/include/linux/nl80211.h @@ -92,6 +92,20 @@ * @NL80211_CMD_SET_BSS: Set BSS attributes for BSS identified by * %NL80211_ATTR_IFINDEX. * + * @NL80211_CMD_SET_REG: Set current regulatory domain. 
CRDA sends this command + * after being queried by the kernel. CRDA replies by sending a regulatory + * domain structure which consists of %NL80211_ATTR_REG_ALPHA set to our + * current alpha2 if it found a match. It also provides + * NL80211_ATTR_REG_RULE_FLAGS, and a set of regulatory rules. Each + * regulatory rule is a nested set of attributes given by + * %NL80211_ATTR_REG_RULE_FREQ_[START|END] and + * %NL80211_ATTR_FREQ_RANGE_MAX_BW with an attached power rule given by + * %NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and + * %NL80211_ATTR_REG_RULE_POWER_MAX_EIRP. + * @NL80211_CMD_REQ_SET_REG: ask the wireless core to set the regulatory domain + * to the the specified ISO/IEC 3166-1 alpha2 country code. The core will + * store this as a valid request and then query userspace for it. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -131,7 +145,10 @@ enum nl80211_commands { NL80211_CMD_SET_BSS, - /* add commands here */ + NL80211_CMD_SET_REG, + NL80211_CMD_REQ_SET_REG, + + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ __NL80211_CMD_AFTER_LAST, @@ -197,10 +214,21 @@ enum nl80211_commands { * info given for %NL80211_CMD_GET_MPATH, nested attribute described at * &enum nl80211_mpath_info. * - * * @NL80211_ATTR_MNTR_FLAGS: flags, nested element with NLA_FLAG attributes of * &enum nl80211_mntr_flags. * + * @NL80211_ATTR_REG_ALPHA2: an ISO-3166-alpha2 country code for which the + * current regulatory domain should be set to or is already set to. + * For example, 'CR', for Costa Rica. This attribute is used by the kernel + * to query the CRDA to retrieve one regulatory domain. This attribute can + * also be used by userspace to query the kernel for the currently set + * regulatory domain. We chose an alpha2 as that is also used by the + * IEEE-802.11d country information element to identify a country. + * Users can also simply ask the wireless core to set regulatory domain + * to a specific alpha2. + * @NL80211_ATTR_REG_RULES: a nested array of regulatory domain regulatory + * rules. + * * @NL80211_ATTR_BSS_CTS_PROT: whether CTS protection is enabled (u8, 0 or 1) * @NL80211_ATTR_BSS_SHORT_PREAMBLE: whether short preamble is enabled * (u8, 0 or 1) @@ -265,6 +293,9 @@ enum nl80211_attrs { NL80211_ATTR_SUPPORTED_IFTYPES, + NL80211_ATTR_REG_ALPHA2, + NL80211_ATTR_REG_RULES, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -278,6 +309,7 @@ enum nl80211_attrs { #define NL80211_ATTR_HT_CAPABILITY NL80211_ATTR_HT_CAPABILITY #define NL80211_MAX_SUPP_RATES 32 +#define NL80211_MAX_SUPP_REG_RULES 32 #define NL80211_TKIP_DATA_OFFSET_ENCR_KEY 0 #define NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY 16 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 @@ -472,6 +504,66 @@ enum nl80211_bitrate_attr { NL80211_BITRATE_ATTR_MAX = __NL80211_BITRATE_ATTR_AFTER_LAST - 1 }; +/** + * enum nl80211_reg_rule_attr - regulatory rule attributes + * @NL80211_ATTR_REG_RULE_FLAGS: a set of flags which specify additional + * considerations for a given frequency range. These are the + * &enum nl80211_reg_rule_flags. + * @NL80211_ATTR_FREQ_RANGE_START: starting frequencry for the regulatory + * rule in KHz. This is not a center of frequency but an actual regulatory + * band edge. + * @NL80211_ATTR_FREQ_RANGE_END: ending frequency for the regulatory rule + * in KHz. This is not a center a frequency but an actual regulatory + * band edge. 
+ * @NL80211_ATTR_FREQ_RANGE_MAX_BW: maximum allowed bandwidth for this + * frequency range, in KHz. + * @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN: the maximum allowed antenna gain + * for a given frequency range. The value is in mBi (100 * dBi). + * If you don't have one then don't send this. + * @NL80211_ATTR_POWER_RULE_MAX_EIRP: the maximum allowed EIRP for + * a given frequency range. The value is in mBm (100 * dBm). + */ +enum nl80211_reg_rule_attr { + __NL80211_REG_RULE_ATTR_INVALID, + NL80211_ATTR_REG_RULE_FLAGS, + + NL80211_ATTR_FREQ_RANGE_START, + NL80211_ATTR_FREQ_RANGE_END, + NL80211_ATTR_FREQ_RANGE_MAX_BW, + + NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN, + NL80211_ATTR_POWER_RULE_MAX_EIRP, + + /* keep last */ + __NL80211_REG_RULE_ATTR_AFTER_LAST, + NL80211_REG_RULE_ATTR_MAX = __NL80211_REG_RULE_ATTR_AFTER_LAST - 1 +}; + +/** + * enum nl80211_reg_rule_flags - regulatory rule flags + * + * @NL80211_RRF_NO_OFDM: OFDM modulation not allowed + * @NL80211_RRF_NO_CCK: CCK modulation not allowed + * @NL80211_RRF_NO_INDOOR: indoor operation not allowed + * @NL80211_RRF_NO_OUTDOOR: outdoor operation not allowed + * @NL80211_RRF_DFS: DFS support is required to be used + * @NL80211_RRF_PTP_ONLY: this is only for Point To Point links + * @NL80211_RRF_PTMP_ONLY: this is only for Point To Multi Point links + * @NL80211_RRF_PASSIVE_SCAN: passive scan is required + * @NL80211_RRF_NO_IBSS: no IBSS is allowed + */ +enum nl80211_reg_rule_flags { + NL80211_RRF_NO_OFDM = 1<<0, + NL80211_RRF_NO_CCK = 1<<1, + NL80211_RRF_NO_INDOOR = 1<<2, + NL80211_RRF_NO_OUTDOOR = 1<<3, + NL80211_RRF_DFS = 1<<4, + NL80211_RRF_PTP_ONLY = 1<<5, + NL80211_RRF_PTMP_ONLY = 1<<6, + NL80211_RRF_PASSIVE_SCAN = 1<<7, + NL80211_RRF_NO_IBSS = 1<<8, +}; + /** * enum nl80211_mntr_flags - monitor configuration flags * diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0a72d1e3d3a..9f40c4d417d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -287,6 +287,66 @@ struct bss_parameters { int use_short_slot_time; }; +/** + * enum reg_set_by - Indicates who is trying to set the regulatory domain + * @REGDOM_SET_BY_INIT: regulatory domain was set by initialization. We will be + * using a static world regulatory domain by default. + * @REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world regulatory domain. + * @REGDOM_SET_BY_USER: User asked the wireless core to set the + * regulatory domain. + * @REGDOM_SET_BY_DRIVER: a wireless drivers has hinted to the wireless core + * it thinks its knows the regulatory domain we should be in. + * @REGDOM_SET_BY_COUNTRY_IE: the wireless core has received an 802.11 country + * information element with regulatory information it thinks we + * should consider. 
+ */ +enum reg_set_by { + REGDOM_SET_BY_INIT, + REGDOM_SET_BY_CORE, + REGDOM_SET_BY_USER, + REGDOM_SET_BY_DRIVER, + REGDOM_SET_BY_COUNTRY_IE, +}; + +struct ieee80211_freq_range { + u32 start_freq_khz; + u32 end_freq_khz; + u32 max_bandwidth_khz; +}; + +struct ieee80211_power_rule { + u32 max_antenna_gain; + u32 max_eirp; +}; + +struct ieee80211_reg_rule { + struct ieee80211_freq_range freq_range; + struct ieee80211_power_rule power_rule; + u32 flags; +}; + +struct ieee80211_regdomain { + u32 n_reg_rules; + char alpha2[2]; + struct ieee80211_reg_rule reg_rules[]; +}; + +#define MHZ_TO_KHZ(freq) (freq * 1000) +#define KHZ_TO_MHZ(freq) (freq / 1000) +#define DBI_TO_MBI(gain) (gain * 100) +#define MBI_TO_DBI(gain) (gain / 100) +#define DBM_TO_MBM(gain) (gain * 100) +#define MBM_TO_DBM(gain) (gain / 100) + +#define REG_RULE(start, end, bw, gain, eirp, reg_flags) { \ + .freq_range.start_freq_khz = (start) * 1000, \ + .freq_range.end_freq_khz = (end) * 1000, \ + .freq_range.max_bandwidth_khz = (bw) * 1000, \ + .power_rule.max_antenna_gain = (gain) * 100, \ + .power_rule.max_eirp = (eirp) * 100, \ + .flags = reg_flags, \ + } + /* from net/wireless.h */ struct wiphy; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fb9e62211c3..f504e3eca7d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -833,6 +833,8 @@ struct ieee80211_hw { s8 max_signal; }; +struct ieee80211_hw *wiphy_to_hw(struct wiphy *wiphy); + /** * SET_IEEE80211_DEV - set device for 802.11 hardware * diff --git a/include/net/wireless.h b/include/net/wireless.h index 1dc8ec3daa2..e4378cc6bf8 100644 --- a/include/net/wireless.h +++ b/include/net/wireless.h @@ -60,6 +60,7 @@ enum ieee80211_channel_flags { * with cfg80211. * * @center_freq: center frequency in MHz + * @max_bandwidth: maximum allowed bandwidth for this channel, in MHz * @hw_value: hardware-specific value for the channel * @flags: channel flags from &enum ieee80211_channel_flags. * @orig_flags: channel flags at registration time, used by regulatory @@ -73,6 +74,7 @@ enum ieee80211_channel_flags { struct ieee80211_channel { enum ieee80211_band band; u16 center_freq; + u8 max_bandwidth; u16 hw_value; u32 flags; int max_antenna_gain; @@ -178,6 +180,7 @@ struct ieee80211_supported_band { * struct wiphy - wireless hardware description * @idx: the wiphy index assigned to this item * @class_dev: the class device representing /sys/class/ieee80211/ + * @reg_notifier: the driver's regulatory notification callback */ struct wiphy { /* assign these fields before you register the wiphy */ @@ -197,6 +200,9 @@ struct wiphy { struct ieee80211_supported_band *bands[IEEE80211_NUM_BANDS]; + /* Lets us get back the wiphy on the callback */ + int (*reg_notifier)(struct wiphy *wiphy, enum reg_set_by setby); + /* fields below are read-only, assigned by cfg80211 */ /* the item in /sys/class/ieee80211/ points to this, @@ -322,6 +328,58 @@ extern int ieee80211_frequency_to_channel(int freq); */ extern struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, int freq); +/** + * __regulatory_hint - hint to the wireless core a regulatory domain + * @wiphy: if a driver is providing the hint this is the driver's very + * own &struct wiphy + * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain + * should be in. 
If @rd is set this should be NULL + * @rd: a complete regulatory domain, if passed the caller need not worry + * about freeing it + * + * The Wireless subsystem can use this function to hint to the wireless core + * what it believes should be the current regulatory domain by + * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory + * domain should be in or by providing a completely build regulatory domain. + * + * Returns -EALREADY if *a regulatory domain* has already been set. Note that + * this could be by another driver. It is safe for drivers to continue if + * -EALREADY is returned, if drivers are not capable of world roaming they + * should not register more channels than they support. Right now we only + * support listening to the first driver hint. If the driver is capable + * of world roaming but wants to respect its own EEPROM mappings for + * specific regulatory domains it should register the @reg_notifier callback + * on the &struct wiphy. Returns 0 if the hint went through fine or through an + * intersection operation. Otherwise a standard error code is returned. + * + */ +extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, + const char *alpha2, struct ieee80211_regdomain *rd); +/** + * regulatory_hint - driver hint to the wireless core a regulatory domain + * @wiphy: the driver's very own &struct wiphy + * @alpha2: the ISO/IEC 3166 alpha2 the driver claims its regulatory domain + * should be in. If @rd is set this should be NULL. Note that if you + * set this to NULL you should still set rd->alpha2 to some accepted + * alpha2. + * @rd: a complete regulatory domain provided by the driver. If passed + * the driver does not need to worry about freeing it. + * + * Wireless drivers can use this function to hint to the wireless core + * what it believes should be the current regulatory domain by + * giving it an ISO/IEC 3166 alpha2 country code it knows its regulatory + * domain should be in or by providing a completely build regulatory domain. + * If the driver provides an ISO/IEC 3166 alpha2 userspace will be queried + * for a regulatory domain structure for the respective country. If + * a regulatory domain is build and passed you should set the alpha2 + * if possible, otherwise set it to the special value of "99" which tells + * the wireless core it is unknown. If you pass a built regulatory domain + * and we return non zero you are in charge of kfree()'ing the structure. + * + * See __regulatory_hint() documentation for possible return values. + */ +extern int regulatory_hint(struct wiphy *wiphy, + const char *alpha2, struct ieee80211_regdomain *rd); /** * ieee80211_get_channel - get channel struct from wiphy for specified frequency diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 928813ce08e..5a3bdaad6c1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -17,6 +17,13 @@ #include "rate.h" #include "mesh.h" +struct ieee80211_hw *wiphy_to_hw(struct wiphy *wiphy) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + return &local->hw; +} +EXPORT_SYMBOL(wiphy_to_hw); + static enum ieee80211_if_types nl80211_type_to_mac80211_type(enum nl80211_iftype type) { diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 833b024f8f6..b97bd9fe6b7 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -14,6 +14,38 @@ config NL80211 If unsure, say Y. 
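The reg_notifier callback added to struct wiphy above is the hook a driver
should use when it is capable of world roaming but still wants to re-apply
its own EEPROM limits after someone else (the core, the user, or later a
country IE) changes the regulatory domain. A minimal sketch of such a
callback for a mac80211-based driver follows; mydriver_priv and
mydriver_apply_eeprom_limits() are hypothetical names standing in for
whatever private data and channel-limiting code the driver already has:

	/* Hypothetical notifier: re-apply EEPROM-derived limits whenever the
	 * regulatory domain is changed by someone other than this driver.
	 * mydriver_priv and mydriver_apply_eeprom_limits() are assumed
	 * driver-local helpers, not part of the wireless core. */
	static int mydriver_reg_notifier(struct wiphy *wiphy,
					 enum reg_set_by setby)
	{
		struct ieee80211_hw *hw = wiphy_to_hw(wiphy);
		struct mydriver_priv *priv = hw->priv;

		if (setby == REGDOM_SET_BY_DRIVER)
			return 0;	/* our own hint, nothing to adjust */

		mydriver_apply_eeprom_limits(priv);
		return 0;
	}

The callback would be assigned before the wiphy is registered, for example:

	hw->wiphy->reg_notifier = mydriver_reg_notifier;
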
+config WIRELESS_OLD_REGULATORY + bool "Old wireless static regulatory defintions" + default n + ---help--- + This option enables the old static regulatory information + and uses it within the new framework. This is available + temporarily as an option to help prevent immediate issues + due to the switch to the new regulatory framework which + does require a new userspace application which has the + database of regulatory information (CRDA) and another for + setting regulatory domains (iw). + + For more information see: + + http://wireless.kernel.org/en/developers/Regulatory/CRDA + http://wireless.kernel.org/en/users/Documentation/iw + + It is important to note though that if you *do* have CRDA present + and if this option is enabled CRDA *will* be called to update the + regulatory domain (for US and JP only). Support for letting the user + set the regulatory domain through iw is also supported. This option + mainly exists to leave around for a kernel release some old static + regulatory domains that were defined and to keep around the old + ieee80211_regdom module parameter. This is being phased out and you + should stop using them ASAP. + + Say N unless you cannot install a new userspace application + or have one currently depending on the ieee80211_regdom module + parameter and cannot port it to use the new userspace interfaces. + + This is scheduled for removal for 2.6.29. + config WIRELESS_EXT bool "Wireless extensions" default n diff --git a/net/wireless/core.c b/net/wireless/core.c index 7e995ac06a0..a910cd2d0fd 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -13,12 +13,14 @@ #include #include #include +#include #include #include #include #include "nl80211.h" #include "core.h" #include "sysfs.h" +#include "reg.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" @@ -27,6 +29,107 @@ MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); +struct list_head regulatory_requests; + +/* Central wireless core regulatory domains, we only need two, + * the current one and a world regulatory domain in case we have no + * information to give us an alpha2 */ +struct ieee80211_regdomain *cfg80211_regdomain; + +/* We keep a static world regulatory domain in case of the absence of CRDA */ +const struct ieee80211_regdomain world_regdom = { + .n_reg_rules = 1, + .alpha2 = "00", + .reg_rules = { + REG_RULE(2402, 2472, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN | + NL80211_RRF_NO_IBSS), + } +}; + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY +/* All this fucking static junk will be removed soon, so + * don't fucking count on it !@#$ */ + +static char *ieee80211_regdom = "US"; +module_param(ieee80211_regdom, charp, 0444); +MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); + +/* We assume 40 MHz bandwidth for the old regulatory work. 
+ * We make emphasis we are using the exact same frequencies + * as before */ + +const struct ieee80211_regdomain us_regdom = { + .n_reg_rules = 6, + .alpha2 = "US", + .reg_rules = { + /* IEEE 802.11b/g, channels 1..11 */ + REG_RULE(2412-20, 2462+20, 40, 6, 27, 0), + /* IEEE 802.11a, channel 36 */ + REG_RULE(5180-20, 5180+20, 40, 6, 23, 0), + /* IEEE 802.11a, channel 40 */ + REG_RULE(5200-20, 5200+20, 40, 6, 23, 0), + /* IEEE 802.11a, channel 44 */ + REG_RULE(5220-20, 5220+20, 40, 6, 23, 0), + /* IEEE 802.11a, channels 48..64 */ + REG_RULE(5240-20, 5320+20, 40, 6, 23, 0), + /* IEEE 802.11a, channels 149..165, outdoor */ + REG_RULE(5745-20, 5825+20, 40, 6, 30, 0), + } +}; + +const struct ieee80211_regdomain jp_regdom = { + .n_reg_rules = 3, + .alpha2 = "JP", + .reg_rules = { + /* IEEE 802.11b/g, channels 1..14 */ + REG_RULE(2412-20, 2484+20, 40, 6, 20, 0), + /* IEEE 802.11a, channels 34..48 */ + REG_RULE(5170-20, 5240+20, 40, 6, 20, + NL80211_RRF_PASSIVE_SCAN), + /* IEEE 802.11a, channels 52..64 */ + REG_RULE(5260-20, 5320+20, 40, 6, 20, + NL80211_RRF_NO_IBSS | + NL80211_RRF_DFS), + } +}; + +const struct ieee80211_regdomain eu_regdom = { + .n_reg_rules = 6, + /* This alpha2 is bogus, we leave it here just for stupid + * backward compatibility */ + .alpha2 = "EU", + .reg_rules = { + /* IEEE 802.11b/g, channels 1..13 */ + REG_RULE(2412-20, 2472+20, 40, 6, 20, 0), + /* IEEE 802.11a, channel 36 */ + REG_RULE(5180-20, 5180+20, 40, 6, 23, + NL80211_RRF_PASSIVE_SCAN), + /* IEEE 802.11a, channel 40 */ + REG_RULE(5200-20, 5200+20, 40, 6, 23, + NL80211_RRF_PASSIVE_SCAN), + /* IEEE 802.11a, channel 44 */ + REG_RULE(5220-20, 5220+20, 40, 6, 23, + NL80211_RRF_PASSIVE_SCAN), + /* IEEE 802.11a, channels 48..64 */ + REG_RULE(5240-20, 5320+20, 40, 6, 20, + NL80211_RRF_NO_IBSS | + NL80211_RRF_DFS), + /* IEEE 802.11a, channels 100..140 */ + REG_RULE(5500-20, 5700+20, 40, 6, 30, + NL80211_RRF_NO_IBSS | + NL80211_RRF_DFS), + } +}; + +#endif + +struct ieee80211_regdomain *cfg80211_world_regdom = + (struct ieee80211_regdomain *) &world_regdom; + +LIST_HEAD(regulatory_requests); +DEFINE_MUTEX(cfg80211_reg_mutex); + /* RCU might be appropriate here since we usually * only read the list, and that can happen quite * often because we need to do it for each command */ @@ -302,7 +405,9 @@ int wiphy_register(struct wiphy *wiphy) ieee80211_set_bitrate_flags(wiphy); /* set up regulatory info */ - wiphy_update_regulatory(wiphy); + mutex_lock(&cfg80211_reg_mutex); + wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE); + mutex_unlock(&cfg80211_reg_mutex); mutex_lock(&cfg80211_drv_mutex); @@ -409,9 +514,35 @@ static struct notifier_block cfg80211_netdev_notifier = { .notifier_call = cfg80211_netdev_notifier_call, }; +#ifdef CONFIG_WIRELESS_OLD_REGULATORY +const struct ieee80211_regdomain *static_regdom(char *alpha2) +{ + if (alpha2[0] == 'U' && alpha2[1] == 'S') + return &us_regdom; + if (alpha2[0] == 'J' && alpha2[1] == 'P') + return &jp_regdom; + if (alpha2[0] == 'E' && alpha2[1] == 'U') + return &eu_regdom; + /* Default, as per the old rules */ + return &us_regdom; +} +#endif + static int cfg80211_init(void) { - int err = wiphy_sysfs_init(); + int err; + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + cfg80211_regdomain = + (struct ieee80211_regdomain *) static_regdom(ieee80211_regdom); + /* Used during reset_regdomains_static() */ + cfg80211_world_regdom = cfg80211_regdomain; +#else + cfg80211_regdomain = + (struct ieee80211_regdomain *) cfg80211_world_regdom; +#endif + + err = wiphy_sysfs_init(); if (err) goto out_fail_sysfs; @@ 
-425,8 +556,33 @@ static int cfg80211_init(void) ieee80211_debugfs_dir = debugfs_create_dir("ieee80211", NULL); + err = regulatory_init(); + if (err) + goto out_fail_reg; + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + printk(KERN_INFO "cfg80211: Using old static regulatory domain:\n"); + print_regdomain_info(cfg80211_regdomain); + /* The old code still requests for a new regdomain and if + * you have CRDA you get it updated, otherwise you get + * stuck with the static values. We ignore "EU" code as + * that is not a valid ISO / IEC 3166 alpha2 */ + if (ieee80211_regdom[0] != 'E' && + ieee80211_regdom[1] != 'U') + err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, + ieee80211_regdom, NULL); +#else + err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", NULL); + if (err) + printk(KERN_ERR "cfg80211: calling CRDA failed - " + "unable to update world regulatory domain, " + "using static definition\n"); +#endif + return 0; +out_fail_reg: + debugfs_remove(ieee80211_debugfs_dir); out_fail_nl80211: unregister_netdevice_notifier(&cfg80211_netdev_notifier); out_fail_notifier: @@ -434,6 +590,7 @@ out_fail_notifier: out_fail_sysfs: return err; } + subsys_initcall(cfg80211_init); static void cfg80211_exit(void) @@ -442,5 +599,6 @@ static void cfg80211_exit(void) nl80211_exit(); unregister_netdevice_notifier(&cfg80211_netdev_notifier); wiphy_sysfs_exit(); + regulatory_exit(); } module_exit(cfg80211_exit); diff --git a/net/wireless/core.h b/net/wireless/core.h index 7a02c356d63..771cc5cc765 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -79,6 +79,6 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); -void wiphy_update_regulatory(struct wiphy *wiphy); +void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 77880ba8b61..1221d726ed5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -18,6 +18,7 @@ #include #include "core.h" #include "nl80211.h" +#include "reg.h" /* the netlink family */ static struct genl_family nl80211_fam = { @@ -88,6 +89,9 @@ static struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] __read_mostly = { .len = IEEE80211_MAX_MESH_ID_LEN }, [NL80211_ATTR_MPATH_NEXT_HOP] = { .type = NLA_U32 }, + [NL80211_ATTR_REG_ALPHA2] = { .type = NLA_STRING, .len = 2 }, + [NL80211_ATTR_REG_RULES] = { .type = NLA_NESTED }, + [NL80211_ATTR_BSS_CTS_PROT] = { .type = NLA_U8 }, [NL80211_ATTR_BSS_SHORT_PREAMBLE] = { .type = NLA_U8 }, [NL80211_ATTR_BSS_SHORT_SLOT_TIME] = { .type = NLA_U8 }, @@ -1599,6 +1603,141 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) return err; } +static const struct nla_policy + reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { + [NL80211_ATTR_REG_RULE_FLAGS] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_START] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_END] = { .type = NLA_U32 }, + [NL80211_ATTR_FREQ_RANGE_MAX_BW] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN] = { .type = NLA_U32 }, + [NL80211_ATTR_POWER_RULE_MAX_EIRP] = { .type = NLA_U32 }, +}; + +static int parse_reg_rule(struct nlattr *tb[], + struct ieee80211_reg_rule *reg_rule) +{ + struct ieee80211_freq_range *freq_range = ®_rule->freq_range; + struct ieee80211_power_rule *power_rule = ®_rule->power_rule; + + if (!tb[NL80211_ATTR_REG_RULE_FLAGS]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_START]) + return -EINVAL; + if 
(!tb[NL80211_ATTR_FREQ_RANGE_END]) + return -EINVAL; + if (!tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]) + return -EINVAL; + if (!tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]) + return -EINVAL; + + reg_rule->flags = nla_get_u32(tb[NL80211_ATTR_REG_RULE_FLAGS]); + + freq_range->start_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_START]); + freq_range->end_freq_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_END]); + freq_range->max_bandwidth_khz = + nla_get_u32(tb[NL80211_ATTR_FREQ_RANGE_MAX_BW]); + + power_rule->max_eirp = + nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_EIRP]); + + if (tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]) + power_rule->max_antenna_gain = + nla_get_u32(tb[NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN]); + + return 0; +} + +static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) +{ + int r; + char *data = NULL; + + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) + return -EINVAL; + + data = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + /* We ignore world regdom requests with the old regdom setup */ + if (is_world_regdom(data)) + return -EINVAL; +#endif + mutex_lock(&cfg80211_drv_mutex); + r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, NULL); + mutex_unlock(&cfg80211_drv_mutex); + return r; +} + +static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[NL80211_REG_RULE_ATTR_MAX + 1]; + struct nlattr *nl_reg_rule; + char *alpha2 = NULL; + int rem_reg_rules = 0, r = 0; + u32 num_rules = 0, rule_idx = 0, size_of_regd; + struct ieee80211_regdomain *rd = NULL; + + if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_REG_RULES]) + return -EINVAL; + + alpha2 = nla_data(info->attrs[NL80211_ATTR_REG_ALPHA2]); + + nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], + rem_reg_rules) { + num_rules++; + if (num_rules > NL80211_MAX_SUPP_REG_RULES) + goto bad_reg; + } + + if (!reg_is_valid_request(alpha2)) + return -EINVAL; + + size_of_regd = sizeof(struct ieee80211_regdomain) + + (num_rules * sizeof(struct ieee80211_reg_rule)); + + rd = kzalloc(size_of_regd, GFP_KERNEL); + if (!rd) + return -ENOMEM; + + rd->n_reg_rules = num_rules; + rd->alpha2[0] = alpha2[0]; + rd->alpha2[1] = alpha2[1]; + + nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], + rem_reg_rules) { + nla_parse(tb, NL80211_REG_RULE_ATTR_MAX, + nla_data(nl_reg_rule), nla_len(nl_reg_rule), + reg_rule_policy); + r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); + if (r) + goto bad_reg; + + rule_idx++; + + if (rule_idx > NL80211_MAX_SUPP_REG_RULES) + goto bad_reg; + } + + BUG_ON(rule_idx != num_rules); + + mutex_lock(&cfg80211_drv_mutex); + r = set_regdom(rd); + mutex_unlock(&cfg80211_drv_mutex); + if (r) + goto bad_reg; + + return r; + +bad_reg: + kfree(rd); + return -EINVAL; +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -1736,6 +1875,18 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_REG, + .doit = nl80211_set_reg, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NL80211_CMD_REQ_SET_REG, + .doit = nl80211_req_set_reg, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; /* multicast groups */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 855bff4b325..592b2e391d4 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2,179 +2,758 @@ * Copyright 2002-2005, Instant802 Networks, Inc. 
* Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2007 Johannes Berg + * Copyright 2008 Luis R. Rodriguez * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -/* - * This regulatory domain control implementation is highly incomplete, it - * only exists for the purpose of not regressing mac80211. - * - * For now, drivers can restrict the set of allowed channels by either - * not registering those channels or setting the IEEE80211_CHAN_DISABLED - * flag; that flag will only be *set* by this code, never *cleared. +/** + * DOC: Wireless regulatory infrastructure * * The usual implementation is for a driver to read a device EEPROM to * determine which regulatory domain it should be operating under, then * looking up the allowable channels in a driver-local table and finally * registering those channels in the wiphy structure. * - * Alternatively, drivers that trust the regulatory domain control here - * will register a complete set of capabilities and the control code - * will restrict the set by setting the IEEE80211_CHAN_* flags. + * Another set of compliance enforcement is for drivers to use their + * own compliance limits which can be stored on the EEPROM. The host + * driver or firmware may ensure these are used. + * + * In addition to all this we provide an extra layer of regulatory + * conformance. For drivers which do not have any regulatory + * information CRDA provides the complete regulatory solution. + * For others it provides a community effort on further restrictions + * to enhance compliance. + * + * Note: When number of rules --> infinity we will not be able to + * index on alpha2 any more, instead we'll probably have to + * rely on some SHA1 checksum of the regdomain for example. + * */ #include +#include +#include +#include +#include #include +#include #include "core.h" +#include "reg.h" -static char *ieee80211_regdom = "US"; -module_param(ieee80211_regdom, charp, 0444); -MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); +/* To trigger userspace events */ +static struct platform_device *reg_pdev; -struct ieee80211_channel_range { - short start_freq; - short end_freq; - int max_power; - int max_antenna_gain; - u32 flags; +/* Keep the ordering from large to small */ +static u32 supported_bandwidths[] = { + MHZ_TO_KHZ(40), + MHZ_TO_KHZ(20), }; -struct ieee80211_regdomain { - const char *code; - const struct ieee80211_channel_range *ranges; - int n_ranges; -}; +bool is_world_regdom(char *alpha2) +{ + if (!alpha2) + return false; + if (alpha2[0] == '0' && alpha2[1] == '0') + return true; + return false; +} -#define RANGE_PWR(_start, _end, _pwr, _ag, _flags) \ - { _start, _end, _pwr, _ag, _flags } +static bool is_alpha2_set(char *alpha2) +{ + if (!alpha2) + return false; + if (alpha2[0] != 0 && alpha2[1] != 0) + return true; + return false; +} +static bool is_alpha_upper(char letter) +{ + /* ASCII A - Z */ + if (letter >= 65 && letter <= 90) + return true; + return false; +} -/* - * Ideally, in the future, these definitions will be loaded from a - * userspace table via some daemon. 
- */ -static const struct ieee80211_channel_range ieee80211_US_channels[] = { - /* IEEE 802.11b/g, channels 1..11 */ - RANGE_PWR(2412, 2462, 27, 6, 0), - /* IEEE 802.11a, channel 36*/ - RANGE_PWR(5180, 5180, 23, 6, 0), - /* IEEE 802.11a, channel 40*/ - RANGE_PWR(5200, 5200, 23, 6, 0), - /* IEEE 802.11a, channel 44*/ - RANGE_PWR(5220, 5220, 23, 6, 0), - /* IEEE 802.11a, channels 48..64 */ - RANGE_PWR(5240, 5320, 23, 6, 0), - /* IEEE 802.11a, channels 149..165, outdoor */ - RANGE_PWR(5745, 5825, 30, 6, 0), -}; +static bool is_unknown_alpha2(char *alpha2) +{ + if (!alpha2) + return false; + /* Special case where regulatory domain was built by driver + * but a specific alpha2 cannot be determined */ + if (alpha2[0] == '9' && alpha2[1] == '9') + return true; + return false; +} -static const struct ieee80211_channel_range ieee80211_JP_channels[] = { - /* IEEE 802.11b/g, channels 1..14 */ - RANGE_PWR(2412, 2484, 20, 6, 0), - /* IEEE 802.11a, channels 34..48 */ - RANGE_PWR(5170, 5240, 20, 6, IEEE80211_CHAN_PASSIVE_SCAN), - /* IEEE 802.11a, channels 52..64 */ - RANGE_PWR(5260, 5320, 20, 6, IEEE80211_CHAN_NO_IBSS | - IEEE80211_CHAN_RADAR), -}; +static bool is_an_alpha2(char *alpha2) +{ + if (!alpha2) + return false; + if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1])) + return true; + return false; +} -static const struct ieee80211_channel_range ieee80211_EU_channels[] = { - /* IEEE 802.11b/g, channels 1..13 */ - RANGE_PWR(2412, 2472, 20, 6, 0), - /* IEEE 802.11a, channel 36*/ - RANGE_PWR(5180, 5180, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), - /* IEEE 802.11a, channel 40*/ - RANGE_PWR(5200, 5200, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), - /* IEEE 802.11a, channel 44*/ - RANGE_PWR(5220, 5220, 23, 6, IEEE80211_CHAN_PASSIVE_SCAN), - /* IEEE 802.11a, channels 48..64 */ - RANGE_PWR(5240, 5320, 23, 6, IEEE80211_CHAN_NO_IBSS | - IEEE80211_CHAN_RADAR), - /* IEEE 802.11a, channels 100..140 */ - RANGE_PWR(5500, 5700, 30, 6, IEEE80211_CHAN_NO_IBSS | - IEEE80211_CHAN_RADAR), -}; +static bool alpha2_equal(char *alpha2_x, char *alpha2_y) +{ + if (!alpha2_x || !alpha2_y) + return false; + if (alpha2_x[0] == alpha2_y[0] && + alpha2_x[1] == alpha2_y[1]) + return true; + return false; +} + +static bool regdom_changed(char *alpha2) +{ + if (!cfg80211_regdomain) + return true; + if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) + return false; + return true; +} + +/* This lets us keep regulatory code which is updated on a regulatory + * basis in userspace. */ +static int call_crda(const char *alpha2) +{ + char country_env[9 + 2] = "COUNTRY="; + char *envp[] = { + country_env, + NULL + }; + + if (!is_world_regdom((char *) alpha2)) + printk(KERN_INFO "cfg80211: Calling CRDA for country: %c%c\n", + alpha2[0], alpha2[1]); + else +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + return -EINVAL; +#else + printk(KERN_INFO "cfg80211: Calling CRDA to update world " + "regulatory domain\n"); +#endif + + country_env[8] = alpha2[0]; + country_env[9] = alpha2[1]; + + return kobject_uevent_env(®_pdev->dev.kobj, KOBJ_CHANGE, envp); +} + +/* This has the logic which determines when a new request + * should be ignored. 
*/ +static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, + char *alpha2, struct ieee80211_regdomain *rd) +{ + struct regulatory_request *last_request = NULL; -#define REGDOM(_code) \ - { \ - .code = __stringify(_code), \ - .ranges = ieee80211_ ##_code## _channels, \ - .n_ranges = ARRAY_SIZE(ieee80211_ ##_code## _channels), \ + /* All initial requests are respected */ + if (list_empty(®ulatory_requests)) + return 0; + + last_request = list_first_entry(®ulatory_requests, + struct regulatory_request, list); + + switch (set_by) { + case REGDOM_SET_BY_INIT: + return -EINVAL; + case REGDOM_SET_BY_CORE: + /* Always respect new wireless core hints, should only + * come in for updating the world regulatory domain at init + * anyway */ + return 0; + case REGDOM_SET_BY_COUNTRY_IE: + if (last_request->initiator == set_by) { + if (last_request->wiphy != wiphy) { + /* Two cards with two APs claiming different + * different Country IE alpha2s! + * You're special!! */ + if (!alpha2_equal(last_request->alpha2, + cfg80211_regdomain->alpha2)) { + /* XXX: Deal with conflict, consider + * building a new one out of the + * intersection */ + WARN_ON(1); + return -EOPNOTSUPP; + } + return -EALREADY; + } + /* Two consecutive Country IE hints on the same wiphy */ + if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) + return 0; + return -EALREADY; + } + if (WARN_ON(!is_alpha2_set(alpha2) || !is_an_alpha2(alpha2)), + "Invalid Country IE regulatory hint passed " + "to the wireless core\n") + return -EINVAL; + /* We ignore Country IE hints for now, as we haven't yet + * added the dot11MultiDomainCapabilityEnabled flag + * for wiphys */ + return 1; + case REGDOM_SET_BY_DRIVER: + BUG_ON(!wiphy); + if (last_request->initiator == set_by) { + /* Two separate drivers hinting different things, + * this is possible if you have two devices present + * on a system with different EEPROM regulatory + * readings. XXX: Do intersection, we support only + * the first regulatory hint for now */ + if (last_request->wiphy != wiphy) + return -EALREADY; + if (rd) + return -EALREADY; + /* Driver should not be trying to hint different + * regulatory domains! */ + BUG_ON(!alpha2_equal(alpha2, + cfg80211_regdomain->alpha2)); + return -EALREADY; + } + if (last_request->initiator == REGDOM_SET_BY_CORE) + return 0; + /* XXX: Handle intersection, and add the + * dot11MultiDomainCapabilityEnabled flag to wiphy. 
For now + * we assume the driver has this set to false, following the + * 802.11d dot11MultiDomainCapabilityEnabled documentation */ + if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + return 0; + return 0; + case REGDOM_SET_BY_USER: + if (last_request->initiator == set_by || + last_request->initiator == REGDOM_SET_BY_CORE) + return 0; + /* Drivers can use their wiphy's reg_notifier() + * to override any information */ + if (last_request->initiator == REGDOM_SET_BY_DRIVER) + return 0; + /* XXX: Handle intersection */ + if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) + return -EOPNOTSUPP; + return 0; + default: + return -EINVAL; } +} -static const struct ieee80211_regdomain ieee80211_regdoms[] = { - REGDOM(US), - REGDOM(JP), - REGDOM(EU), -}; +static bool __reg_is_valid_request(char *alpha2, + struct regulatory_request **request) +{ + struct regulatory_request *req; + if (list_empty(®ulatory_requests)) + return false; + list_for_each_entry(req, ®ulatory_requests, list) { + if (alpha2_equal(req->alpha2, alpha2)) { + *request = req; + return true; + } + } + return false; +} +/* Used by nl80211 before kmalloc'ing our regulatory domain */ +bool reg_is_valid_request(char *alpha2) +{ + struct regulatory_request *request = NULL; + return __reg_is_valid_request(alpha2, &request); +} -static const struct ieee80211_regdomain *get_regdom(void) +/* Sanity check on a regulatory rule */ +static bool is_valid_reg_rule(struct ieee80211_reg_rule *rule) { - static const struct ieee80211_channel_range - ieee80211_world_channels[] = { - /* IEEE 802.11b/g, channels 1..11 */ - RANGE_PWR(2412, 2462, 27, 6, 0), - }; - static const struct ieee80211_regdomain regdom_world = REGDOM(world); - int i; + struct ieee80211_freq_range *freq_range = &rule->freq_range; + u32 freq_diff; + + if (freq_range->start_freq_khz == 0 || freq_range->end_freq_khz == 0) + return false; + + if (freq_range->start_freq_khz > freq_range->end_freq_khz) + return false; + + freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; + + if (freq_range->max_bandwidth_khz > freq_diff) + return false; + + return true; +} + +static bool is_valid_rd(struct ieee80211_regdomain *rd) +{ + struct ieee80211_reg_rule *reg_rule = NULL; + unsigned int i; - for (i = 0; i < ARRAY_SIZE(ieee80211_regdoms); i++) - if (strcmp(ieee80211_regdom, ieee80211_regdoms[i].code) == 0) - return &ieee80211_regdoms[i]; + if (!rd->n_reg_rules) + return false; - return ®dom_world; + for (i = 0; i < rd->n_reg_rules; i++) { + reg_rule = &rd->reg_rules[i]; + if (!is_valid_reg_rule(reg_rule)) + return false; + } + + return true; } +/* Returns value in KHz */ +static u32 freq_max_bandwidth(const struct ieee80211_freq_range *freq_range, + u32 freq) +{ + unsigned int i; + for (i = 0; i < ARRAY_SIZE(supported_bandwidths); i++) { + u32 start_freq_khz = freq - supported_bandwidths[i]/2; + u32 end_freq_khz = freq + supported_bandwidths[i]/2; + if (start_freq_khz >= freq_range->start_freq_khz && + end_freq_khz <= freq_range->end_freq_khz) + return supported_bandwidths[i]; + } + return 0; +} -static void handle_channel(struct ieee80211_channel *chan, - const struct ieee80211_regdomain *rd) +/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may + * want to just have the channel structure use these */ +static u32 map_regdom_flags(u32 rd_flags) +{ + u32 channel_flags = 0; + if (rd_flags & NL80211_RRF_PASSIVE_SCAN) + channel_flags |= IEEE80211_CHAN_PASSIVE_SCAN; + if (rd_flags & NL80211_RRF_NO_IBSS) + channel_flags |= IEEE80211_CHAN_NO_IBSS; + if 
(rd_flags & NL80211_RRF_DFS) + channel_flags |= IEEE80211_CHAN_RADAR; + return channel_flags; +} + +/** + * freq_reg_info - get regulatory information for the given frequency + * @center_freq: Frequency in KHz for which we want regulatory information for + * @bandwidth: the bandwidth requirement you have in KHz, if you do not have one + * you can set this to 0. If this frequency is allowed we then set + * this value to the maximum allowed bandwidth. + * @reg_rule: the regulatory rule which we have for this frequency + * + * Use this function to get the regulatory rule for a specific frequency. + */ +static int freq_reg_info(u32 center_freq, u32 *bandwidth, + const struct ieee80211_reg_rule **reg_rule) { int i; - u32 flags = chan->orig_flags; - const struct ieee80211_channel_range *rg = NULL; + u32 max_bandwidth = 0; - for (i = 0; i < rd->n_ranges; i++) { - if (rd->ranges[i].start_freq <= chan->center_freq && - chan->center_freq <= rd->ranges[i].end_freq) { - rg = &rd->ranges[i]; + if (!cfg80211_regdomain) + return -EINVAL; + + for (i = 0; i < cfg80211_regdomain->n_reg_rules; i++) { + const struct ieee80211_reg_rule *rr; + const struct ieee80211_freq_range *fr = NULL; + const struct ieee80211_power_rule *pr = NULL; + + rr = &cfg80211_regdomain->reg_rules[i]; + fr = &rr->freq_range; + pr = &rr->power_rule; + max_bandwidth = freq_max_bandwidth(fr, center_freq); + if (max_bandwidth && *bandwidth <= max_bandwidth) { + *reg_rule = rr; + *bandwidth = max_bandwidth; break; } } - if (!rg) { - /* not found */ + return !max_bandwidth; +} + +static void handle_channel(struct ieee80211_channel *chan) +{ + int r; + u32 flags = chan->orig_flags; + u32 max_bandwidth = 0; + const struct ieee80211_reg_rule *reg_rule = NULL; + const struct ieee80211_power_rule *power_rule = NULL; + + r = freq_reg_info(MHZ_TO_KHZ(chan->center_freq), + &max_bandwidth, ®_rule); + + if (r) { flags |= IEEE80211_CHAN_DISABLED; chan->flags = flags; return; } - chan->flags = flags; + power_rule = ®_rule->power_rule; + + chan->flags = flags | map_regdom_flags(reg_rule->flags); chan->max_antenna_gain = min(chan->orig_mag, - rg->max_antenna_gain); + (int) MBI_TO_DBI(power_rule->max_antenna_gain)); + chan->max_bandwidth = KHZ_TO_MHZ(max_bandwidth); if (chan->orig_mpwr) - chan->max_power = min(chan->orig_mpwr, rg->max_power); + chan->max_power = min(chan->orig_mpwr, + (int) MBM_TO_DBM(power_rule->max_eirp)); else - chan->max_power = rg->max_power; + chan->max_power = (int) MBM_TO_DBM(power_rule->max_eirp); } -static void handle_band(struct ieee80211_supported_band *sband, - const struct ieee80211_regdomain *rd) +static void handle_band(struct ieee80211_supported_band *sband) { int i; for (i = 0; i < sband->n_channels; i++) - handle_channel(&sband->channels[i], rd); + handle_channel(&sband->channels[i]); } -void wiphy_update_regulatory(struct wiphy *wiphy) +static void update_all_wiphy_regulatory(enum reg_set_by setby) { - enum ieee80211_band band; - const struct ieee80211_regdomain *rd = get_regdom(); + struct cfg80211_registered_device *drv; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + list_for_each_entry(drv, &cfg80211_drv_list, list) + wiphy_update_regulatory(&drv->wiphy, setby); +} + +void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) +{ + enum ieee80211_band band; + for (band = 0; band < IEEE80211_NUM_BANDS; band++) { if (wiphy->bands[band]) - handle_band(wiphy->bands[band], rd); + handle_band(wiphy->bands[band]); + if (wiphy->reg_notifier) + wiphy->reg_notifier(wiphy, setby); + } +} + +/* Caller must 
hold &cfg80211_drv_mutex */ +int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, + const char *alpha2, struct ieee80211_regdomain *rd) +{ + struct regulatory_request *request; + char *rd_alpha2; + int r = 0; + + r = ignore_request(wiphy, set_by, (char *) alpha2, rd); + if (r) + return r; + + if (rd) + rd_alpha2 = rd->alpha2; + else + rd_alpha2 = (char *) alpha2; + + switch (set_by) { + case REGDOM_SET_BY_CORE: + case REGDOM_SET_BY_COUNTRY_IE: + case REGDOM_SET_BY_DRIVER: + case REGDOM_SET_BY_USER: + request = kzalloc(sizeof(struct regulatory_request), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + request->alpha2[0] = rd_alpha2[0]; + request->alpha2[1] = rd_alpha2[1]; + request->initiator = set_by; + request->wiphy = wiphy; + + list_add_tail(&request->list, ®ulatory_requests); + if (rd) + break; + r = call_crda(alpha2); +#ifndef CONFIG_WIRELESS_OLD_REGULATORY + if (r) + printk(KERN_ERR "cfg80211: Failed calling CRDA\n"); +#endif + break; + default: + r = -ENOTSUPP; + break; + } + + return r; +} + +/* If rd is not NULL and if this call fails the caller must free it */ +int regulatory_hint(struct wiphy *wiphy, const char *alpha2, + struct ieee80211_regdomain *rd) +{ + int r; + BUG_ON(!rd && !alpha2); + + mutex_lock(&cfg80211_drv_mutex); + + r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER, alpha2, rd); + if (r || !rd) + goto unlock_and_exit; + + /* If the driver passed a regulatory domain we skipped asking + * userspace for one so we can now go ahead and set it */ + r = set_regdom(rd); + +unlock_and_exit: + mutex_unlock(&cfg80211_drv_mutex); + return r; +} +EXPORT_SYMBOL(regulatory_hint); + + +static void print_rd_rules(struct ieee80211_regdomain *rd) +{ + unsigned int i; + struct ieee80211_reg_rule *reg_rule = NULL; + struct ieee80211_freq_range *freq_range = NULL; + struct ieee80211_power_rule *power_rule = NULL; + + printk(KERN_INFO "\t(start_freq - end_freq @ bandwidth), " + "(max_antenna_gain, max_eirp)\n"); + + for (i = 0; i < rd->n_reg_rules; i++) { + reg_rule = &rd->reg_rules[i]; + freq_range = ®_rule->freq_range; + power_rule = ®_rule->power_rule; + + /* There may not be documentation for max antenna gain + * in certain regions */ + if (power_rule->max_antenna_gain) + printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), " + "(%d mBi, %d mBm)\n", + freq_range->start_freq_khz, + freq_range->end_freq_khz, + freq_range->max_bandwidth_khz, + power_rule->max_antenna_gain, + power_rule->max_eirp); + else + printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), " + "(N/A, %d mBm)\n", + freq_range->start_freq_khz, + freq_range->end_freq_khz, + freq_range->max_bandwidth_khz, + power_rule->max_eirp); + } +} + +static void print_regdomain(struct ieee80211_regdomain *rd) +{ + + if (is_world_regdom(rd->alpha2)) + printk(KERN_INFO "cfg80211: World regulatory " + "domain updated:\n"); + else { + if (is_unknown_alpha2(rd->alpha2)) + printk(KERN_INFO "cfg80211: Regulatory domain " + "changed to driver built-in settings " + "(unknown country)\n"); + else + printk(KERN_INFO "cfg80211: Regulatory domain " + "changed to country: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); + } + print_rd_rules(rd); +} + +void print_regdomain_info(struct ieee80211_regdomain *rd) +{ + printk(KERN_INFO "cfg80211: Regulatory domain: %c%c\n", + rd->alpha2[0], rd->alpha2[1]); + print_rd_rules(rd); +} + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + +static bool is_old_static_regdom(struct ieee80211_regdomain *rd) +{ + if (rd == &us_regdom || rd == &jp_regdom || rd == &eu_regdom) + return true; + return false; +} + +/* The old 
crap never deals with a world regulatory domain, it only + * deals with the static regulatory domain passed and if possible + * an updated "US" or "JP" regulatory domain. We do however store the + * old static regulatory domain in cfg80211_world_regdom for convenience + * of use here */ +static void reset_regdomains_static(void) +{ + if (!is_old_static_regdom(cfg80211_regdomain)) + kfree(cfg80211_regdomain); + /* This is setting the regdom to the old static regdom */ + cfg80211_regdomain = + (struct ieee80211_regdomain *) cfg80211_world_regdom; +} +#else +static void reset_regdomains(void) +{ + if (cfg80211_world_regdom && cfg80211_world_regdom != &world_regdom) { + if (cfg80211_world_regdom == cfg80211_regdomain) { + kfree(cfg80211_regdomain); + } else { + kfree(cfg80211_world_regdom); + kfree(cfg80211_regdomain); + } + } else if (cfg80211_regdomain && cfg80211_regdomain != &world_regdom) + kfree(cfg80211_regdomain); + + cfg80211_world_regdom = (struct ieee80211_regdomain *) &world_regdom; + cfg80211_regdomain = NULL; +} + +/* Dynamic world regulatory domain requested by the wireless + * core upon initialization */ +static void update_world_regdomain(struct ieee80211_regdomain *rd) +{ + BUG_ON(list_empty(®ulatory_requests)); + + reset_regdomains(); + + cfg80211_world_regdom = rd; + cfg80211_regdomain = rd; +} +#endif + +static int __set_regdom(struct ieee80211_regdomain *rd) +{ + struct regulatory_request *request = NULL; + + /* Some basic sanity checks first */ + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + /* We ignore the world regdom with the old static regdomains setup + * as there is no point to it with satic regulatory definitions :( + * Don't worry this shit will be removed soon... */ + if (is_world_regdom(rd->alpha2)) + return -EINVAL; +#else + if (is_world_regdom(rd->alpha2)) { + if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request))) + return -EINVAL; + update_world_regdomain(rd); + return 0; + } +#endif + + if (!is_alpha2_set(rd->alpha2) && !is_an_alpha2(rd->alpha2) && + !is_unknown_alpha2(rd->alpha2)) + return -EINVAL; + + if (list_empty(®ulatory_requests)) + return -EINVAL; + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + /* Static "US" and "JP" will be overridden, but just once */ + if (!is_old_static_regdom(cfg80211_regdomain) && + !regdom_changed(rd->alpha2)) + return -EINVAL; +#else + if (!regdom_changed(rd->alpha2)) + return -EINVAL; +#endif + + /* Now lets set the regulatory domain, update all driver channels + * and finally inform them of what we have done, in case they want + * to review or adjust their own settings based on their own + * internal EEPROM data */ + + if (WARN_ON(!__reg_is_valid_request(rd->alpha2, &request))) + return -EINVAL; + +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + reset_regdomains_static(); +#else + reset_regdomains(); +#endif + + /* Country IE parsing coming soon */ + switch (request->initiator) { + case REGDOM_SET_BY_CORE: + case REGDOM_SET_BY_DRIVER: + case REGDOM_SET_BY_USER: + if (!is_valid_rd(rd)) { + printk(KERN_ERR "cfg80211: Invalid " + "regulatory domain detected:\n"); + print_regdomain_info(rd); + return -EINVAL; + } + break; + case REGDOM_SET_BY_COUNTRY_IE: /* Not yet */ + WARN_ON(1); + default: + return -EOPNOTSUPP; + } + + /* Tada! */ + cfg80211_regdomain = rd; + request->granted = 1; + + return 0; +} + + +/* Use this call to set the current regulatory domain. Conflicts with + * multiple drivers can be ironed out later. Caller must've already + * kmalloc'd the rd structure. If this calls fails you should kfree() + * the passed rd. 
Caller must hold cfg80211_drv_mutex */ +int set_regdom(struct ieee80211_regdomain *rd) +{ + struct regulatory_request *this_request = NULL, *prev_request = NULL; + int r; + + if (!list_empty(®ulatory_requests)) + prev_request = list_first_entry(®ulatory_requests, + struct regulatory_request, list); + + /* Note that this doesn't update the wiphys, this is done below */ + r = __set_regdom(rd); + if (r) + return r; + + BUG_ON((!__reg_is_valid_request(rd->alpha2, &this_request))); + + /* The initial standard core update of the world regulatory domain, no + * need to keep that request info around if it didn't fail. */ + if (is_world_regdom(rd->alpha2) && + this_request->initiator == REGDOM_SET_BY_CORE && + this_request->granted) { + list_del(&this_request->list); + kfree(this_request); + this_request = NULL; + } + + /* Remove old requests, we only leave behind the last one */ + if (prev_request) { + list_del(&prev_request->list); + kfree(prev_request); + prev_request = NULL; + } + + /* This would make this whole thing pointless */ + BUG_ON(rd != cfg80211_regdomain); + + /* update all wiphys now with the new established regulatory domain */ + update_all_wiphy_regulatory(this_request->initiator); + + print_regdomain(rd); + + return r; +} + +int regulatory_init(void) +{ + reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); + if (IS_ERR(reg_pdev)) + return PTR_ERR(reg_pdev); + return 0; +} + +void regulatory_exit(void) +{ + struct regulatory_request *req, *req_tmp; + mutex_lock(&cfg80211_drv_mutex); +#ifdef CONFIG_WIRELESS_OLD_REGULATORY + reset_regdomains_static(); +#else + reset_regdomains(); +#endif + list_for_each_entry_safe(req, req_tmp, ®ulatory_requests, list) { + list_del(&req->list); + kfree(req); + } + platform_device_unregister(reg_pdev); + mutex_unlock(&cfg80211_drv_mutex); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h new file mode 100644 index 00000000000..d75fd023297 --- /dev/null +++ b/net/wireless/reg.h @@ -0,0 +1,44 @@ +#ifndef __NET_WIRELESS_REG_H +#define __NET_WIRELESS_REG_H + +extern const struct ieee80211_regdomain world_regdom; +#ifdef CONFIG_WIRELESS_OLD_REGULATORY +extern const struct ieee80211_regdomain us_regdom; +extern const struct ieee80211_regdomain jp_regdom; +extern const struct ieee80211_regdomain eu_regdom; +#endif + +extern struct ieee80211_regdomain *cfg80211_regdomain; +extern struct ieee80211_regdomain *cfg80211_world_regdom; +extern struct list_head regulatory_requests; + +struct regdom_last_setby { + struct wiphy *wiphy; + u8 initiator; +}; + +/* wiphy is set if this request's initiator is REGDOM_SET_BY_DRIVER */ +struct regulatory_request { + struct list_head list; + struct wiphy *wiphy; + int granted; + enum reg_set_by initiator; + char alpha2[2]; +}; + +bool is_world_regdom(char *alpha2); +bool reg_is_valid_request(char *alpha2); + +int set_regdom(struct ieee80211_regdomain *rd); +int __regulatory_hint_alpha2(struct wiphy *wiphy, enum reg_set_by set_by, + const char *alpha2); + +int regulatory_init(void); +void regulatory_exit(void); + +void print_regdomain_info(struct ieee80211_regdomain *); + +/* If a char is A-Z */ +#define IS_ALPHA(letter) (letter >= 65 && letter <= 90) + +#endif /* __NET_WIRELESS_REG_H */ -- cgit v1.2.3-70-g09d2 From bed7aac9416f50425d2200df32bcc9bf248ff8cb Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Tue, 26 Aug 2008 11:58:01 -0300 Subject: rfkill: remove transmitter blocking on suspend Currently, rfkill would stand in the way of properly supporting wireless devices that are capable of 
waking the system up from sleep or hibernation when they receive a special wireless message. It would also get in the way of mesh devices that need to remain operational even during platform suspend. To avoid that, stop trying to block the transmitters on the rfkill class suspend handler. Drivers that need rfkill's older behaviour will have to implement it by themselves in their own suspend handling. Do note that rfkill *will* attempt to restore the transmitter state on resume in any situation. This happens after the driver's resume method is called by the suspend core (class devices resume after the devices they are attached to have been resumed). The following drivers need to check if they need to explicitly block their transmitters in their own suspend handlers (maintainers Cc'd): arch/arm/mach-pxa/tosa-bt.c drivers/net/usb/hso.c drivers/net/wireless/rt2x00/* (USB might need it?) drivers/net/wireless/b43/ (SSB over USB might need it?) drivers/misc/hp-wmi.c eeepc-laptop w/rfkill support (not in mainline yet) Compal laptop w/rfkill support (not in mainline yet) toshiba-acpi w/rfkill support (not in mainline yet) Signed-off-by: Henrique de Moraes Holschuh Cc: Ivo van Doorn Cc: Matthew Garrett Cc: Andrew Bird Cc: Greg Kroah-Hartman Cc: Cezary Jackiewicz Cc: Philip Langdale Signed-off-by: John W. Linville --- Documentation/rfkill.txt | 32 ++++++++++++++++++++++++++++---- net/rfkill/rfkill.c | 16 ++-------------- 2 files changed, 30 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt index 6fcb3060dec..b65f0799df4 100644 --- a/Documentation/rfkill.txt +++ b/Documentation/rfkill.txt @@ -341,6 +341,8 @@ key that does nothing by itself, as well as any hot key that is type-specific 3.1 Guidelines for wireless device drivers ------------------------------------------ +(in this text, rfkill->foo means the foo field of struct rfkill). + 1. Each independent transmitter in a wireless device (usually there is only one transmitter per device) should have a SINGLE rfkill class attached to it. @@ -363,10 +365,32 @@ This rule exists because users of the rfkill subsystem expect to get (and set, when possible) the overall transmitter rfkill state, not of a particular rfkill line. -5. During suspend, the rfkill class will attempt to soft-block the radio -through a call to rfkill->toggle_radio, and will try to restore its previous -state during resume. After a rfkill class is suspended, it will *not* call -rfkill->toggle_radio until it is resumed. +5. The wireless device driver MUST NOT leave the transmitter enabled during +suspend and hibernation unless: + + 5.1. The transmitter has to be enabled for some sort of functionality + like wake-on-wireless-packet or autonomous packed forwarding in a mesh + network, and that functionality is enabled for this suspend/hibernation + cycle. + +AND + + 5.2. The device was not on a user-requested BLOCKED state before + the suspend (i.e. the driver must NOT unblock a device, not even + to support wake-on-wireless-packet or remain in the mesh). + +In other words, there is absolutely no allowed scenario where a driver can +automatically take action to unblock a rfkill controller (obviously, this deals +with scenarios where soft-blocking or both soft and hard blocking is happening. +Scenarios where hardware rfkill lines are the only ones blocking the +transmitter are outside of this rule, since the wireless device driver does not +control its input hardware rfkill lines in the first place). + +6. 
During resume, rfkill will try to restore its previous state. + +7. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio +until it is resumed. + Example of a WLAN wireless driver connected to the rfkill subsystem: -------------------------------------------------------------------- diff --git a/net/rfkill/rfkill.c b/net/rfkill/rfkill.c index d5735799ccd..ea0dc04b3c7 100644 --- a/net/rfkill/rfkill.c +++ b/net/rfkill/rfkill.c @@ -512,21 +512,9 @@ static void rfkill_release(struct device *dev) #ifdef CONFIG_PM static int rfkill_suspend(struct device *dev, pm_message_t state) { - struct rfkill *rfkill = to_rfkill(dev); - - if (dev->power.power_state.event != state.event) { - if (state.event & PM_EVENT_SLEEP) { - /* Stop transmitter, keep state, no notifies */ - update_rfkill_state(rfkill); - - mutex_lock(&rfkill->mutex); - rfkill->toggle_radio(rfkill->data, - RFKILL_STATE_SOFT_BLOCKED); - mutex_unlock(&rfkill->mutex); - } - + /* mark class device as suspended */ + if (dev->power.power_state.event != state.event) dev->power.power_state = state; - } return 0; } -- cgit v1.2.3-70-g09d2 From 62c1f95e3993480ae451c322588f7cbf5a58db28 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Sep 2008 10:18:44 +0200 Subject: mac80211: clean up kdoc A few errors sneaked in over time, some functions no longer exist, for some alternatives exist. This changes the docbook template to include the right things. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- Documentation/DocBook/mac80211.tmpl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'Documentation') diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl index b651e0a4b1c..77c3c202991 100644 --- a/Documentation/DocBook/mac80211.tmpl +++ b/Documentation/DocBook/mac80211.tmpl @@ -145,7 +145,6 @@ usage should require reading the full document. this though and the recommendation to allow only a single interface in STA mode at first! -!Finclude/net/mac80211.h ieee80211_if_types !Finclude/net/mac80211.h ieee80211_if_init_conf !Finclude/net/mac80211.h ieee80211_if_conf @@ -177,8 +176,7 @@ usage should require reading the full document. functions/definitions !Finclude/net/mac80211.h ieee80211_rx_status !Finclude/net/mac80211.h mac80211_rx_flags -!Finclude/net/mac80211.h ieee80211_tx_control -!Finclude/net/mac80211.h ieee80211_tx_status_flags +!Finclude/net/mac80211.h ieee80211_tx_info !Finclude/net/mac80211.h ieee80211_rx !Finclude/net/mac80211.h ieee80211_rx_irqsafe !Finclude/net/mac80211.h ieee80211_tx_status @@ -189,12 +187,11 @@ usage should require reading the full document. !Finclude/net/mac80211.h ieee80211_ctstoself_duration !Finclude/net/mac80211.h ieee80211_generic_frame_duration !Finclude/net/mac80211.h ieee80211_get_hdrlen_from_skb -!Finclude/net/mac80211.h ieee80211_get_hdrlen +!Finclude/net/mac80211.h ieee80211_hdrlen !Finclude/net/mac80211.h ieee80211_wake_queue !Finclude/net/mac80211.h ieee80211_stop_queue -!Finclude/net/mac80211.h ieee80211_start_queues -!Finclude/net/mac80211.h ieee80211_stop_queues !Finclude/net/mac80211.h ieee80211_wake_queues +!Finclude/net/mac80211.h ieee80211_stop_queues @@ -230,8 +227,7 @@ usage should require reading the full document. 
Multiple queues and QoS support TBD !Finclude/net/mac80211.h ieee80211_tx_queue_params -!Finclude/net/mac80211.h ieee80211_tx_queue_stats_data -!Finclude/net/mac80211.h ieee80211_tx_queue +!Finclude/net/mac80211.h ieee80211_tx_queue_stats -- cgit v1.2.3-70-g09d2 From c4e84bde1d595d857d3c74b49b9c45cc770df792 Mon Sep 17 00:00:00 2001 From: Ron Mercer Date: Thu, 18 Sep 2008 11:56:28 -0400 Subject: qlge: New Qlogic 10Gb Ethernet Driver. Signed-off-by: Ron Mercer Signed-off-by: Jeff Garzik --- Documentation/networking/LICENSE.qlge | 46 + MAINTAINERS | 7 + drivers/net/Kconfig | 9 + drivers/net/Makefile | 1 + drivers/net/qlge/Makefile | 7 + drivers/net/qlge/qlge.h | 1593 +++++++++++++ drivers/net/qlge/qlge_dbg.c | 858 +++++++ drivers/net/qlge/qlge_ethtool.c | 415 ++++ drivers/net/qlge/qlge_main.c | 3954 +++++++++++++++++++++++++++++++++ drivers/net/qlge/qlge_mpi.c | 150 ++ 10 files changed, 7040 insertions(+) create mode 100644 Documentation/networking/LICENSE.qlge create mode 100644 drivers/net/qlge/Makefile create mode 100644 drivers/net/qlge/qlge.h create mode 100644 drivers/net/qlge/qlge_dbg.c create mode 100644 drivers/net/qlge/qlge_ethtool.c create mode 100644 drivers/net/qlge/qlge_main.c create mode 100644 drivers/net/qlge/qlge_mpi.c (limited to 'Documentation') diff --git a/Documentation/networking/LICENSE.qlge b/Documentation/networking/LICENSE.qlge new file mode 100644 index 00000000000..123b6edd7f1 --- /dev/null +++ b/Documentation/networking/LICENSE.qlge @@ -0,0 +1,46 @@ +Copyright (c) 2003-2008 QLogic Corporation +QLogic Linux Networking HBA Driver + +This program includes a device driver for Linux 2.6 that may be +distributed with QLogic hardware specific firmware binary file. +You may modify and redistribute the device driver code under the +GNU General Public License as published by the Free Software +Foundation (version 2 or a later version). + +You may redistribute the hardware specific firmware binary file +under the following terms: + + 1. Redistribution of source code (only if applicable), + must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistribution in binary form must reproduce the above + copyright notice, this list of conditions and the + following disclaimer in the documentation and/or other + materials provided with the distribution. + + 3. The name of QLogic Corporation may not be used to + endorse or promote products derived from this software + without specific prior written permission + +REGARDLESS OF WHAT LICENSING MECHANISM IS USED OR APPLICABLE, +THIS PROGRAM IS PROVIDED BY QLOGIC CORPORATION "AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +USER ACKNOWLEDGES AND AGREES THAT USE OF THIS PROGRAM WILL NOT +CREATE OR GIVE GROUNDS FOR A LICENSE BY IMPLICATION, ESTOPPEL, OR +OTHERWISE IN ANY INTELLECTUAL PROPERTY RIGHTS (PATENT, COPYRIGHT, +TRADE SECRET, MASK WORK, OR OTHER PROPRIETARY RIGHT) EMBODIED IN +ANY OTHER QLOGIC HARDWARE OR SOFTWARE EITHER SOLELY OR IN +COMBINATION WITH THIS PROGRAM. + diff --git a/MAINTAINERS b/MAINTAINERS index c8203d780f0..106684e45e1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3396,6 +3396,13 @@ M: linux-driver@qlogic.com L: netdev@vger.kernel.org S: Supported +QLOGIC QLGE 10Gb ETHERNET DRIVER +P: Ron Mercer +M: linux-driver@qlogic.com +M: ron.mercer@qlogic.com +L: netdev@vger.kernel.org +S: Supported + QNX4 FILESYSTEM P: Anders Larsen M: al@alarsen.net diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 069755af761..69c81da48eb 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2526,6 +2526,15 @@ config BNX2X To compile this driver as a module, choose M here: the module will be called bnx2x. This is recommended. +config QLGE + tristate "QLogic QLGE 10Gb Ethernet Driver Support" + depends on PCI + help + This driver supports QLogic ISP8XXX 10Gb Ethernet cards. + + To compile this driver as a module, choose M here: the module + will be called qlge. + source "drivers/net/sfc/Kconfig" endif # NETDEV_10000 diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 016e23f000e..fa2510b2e60 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -131,6 +131,7 @@ obj-$(CONFIG_AX88796) += ax88796.o obj-$(CONFIG_TSI108_ETH) += tsi108_eth.o obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o obj-$(CONFIG_QLA3XXX) += qla3xxx.o +obj-$(CONFIG_QLGE) += qlge/ obj-$(CONFIG_PPP) += ppp_generic.o obj-$(CONFIG_PPP_ASYNC) += ppp_async.o diff --git a/drivers/net/qlge/Makefile b/drivers/net/qlge/Makefile new file mode 100644 index 00000000000..8a197658d76 --- /dev/null +++ b/drivers/net/qlge/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Qlogic 10GbE PCI Express ethernet driver +# + +obj-$(CONFIG_QLGE) += qlge.o + +qlge-objs := qlge_main.o qlge_dbg.o qlge_mpi.o qlge_ethtool.o diff --git a/drivers/net/qlge/qlge.h b/drivers/net/qlge/qlge.h new file mode 100644 index 00000000000..c37ea436c91 --- /dev/null +++ b/drivers/net/qlge/qlge.h @@ -0,0 +1,1593 @@ +/* + * QLogic QLA41xx NIC HBA Driver + * Copyright (c) 2003-2006 QLogic Corporation + * + * See LICENSE.qlge for copyright and licensing details. + */ +#ifndef _QLGE_H_ +#define _QLGE_H_ + +#include +#include + +/* + * General definitions... + */ +#define DRV_NAME "qlge" +#define DRV_STRING "QLogic 10 Gigabit PCI-E Ethernet Driver " +#define DRV_VERSION "v1.00.00-b3" + +#define PFX "qlge: " +#define QPRINTK(qdev, nlevel, klevel, fmt, args...) 
\ + do { \ + if (!((qdev)->msg_enable & NETIF_MSG_##nlevel)) \ + ; \ + else \ + dev_printk(KERN_##klevel, &((qdev)->pdev->dev), \ + "%s: " fmt, __func__, ##args); \ + } while (0) + +#define QLGE_VENDOR_ID 0x1077 +#define QLGE_DEVICE_ID1 0x8012 +#define QLGE_DEVICE_ID 0x8000 + +#define MAX_RX_RINGS 128 +#define MAX_TX_RINGS 128 + +#define NUM_TX_RING_ENTRIES 256 +#define NUM_RX_RING_ENTRIES 256 + +#define NUM_SMALL_BUFFERS 512 +#define NUM_LARGE_BUFFERS 512 + +#define SMALL_BUFFER_SIZE 256 +#define LARGE_BUFFER_SIZE PAGE_SIZE +#define MAX_SPLIT_SIZE 1023 +#define QLGE_SB_PAD 32 + +#define DFLT_COALESCE_WAIT 100 /* 100 usec wait for coalescing */ +#define MAX_INTER_FRAME_WAIT 10 /* 10 usec max interframe-wait for coalescing */ +#define DFLT_INTER_FRAME_WAIT (MAX_INTER_FRAME_WAIT/2) +#define UDELAY_COUNT 3 +#define UDELAY_DELAY 10 + + +#define TX_DESC_PER_IOCB 8 +/* The maximum number of frags we handle is based + * on PAGE_SIZE... + */ +#if (PAGE_SHIFT == 12) || (PAGE_SHIFT == 13) /* 4k & 8k pages */ +#define TX_DESC_PER_OAL ((MAX_SKB_FRAGS - TX_DESC_PER_IOCB) + 2) +#elif (PAGE_SHIFT == 16) /* 64k pages */ +#define TX_DESC_PER_OAL 0 +#endif + +#define DB_PAGE_SIZE 4096 + +/* + * Processor Address Register (PROC_ADDR) bit definitions. + */ +enum { + + /* Misc. stuff */ + MAILBOX_COUNT = 16, + + PROC_ADDR_RDY = (1 << 31), + PROC_ADDR_R = (1 << 30), + PROC_ADDR_ERR = (1 << 29), + PROC_ADDR_DA = (1 << 28), + PROC_ADDR_FUNC0_MBI = 0x00001180, + PROC_ADDR_FUNC0_MBO = (PROC_ADDR_FUNC0_MBI + MAILBOX_COUNT), + PROC_ADDR_FUNC0_CTL = 0x000011a1, + PROC_ADDR_FUNC2_MBI = 0x00001280, + PROC_ADDR_FUNC2_MBO = (PROC_ADDR_FUNC2_MBI + MAILBOX_COUNT), + PROC_ADDR_FUNC2_CTL = 0x000012a1, + PROC_ADDR_MPI_RISC = 0x00000000, + PROC_ADDR_MDE = 0x00010000, + PROC_ADDR_REGBLOCK = 0x00020000, + PROC_ADDR_RISC_REG = 0x00030000, +}; + +/* + * System Register (SYS) bit definitions. + */ +enum { + SYS_EFE = (1 << 0), + SYS_FAE = (1 << 1), + SYS_MDC = (1 << 2), + SYS_DST = (1 << 3), + SYS_DWC = (1 << 4), + SYS_EVW = (1 << 5), + SYS_OMP_DLY_MASK = 0x3f000000, + /* + * There are no values defined as of edit #15. + */ + SYS_ODI = (1 << 14), +}; + +/* + * Reset/Failover Register (RST_FO) bit definitions. + */ +enum { + RST_FO_TFO = (1 << 0), + RST_FO_RR_MASK = 0x00060000, + RST_FO_RR_CQ_CAM = 0x00000000, + RST_FO_RR_DROP = 0x00000001, + RST_FO_RR_DQ = 0x00000002, + RST_FO_RR_RCV_FUNC_CQ = 0x00000003, + RST_FO_FRB = (1 << 12), + RST_FO_MOP = (1 << 13), + RST_FO_REG = (1 << 14), + RST_FO_FR = (1 << 15), +}; + +/* + * Function Specific Control Register (FSC) bit definitions. + */ +enum { + FSC_DBRST_MASK = 0x00070000, + FSC_DBRST_256 = 0x00000000, + FSC_DBRST_512 = 0x00000001, + FSC_DBRST_768 = 0x00000002, + FSC_DBRST_1024 = 0x00000003, + FSC_DBL_MASK = 0x00180000, + FSC_DBL_DBRST = 0x00000000, + FSC_DBL_MAX_PLD = 0x00000008, + FSC_DBL_MAX_BRST = 0x00000010, + FSC_DBL_128_BYTES = 0x00000018, + FSC_EC = (1 << 5), + FSC_EPC_MASK = 0x00c00000, + FSC_EPC_INBOUND = (1 << 6), + FSC_EPC_OUTBOUND = (1 << 7), + FSC_VM_PAGESIZE_MASK = 0x07000000, + FSC_VM_PAGE_2K = 0x00000100, + FSC_VM_PAGE_4K = 0x00000200, + FSC_VM_PAGE_8K = 0x00000300, + FSC_VM_PAGE_64K = 0x00000600, + FSC_SH = (1 << 11), + FSC_DSB = (1 << 12), + FSC_STE = (1 << 13), + FSC_FE = (1 << 15), +}; + +/* + * Host Command Status Register (CSR) bit definitions. + */ +enum { + CSR_ERR_STS_MASK = 0x0000003f, + /* + * There are no valued defined as of edit #15. 
+ */ + CSR_RR = (1 << 8), + CSR_HRI = (1 << 9), + CSR_RP = (1 << 10), + CSR_CMD_PARM_SHIFT = 22, + CSR_CMD_NOP = 0x00000000, + CSR_CMD_SET_RST = 0x1000000, + CSR_CMD_CLR_RST = 0x20000000, + CSR_CMD_SET_PAUSE = 0x30000000, + CSR_CMD_CLR_PAUSE = 0x40000000, + CSR_CMD_SET_H2R_INT = 0x50000000, + CSR_CMD_CLR_H2R_INT = 0x60000000, + CSR_CMD_PAR_EN = 0x70000000, + CSR_CMD_SET_BAD_PAR = 0x80000000, + CSR_CMD_CLR_BAD_PAR = 0x90000000, + CSR_CMD_CLR_R2PCI_INT = 0xa0000000, +}; + +/* + * Configuration Register (CFG) bit definitions. + */ +enum { + CFG_LRQ = (1 << 0), + CFG_DRQ = (1 << 1), + CFG_LR = (1 << 2), + CFG_DR = (1 << 3), + CFG_LE = (1 << 5), + CFG_LCQ = (1 << 6), + CFG_DCQ = (1 << 7), + CFG_Q_SHIFT = 8, + CFG_Q_MASK = 0x7f000000, +}; + +/* + * Status Register (STS) bit definitions. + */ +enum { + STS_FE = (1 << 0), + STS_PI = (1 << 1), + STS_PL0 = (1 << 2), + STS_PL1 = (1 << 3), + STS_PI0 = (1 << 4), + STS_PI1 = (1 << 5), + STS_FUNC_ID_MASK = 0x000000c0, + STS_FUNC_ID_SHIFT = 6, + STS_F0E = (1 << 8), + STS_F1E = (1 << 9), + STS_F2E = (1 << 10), + STS_F3E = (1 << 11), + STS_NFE = (1 << 12), +}; + +/* + * Interrupt Enable Register (INTR_EN) bit definitions. + */ +enum { + INTR_EN_INTR_MASK = 0x007f0000, + INTR_EN_TYPE_MASK = 0x03000000, + INTR_EN_TYPE_ENABLE = 0x00000100, + INTR_EN_TYPE_DISABLE = 0x00000200, + INTR_EN_TYPE_READ = 0x00000300, + INTR_EN_IHD = (1 << 13), + INTR_EN_IHD_MASK = (INTR_EN_IHD << 16), + INTR_EN_EI = (1 << 14), + INTR_EN_EN = (1 << 15), +}; + +/* + * Interrupt Mask Register (INTR_MASK) bit definitions. + */ +enum { + INTR_MASK_PI = (1 << 0), + INTR_MASK_HL0 = (1 << 1), + INTR_MASK_LH0 = (1 << 2), + INTR_MASK_HL1 = (1 << 3), + INTR_MASK_LH1 = (1 << 4), + INTR_MASK_SE = (1 << 5), + INTR_MASK_LSC = (1 << 6), + INTR_MASK_MC = (1 << 7), + INTR_MASK_LINK_IRQS = INTR_MASK_LSC | INTR_MASK_SE | INTR_MASK_MC, +}; + +/* + * Register (REV_ID) bit definitions. + */ +enum { + REV_ID_MASK = 0x0000000f, + REV_ID_NICROLL_SHIFT = 0, + REV_ID_NICREV_SHIFT = 4, + REV_ID_XGROLL_SHIFT = 8, + REV_ID_XGREV_SHIFT = 12, + REV_ID_CHIPREV_SHIFT = 28, +}; + +/* + * Force ECC Error Register (FRC_ECC_ERR) bit definitions. + */ +enum { + FRC_ECC_ERR_VW = (1 << 12), + FRC_ECC_ERR_VB = (1 << 13), + FRC_ECC_ERR_NI = (1 << 14), + FRC_ECC_ERR_NO = (1 << 15), + FRC_ECC_PFE_SHIFT = 16, + FRC_ECC_ERR_DO = (1 << 18), + FRC_ECC_P14 = (1 << 19), +}; + +/* + * Error Status Register (ERR_STS) bit definitions. + */ +enum { + ERR_STS_NOF = (1 << 0), + ERR_STS_NIF = (1 << 1), + ERR_STS_DRP = (1 << 2), + ERR_STS_XGP = (1 << 3), + ERR_STS_FOU = (1 << 4), + ERR_STS_FOC = (1 << 5), + ERR_STS_FOF = (1 << 6), + ERR_STS_FIU = (1 << 7), + ERR_STS_FIC = (1 << 8), + ERR_STS_FIF = (1 << 9), + ERR_STS_MOF = (1 << 10), + ERR_STS_TA = (1 << 11), + ERR_STS_MA = (1 << 12), + ERR_STS_MPE = (1 << 13), + ERR_STS_SCE = (1 << 14), + ERR_STS_STE = (1 << 15), + ERR_STS_FOW = (1 << 16), + ERR_STS_UE = (1 << 17), + ERR_STS_MCH = (1 << 26), + ERR_STS_LOC_SHIFT = 27, +}; + +/* + * RAM Debug Address Register (RAM_DBG_ADDR) bit definitions. + */ +enum { + RAM_DBG_ADDR_FW = (1 << 30), + RAM_DBG_ADDR_FR = (1 << 31), +}; + +/* + * Semaphore Register (SEM) bit definitions. 
+ */ +enum { + /* + * Example: + * reg = SEM_XGMAC0_MASK | (SEM_SET << SEM_XGMAC0_SHIFT) + */ + SEM_CLEAR = 0, + SEM_SET = 1, + SEM_FORCE = 3, + SEM_XGMAC0_SHIFT = 0, + SEM_XGMAC1_SHIFT = 2, + SEM_ICB_SHIFT = 4, + SEM_MAC_ADDR_SHIFT = 6, + SEM_FLASH_SHIFT = 8, + SEM_PROBE_SHIFT = 10, + SEM_RT_IDX_SHIFT = 12, + SEM_PROC_REG_SHIFT = 14, + SEM_XGMAC0_MASK = 0x00030000, + SEM_XGMAC1_MASK = 0x000c0000, + SEM_ICB_MASK = 0x00300000, + SEM_MAC_ADDR_MASK = 0x00c00000, + SEM_FLASH_MASK = 0x03000000, + SEM_PROBE_MASK = 0x0c000000, + SEM_RT_IDX_MASK = 0x30000000, + SEM_PROC_REG_MASK = 0xc0000000, +}; + +/* + * 10G MAC Address Register (XGMAC_ADDR) bit definitions. + */ +enum { + XGMAC_ADDR_RDY = (1 << 31), + XGMAC_ADDR_R = (1 << 30), + XGMAC_ADDR_XME = (1 << 29), + + /* XGMAC control registers */ + PAUSE_SRC_LO = 0x00000100, + PAUSE_SRC_HI = 0x00000104, + GLOBAL_CFG = 0x00000108, + GLOBAL_CFG_RESET = (1 << 0), + GLOBAL_CFG_JUMBO = (1 << 6), + GLOBAL_CFG_TX_STAT_EN = (1 << 10), + GLOBAL_CFG_RX_STAT_EN = (1 << 11), + TX_CFG = 0x0000010c, + TX_CFG_RESET = (1 << 0), + TX_CFG_EN = (1 << 1), + TX_CFG_PREAM = (1 << 2), + RX_CFG = 0x00000110, + RX_CFG_RESET = (1 << 0), + RX_CFG_EN = (1 << 1), + RX_CFG_PREAM = (1 << 2), + FLOW_CTL = 0x0000011c, + PAUSE_OPCODE = 0x00000120, + PAUSE_TIMER = 0x00000124, + PAUSE_FRM_DEST_LO = 0x00000128, + PAUSE_FRM_DEST_HI = 0x0000012c, + MAC_TX_PARAMS = 0x00000134, + MAC_TX_PARAMS_JUMBO = (1 << 31), + MAC_TX_PARAMS_SIZE_SHIFT = 16, + MAC_RX_PARAMS = 0x00000138, + MAC_SYS_INT = 0x00000144, + MAC_SYS_INT_MASK = 0x00000148, + MAC_MGMT_INT = 0x0000014c, + MAC_MGMT_IN_MASK = 0x00000150, + EXT_ARB_MODE = 0x000001fc, + + /* XGMAC TX statistics registers */ + TX_PKTS = 0x00000200, + TX_BYTES = 0x00000208, + TX_MCAST_PKTS = 0x00000210, + TX_BCAST_PKTS = 0x00000218, + TX_UCAST_PKTS = 0x00000220, + TX_CTL_PKTS = 0x00000228, + TX_PAUSE_PKTS = 0x00000230, + TX_64_PKT = 0x00000238, + TX_65_TO_127_PKT = 0x00000240, + TX_128_TO_255_PKT = 0x00000248, + TX_256_511_PKT = 0x00000250, + TX_512_TO_1023_PKT = 0x00000258, + TX_1024_TO_1518_PKT = 0x00000260, + TX_1519_TO_MAX_PKT = 0x00000268, + TX_UNDERSIZE_PKT = 0x00000270, + TX_OVERSIZE_PKT = 0x00000278, + + /* XGMAC statistics control registers */ + RX_HALF_FULL_DET = 0x000002a0, + TX_HALF_FULL_DET = 0x000002a4, + RX_OVERFLOW_DET = 0x000002a8, + TX_OVERFLOW_DET = 0x000002ac, + RX_HALF_FULL_MASK = 0x000002b0, + TX_HALF_FULL_MASK = 0x000002b4, + RX_OVERFLOW_MASK = 0x000002b8, + TX_OVERFLOW_MASK = 0x000002bc, + STAT_CNT_CTL = 0x000002c0, + STAT_CNT_CTL_CLEAR_TX = (1 << 0), + STAT_CNT_CTL_CLEAR_RX = (1 << 1), + AUX_RX_HALF_FULL_DET = 0x000002d0, + AUX_TX_HALF_FULL_DET = 0x000002d4, + AUX_RX_OVERFLOW_DET = 0x000002d8, + AUX_TX_OVERFLOW_DET = 0x000002dc, + AUX_RX_HALF_FULL_MASK = 0x000002f0, + AUX_TX_HALF_FULL_MASK = 0x000002f4, + AUX_RX_OVERFLOW_MASK = 0x000002f8, + AUX_TX_OVERFLOW_MASK = 0x000002fc, + + /* XGMAC RX statistics registers */ + RX_BYTES = 0x00000300, + RX_BYTES_OK = 0x00000308, + RX_PKTS = 0x00000310, + RX_PKTS_OK = 0x00000318, + RX_BCAST_PKTS = 0x00000320, + RX_MCAST_PKTS = 0x00000328, + RX_UCAST_PKTS = 0x00000330, + RX_UNDERSIZE_PKTS = 0x00000338, + RX_OVERSIZE_PKTS = 0x00000340, + RX_JABBER_PKTS = 0x00000348, + RX_UNDERSIZE_FCERR_PKTS = 0x00000350, + RX_DROP_EVENTS = 0x00000358, + RX_FCERR_PKTS = 0x00000360, + RX_ALIGN_ERR = 0x00000368, + RX_SYMBOL_ERR = 0x00000370, + RX_MAC_ERR = 0x00000378, + RX_CTL_PKTS = 0x00000380, + RX_PAUSE_PKTS = 0x00000384, + RX_64_PKTS = 0x00000390, + RX_65_TO_127_PKTS = 0x00000398, + RX_128_255_PKTS = 
0x000003a0, + RX_256_511_PKTS = 0x000003a8, + RX_512_TO_1023_PKTS = 0x000003b0, + RX_1024_TO_1518_PKTS = 0x000003b8, + RX_1519_TO_MAX_PKTS = 0x000003c0, + RX_LEN_ERR_PKTS = 0x000003c8, + + /* XGMAC MDIO control registers */ + MDIO_TX_DATA = 0x00000400, + MDIO_RX_DATA = 0x00000410, + MDIO_CMD = 0x00000420, + MDIO_PHY_ADDR = 0x00000430, + MDIO_PORT = 0x00000440, + MDIO_STATUS = 0x00000450, + + /* XGMAC AUX statistics registers */ +}; + +/* + * Enhanced Transmission Schedule Registers (NIC_ETS,CNA_ETS) bit definitions. + */ +enum { + ETS_QUEUE_SHIFT = 29, + ETS_REF = (1 << 26), + ETS_RS = (1 << 27), + ETS_P = (1 << 28), + ETS_FC_COS_SHIFT = 23, +}; + +/* + * Flash Address Register (FLASH_ADDR) bit definitions. + */ +enum { + FLASH_ADDR_RDY = (1 << 31), + FLASH_ADDR_R = (1 << 30), + FLASH_ADDR_ERR = (1 << 29), +}; + +/* + * Stop CQ Processing Register (CQ_STOP) bit definitions. + */ +enum { + CQ_STOP_QUEUE_MASK = (0x007f0000), + CQ_STOP_TYPE_MASK = (0x03000000), + CQ_STOP_TYPE_START = 0x00000100, + CQ_STOP_TYPE_STOP = 0x00000200, + CQ_STOP_TYPE_READ = 0x00000300, + CQ_STOP_EN = (1 << 15), +}; + +/* + * MAC Protocol Address Index Register (MAC_ADDR_IDX) bit definitions. + */ +enum { + MAC_ADDR_IDX_SHIFT = 4, + MAC_ADDR_TYPE_SHIFT = 16, + MAC_ADDR_TYPE_MASK = 0x000f0000, + MAC_ADDR_TYPE_CAM_MAC = 0x00000000, + MAC_ADDR_TYPE_MULTI_MAC = 0x00010000, + MAC_ADDR_TYPE_VLAN = 0x00020000, + MAC_ADDR_TYPE_MULTI_FLTR = 0x00030000, + MAC_ADDR_TYPE_FC_MAC = 0x00040000, + MAC_ADDR_TYPE_MGMT_MAC = 0x00050000, + MAC_ADDR_TYPE_MGMT_VLAN = 0x00060000, + MAC_ADDR_TYPE_MGMT_V4 = 0x00070000, + MAC_ADDR_TYPE_MGMT_V6 = 0x00080000, + MAC_ADDR_TYPE_MGMT_TU_DP = 0x00090000, + MAC_ADDR_ADR = (1 << 25), + MAC_ADDR_RS = (1 << 26), + MAC_ADDR_E = (1 << 27), + MAC_ADDR_MR = (1 << 30), + MAC_ADDR_MW = (1 << 31), + MAX_MULTICAST_ENTRIES = 32, +}; + +/* + * MAC Protocol Address Index Register (SPLT_HDR) bit definitions. + */ +enum { + SPLT_HDR_EP = (1 << 31), +}; + +/* + * FCoE Receive Configuration Register (FC_RCV_CFG) bit definitions. + */ +enum { + FC_RCV_CFG_ECT = (1 << 15), + FC_RCV_CFG_DFH = (1 << 20), + FC_RCV_CFG_DVF = (1 << 21), + FC_RCV_CFG_RCE = (1 << 27), + FC_RCV_CFG_RFE = (1 << 28), + FC_RCV_CFG_TEE = (1 << 29), + FC_RCV_CFG_TCE = (1 << 30), + FC_RCV_CFG_TFE = (1 << 31), +}; + +/* + * NIC Receive Configuration Register (NIC_RCV_CFG) bit definitions. + */ +enum { + NIC_RCV_CFG_PPE = (1 << 0), + NIC_RCV_CFG_VLAN_MASK = 0x00060000, + NIC_RCV_CFG_VLAN_ALL = 0x00000000, + NIC_RCV_CFG_VLAN_MATCH_ONLY = 0x00000002, + NIC_RCV_CFG_VLAN_MATCH_AND_NON = 0x00000004, + NIC_RCV_CFG_VLAN_NONE_AND_NON = 0x00000006, + NIC_RCV_CFG_RV = (1 << 3), + NIC_RCV_CFG_DFQ_MASK = (0x7f000000), + NIC_RCV_CFG_DFQ_SHIFT = 8, + NIC_RCV_CFG_DFQ = 0, /* HARDCODE default queue to 0. */ +}; + +/* + * Mgmt Receive Configuration Register (MGMT_RCV_CFG) bit definitions. 
+ */ +enum { + MGMT_RCV_CFG_ARP = (1 << 0), + MGMT_RCV_CFG_DHC = (1 << 1), + MGMT_RCV_CFG_DHS = (1 << 2), + MGMT_RCV_CFG_NP = (1 << 3), + MGMT_RCV_CFG_I6N = (1 << 4), + MGMT_RCV_CFG_I6R = (1 << 5), + MGMT_RCV_CFG_DH6 = (1 << 6), + MGMT_RCV_CFG_UD1 = (1 << 7), + MGMT_RCV_CFG_UD0 = (1 << 8), + MGMT_RCV_CFG_BCT = (1 << 9), + MGMT_RCV_CFG_MCT = (1 << 10), + MGMT_RCV_CFG_DM = (1 << 11), + MGMT_RCV_CFG_RM = (1 << 12), + MGMT_RCV_CFG_STL = (1 << 13), + MGMT_RCV_CFG_VLAN_MASK = 0xc0000000, + MGMT_RCV_CFG_VLAN_ALL = 0x00000000, + MGMT_RCV_CFG_VLAN_MATCH_ONLY = 0x00004000, + MGMT_RCV_CFG_VLAN_MATCH_AND_NON = 0x00008000, + MGMT_RCV_CFG_VLAN_NONE_AND_NON = 0x0000c000, +}; + +/* + * Routing Index Register (RT_IDX) bit definitions. + */ +enum { + RT_IDX_IDX_SHIFT = 8, + RT_IDX_TYPE_MASK = 0x000f0000, + RT_IDX_TYPE_RT = 0x00000000, + RT_IDX_TYPE_RT_INV = 0x00010000, + RT_IDX_TYPE_NICQ = 0x00020000, + RT_IDX_TYPE_NICQ_INV = 0x00030000, + RT_IDX_DST_MASK = 0x00700000, + RT_IDX_DST_RSS = 0x00000000, + RT_IDX_DST_CAM_Q = 0x00100000, + RT_IDX_DST_COS_Q = 0x00200000, + RT_IDX_DST_DFLT_Q = 0x00300000, + RT_IDX_DST_DEST_Q = 0x00400000, + RT_IDX_RS = (1 << 26), + RT_IDX_E = (1 << 27), + RT_IDX_MR = (1 << 30), + RT_IDX_MW = (1 << 31), + + /* Nic Queue format - type 2 bits */ + RT_IDX_BCAST = (1 << 0), + RT_IDX_MCAST = (1 << 1), + RT_IDX_MCAST_MATCH = (1 << 2), + RT_IDX_MCAST_REG_MATCH = (1 << 3), + RT_IDX_MCAST_HASH_MATCH = (1 << 4), + RT_IDX_FC_MACH = (1 << 5), + RT_IDX_ETH_FCOE = (1 << 6), + RT_IDX_CAM_HIT = (1 << 7), + RT_IDX_CAM_BIT0 = (1 << 8), + RT_IDX_CAM_BIT1 = (1 << 9), + RT_IDX_VLAN_TAG = (1 << 10), + RT_IDX_VLAN_MATCH = (1 << 11), + RT_IDX_VLAN_FILTER = (1 << 12), + RT_IDX_ETH_SKIP1 = (1 << 13), + RT_IDX_ETH_SKIP2 = (1 << 14), + RT_IDX_BCAST_MCAST_MATCH = (1 << 15), + RT_IDX_802_3 = (1 << 16), + RT_IDX_LLDP = (1 << 17), + RT_IDX_UNUSED018 = (1 << 18), + RT_IDX_UNUSED019 = (1 << 19), + RT_IDX_UNUSED20 = (1 << 20), + RT_IDX_UNUSED21 = (1 << 21), + RT_IDX_ERR = (1 << 22), + RT_IDX_VALID = (1 << 23), + RT_IDX_TU_CSUM_ERR = (1 << 24), + RT_IDX_IP_CSUM_ERR = (1 << 25), + RT_IDX_MAC_ERR = (1 << 26), + RT_IDX_RSS_TCP6 = (1 << 27), + RT_IDX_RSS_TCP4 = (1 << 28), + RT_IDX_RSS_IPV6 = (1 << 29), + RT_IDX_RSS_IPV4 = (1 << 30), + RT_IDX_RSS_MATCH = (1 << 31), + + /* Hierarchy for the NIC Queue Mask */ + RT_IDX_ALL_ERR_SLOT = 0, + RT_IDX_MAC_ERR_SLOT = 0, + RT_IDX_IP_CSUM_ERR_SLOT = 1, + RT_IDX_TCP_UDP_CSUM_ERR_SLOT = 2, + RT_IDX_BCAST_SLOT = 3, + RT_IDX_MCAST_MATCH_SLOT = 4, + RT_IDX_ALLMULTI_SLOT = 5, + RT_IDX_UNUSED6_SLOT = 6, + RT_IDX_UNUSED7_SLOT = 7, + RT_IDX_RSS_MATCH_SLOT = 8, + RT_IDX_RSS_IPV4_SLOT = 8, + RT_IDX_RSS_IPV6_SLOT = 9, + RT_IDX_RSS_TCP4_SLOT = 10, + RT_IDX_RSS_TCP6_SLOT = 11, + RT_IDX_CAM_HIT_SLOT = 12, + RT_IDX_UNUSED013 = 13, + RT_IDX_UNUSED014 = 14, + RT_IDX_PROMISCUOUS_SLOT = 15, + RT_IDX_MAX_SLOTS = 16, +}; + +/* + * Control Register Set Map + */ +enum { + PROC_ADDR = 0, /* Use semaphore */ + PROC_DATA = 0x04, /* Use semaphore */ + SYS = 0x08, + RST_FO = 0x0c, + FSC = 0x10, + CSR = 0x14, + LED = 0x18, + ICB_RID = 0x1c, /* Use semaphore */ + ICB_L = 0x20, /* Use semaphore */ + ICB_H = 0x24, /* Use semaphore */ + CFG = 0x28, + BIOS_ADDR = 0x2c, + STS = 0x30, + INTR_EN = 0x34, + INTR_MASK = 0x38, + ISR1 = 0x3c, + ISR2 = 0x40, + ISR3 = 0x44, + ISR4 = 0x48, + REV_ID = 0x4c, + FRC_ECC_ERR = 0x50, + ERR_STS = 0x54, + RAM_DBG_ADDR = 0x58, + RAM_DBG_DATA = 0x5c, + ECC_ERR_CNT = 0x60, + SEM = 0x64, + GPIO_1 = 0x68, /* Use semaphore */ + GPIO_2 = 0x6c, /* Use semaphore */ + GPIO_3 = 0x70, /* Use 
semaphore */ + RSVD2 = 0x74, + XGMAC_ADDR = 0x78, /* Use semaphore */ + XGMAC_DATA = 0x7c, /* Use semaphore */ + NIC_ETS = 0x80, + CNA_ETS = 0x84, + FLASH_ADDR = 0x88, /* Use semaphore */ + FLASH_DATA = 0x8c, /* Use semaphore */ + CQ_STOP = 0x90, + PAGE_TBL_RID = 0x94, + WQ_PAGE_TBL_LO = 0x98, + WQ_PAGE_TBL_HI = 0x9c, + CQ_PAGE_TBL_LO = 0xa0, + CQ_PAGE_TBL_HI = 0xa4, + MAC_ADDR_IDX = 0xa8, /* Use semaphore */ + MAC_ADDR_DATA = 0xac, /* Use semaphore */ + COS_DFLT_CQ1 = 0xb0, + COS_DFLT_CQ2 = 0xb4, + ETYPE_SKIP1 = 0xb8, + ETYPE_SKIP2 = 0xbc, + SPLT_HDR = 0xc0, + FC_PAUSE_THRES = 0xc4, + NIC_PAUSE_THRES = 0xc8, + FC_ETHERTYPE = 0xcc, + FC_RCV_CFG = 0xd0, + NIC_RCV_CFG = 0xd4, + FC_COS_TAGS = 0xd8, + NIC_COS_TAGS = 0xdc, + MGMT_RCV_CFG = 0xe0, + RT_IDX = 0xe4, + RT_DATA = 0xe8, + RSVD7 = 0xec, + XG_SERDES_ADDR = 0xf0, + XG_SERDES_DATA = 0xf4, + PRB_MX_ADDR = 0xf8, /* Use semaphore */ + PRB_MX_DATA = 0xfc, /* Use semaphore */ +}; + +/* + * CAM output format. + */ +enum { + CAM_OUT_ROUTE_FC = 0, + CAM_OUT_ROUTE_NIC = 1, + CAM_OUT_FUNC_SHIFT = 2, + CAM_OUT_RV = (1 << 4), + CAM_OUT_SH = (1 << 15), + CAM_OUT_CQ_ID_SHIFT = 5, +}; + +/* + * Mailbox definitions + */ +enum { + /* Asynchronous Event Notifications */ + AEN_SYS_ERR = 0x00008002, + AEN_LINK_UP = 0x00008011, + AEN_LINK_DOWN = 0x00008012, + AEN_IDC_CMPLT = 0x00008100, + AEN_IDC_REQ = 0x00008101, + AEN_FW_INIT_DONE = 0x00008400, + AEN_FW_INIT_FAIL = 0x00008401, + + /* Mailbox Command Opcodes. */ + MB_CMD_NOP = 0x00000000, + MB_CMD_EX_FW = 0x00000002, + MB_CMD_MB_TEST = 0x00000006, + MB_CMD_CSUM_TEST = 0x00000007, /* Verify Checksum */ + MB_CMD_ABOUT_FW = 0x00000008, + MB_CMD_LOAD_RISC_RAM = 0x0000000b, + MB_CMD_DUMP_RISC_RAM = 0x0000000c, + MB_CMD_WRITE_RAM = 0x0000000d, + MB_CMD_READ_RAM = 0x0000000f, + MB_CMD_STOP_FW = 0x00000014, + MB_CMD_MAKE_SYS_ERR = 0x0000002a, + MB_CMD_INIT_FW = 0x00000060, + MB_CMD_GET_INIT_CB = 0x00000061, + MB_CMD_GET_FW_STATE = 0x00000069, + MB_CMD_IDC_REQ = 0x00000100, /* Inter-Driver Communication */ + MB_CMD_IDC_ACK = 0x00000101, /* Inter-Driver Communication */ + MB_CMD_SET_WOL_MODE = 0x00000110, /* Wake On Lan */ + MB_WOL_DISABLE = 0x00000000, + MB_WOL_MAGIC_PKT = 0x00000001, + MB_WOL_FLTR = 0x00000002, + MB_WOL_UCAST = 0x00000004, + MB_WOL_MCAST = 0x00000008, + MB_WOL_BCAST = 0x00000010, + MB_WOL_LINK_UP = 0x00000020, + MB_WOL_LINK_DOWN = 0x00000040, + MB_CMD_SET_WOL_FLTR = 0x00000111, /* Wake On Lan Filter */ + MB_CMD_CLEAR_WOL_FLTR = 0x00000112, /* Wake On Lan Filter */ + MB_CMD_SET_WOL_MAGIC = 0x00000113, /* Wake On Lan Magic Packet */ + MB_CMD_CLEAR_WOL_MAGIC = 0x00000114, /* Wake On Lan Magic Packet */ + MB_CMD_PORT_RESET = 0x00000120, + MB_CMD_SET_PORT_CFG = 0x00000122, + MB_CMD_GET_PORT_CFG = 0x00000123, + MB_CMD_SET_ASIC_VOLTS = 0x00000130, + MB_CMD_GET_SNS_DATA = 0x00000131, /* Temp and Volt Sense data. */ + + /* Mailbox Command Status. */ + MB_CMD_STS_GOOD = 0x00004000, /* Success. */ + MB_CMD_STS_INTRMDT = 0x00001000, /* Intermediate Complete. */ + MB_CMD_STS_ERR = 0x00004005, /* Error. 
*/ +}; + +struct mbox_params { + u32 mbox_in[MAILBOX_COUNT]; + u32 mbox_out[MAILBOX_COUNT]; + int in_count; + int out_count; +}; + +struct flash_params { + u8 dev_id_str[4]; + u16 size; + u16 csum; + u16 ver; + u16 sub_dev_id; + u8 mac_addr[6]; + u16 res; +}; + + +/* + * doorbell space for the rx ring context + */ +struct rx_doorbell_context { + u32 cnsmr_idx; /* 0x00 */ + u32 valid; /* 0x04 */ + u32 reserved[4]; /* 0x08-0x14 */ + u32 lbq_prod_idx; /* 0x18 */ + u32 sbq_prod_idx; /* 0x1c */ +}; + +/* + * doorbell space for the tx ring context + */ +struct tx_doorbell_context { + u32 prod_idx; /* 0x00 */ + u32 valid; /* 0x04 */ + u32 reserved[4]; /* 0x08-0x14 */ + u32 lbq_prod_idx; /* 0x18 */ + u32 sbq_prod_idx; /* 0x1c */ +}; + +/* DATA STRUCTURES SHARED WITH HARDWARE. */ + +struct bq_element { + u32 addr_lo; +#define BQ_END 0x00000001 +#define BQ_CONT 0x00000002 +#define BQ_MASK 0x00000003 + u32 addr_hi; +} __attribute((packed)); + +struct tx_buf_desc { + __le64 addr; + __le32 len; +#define TX_DESC_LEN_MASK 0x000fffff +#define TX_DESC_C 0x40000000 +#define TX_DESC_E 0x80000000 +} __attribute((packed)); + +/* + * IOCB Definitions... + */ + +#define OPCODE_OB_MAC_IOCB 0x01 +#define OPCODE_OB_MAC_TSO_IOCB 0x02 +#define OPCODE_IB_MAC_IOCB 0x20 +#define OPCODE_IB_MPI_IOCB 0x21 +#define OPCODE_IB_AE_IOCB 0x3f + +struct ob_mac_iocb_req { + u8 opcode; + u8 flags1; +#define OB_MAC_IOCB_REQ_OI 0x01 +#define OB_MAC_IOCB_REQ_I 0x02 +#define OB_MAC_IOCB_REQ_D 0x08 +#define OB_MAC_IOCB_REQ_F 0x10 + u8 flags2; + u8 flags3; +#define OB_MAC_IOCB_DFP 0x02 +#define OB_MAC_IOCB_V 0x04 + __le32 reserved1[2]; + __le16 frame_len; +#define OB_MAC_IOCB_LEN_MASK 0x3ffff + __le16 reserved2; + __le32 tid; + __le32 txq_idx; + __le32 reserved3; + __le16 vlan_tci; + __le16 reserved4; + struct tx_buf_desc tbd[TX_DESC_PER_IOCB]; +} __attribute((packed)); + +struct ob_mac_iocb_rsp { + u8 opcode; /* */ + u8 flags1; /* */ +#define OB_MAC_IOCB_RSP_OI 0x01 /* */ +#define OB_MAC_IOCB_RSP_I 0x02 /* */ +#define OB_MAC_IOCB_RSP_E 0x08 /* */ +#define OB_MAC_IOCB_RSP_S 0x10 /* too Short */ +#define OB_MAC_IOCB_RSP_L 0x20 /* too Large */ +#define OB_MAC_IOCB_RSP_P 0x40 /* Padded */ + u8 flags2; /* */ + u8 flags3; /* */ +#define OB_MAC_IOCB_RSP_B 0x80 /* */ + __le32 tid; + __le32 txq_idx; + __le32 reserved[13]; +} __attribute((packed)); + +struct ob_mac_tso_iocb_req { + u8 opcode; + u8 flags1; +#define OB_MAC_TSO_IOCB_OI 0x01 +#define OB_MAC_TSO_IOCB_I 0x02 +#define OB_MAC_TSO_IOCB_D 0x08 +#define OB_MAC_TSO_IOCB_IP4 0x40 +#define OB_MAC_TSO_IOCB_IP6 0x80 + u8 flags2; +#define OB_MAC_TSO_IOCB_LSO 0x20 +#define OB_MAC_TSO_IOCB_UC 0x40 +#define OB_MAC_TSO_IOCB_TC 0x80 + u8 flags3; +#define OB_MAC_TSO_IOCB_IC 0x01 +#define OB_MAC_TSO_IOCB_DFP 0x02 +#define OB_MAC_TSO_IOCB_V 0x04 + __le32 reserved1[2]; + __le32 frame_len; + __le32 tid; + __le32 txq_idx; + __le16 total_hdrs_len; + __le16 net_trans_offset; +#define OB_MAC_TRANSPORT_HDR_SHIFT 6 + __le16 vlan_tci; + __le16 mss; + struct tx_buf_desc tbd[TX_DESC_PER_IOCB]; +} __attribute((packed)); + +struct ob_mac_tso_iocb_rsp { + u8 opcode; + u8 flags1; +#define OB_MAC_TSO_IOCB_RSP_OI 0x01 +#define OB_MAC_TSO_IOCB_RSP_I 0x02 +#define OB_MAC_TSO_IOCB_RSP_E 0x08 +#define OB_MAC_TSO_IOCB_RSP_S 0x10 +#define OB_MAC_TSO_IOCB_RSP_L 0x20 +#define OB_MAC_TSO_IOCB_RSP_P 0x40 + u8 flags2; /* */ + u8 flags3; /* */ +#define OB_MAC_TSO_IOCB_RSP_B 0x8000 + __le32 tid; + __le32 txq_idx; + __le32 reserved2[13]; +} __attribute((packed)); + +struct ib_mac_iocb_rsp { + u8 opcode; /* 0x20 */ + u8 flags1; 
+#define IB_MAC_IOCB_RSP_OI 0x01 /* Overide intr delay */ +#define IB_MAC_IOCB_RSP_I 0x02 /* Disble Intr Generation */ +#define IB_MAC_IOCB_RSP_TE 0x04 /* Checksum error */ +#define IB_MAC_IOCB_RSP_NU 0x08 /* No checksum rcvd */ +#define IB_MAC_IOCB_RSP_IE 0x10 /* IPv4 checksum error */ +#define IB_MAC_IOCB_RSP_M_MASK 0x60 /* Multicast info */ +#define IB_MAC_IOCB_RSP_M_NONE 0x00 /* Not mcast frame */ +#define IB_MAC_IOCB_RSP_M_HASH 0x20 /* HASH mcast frame */ +#define IB_MAC_IOCB_RSP_M_REG 0x40 /* Registered mcast frame */ +#define IB_MAC_IOCB_RSP_M_PROM 0x60 /* Promiscuous mcast frame */ +#define IB_MAC_IOCB_RSP_B 0x80 /* Broadcast frame */ + u8 flags2; +#define IB_MAC_IOCB_RSP_P 0x01 /* Promiscuous frame */ +#define IB_MAC_IOCB_RSP_V 0x02 /* Vlan tag present */ +#define IB_MAC_IOCB_RSP_ERR_MASK 0x1c /* */ +#define IB_MAC_IOCB_RSP_ERR_CODE_ERR 0x04 +#define IB_MAC_IOCB_RSP_ERR_OVERSIZE 0x08 +#define IB_MAC_IOCB_RSP_ERR_UNDERSIZE 0x10 +#define IB_MAC_IOCB_RSP_ERR_PREAMBLE 0x14 +#define IB_MAC_IOCB_RSP_ERR_FRAME_LEN 0x18 +#define IB_MAC_IOCB_RSP_ERR_CRC 0x1c +#define IB_MAC_IOCB_RSP_U 0x20 /* UDP packet */ +#define IB_MAC_IOCB_RSP_T 0x40 /* TCP packet */ +#define IB_MAC_IOCB_RSP_FO 0x80 /* Failover port */ + u8 flags3; +#define IB_MAC_IOCB_RSP_RSS_MASK 0x07 /* RSS mask */ +#define IB_MAC_IOCB_RSP_M_NONE 0x00 /* No RSS match */ +#define IB_MAC_IOCB_RSP_M_IPV4 0x04 /* IPv4 RSS match */ +#define IB_MAC_IOCB_RSP_M_IPV6 0x02 /* IPv6 RSS match */ +#define IB_MAC_IOCB_RSP_M_TCP_V4 0x05 /* TCP with IPv4 */ +#define IB_MAC_IOCB_RSP_M_TCP_V6 0x03 /* TCP with IPv6 */ +#define IB_MAC_IOCB_RSP_V4 0x08 /* IPV4 */ +#define IB_MAC_IOCB_RSP_V6 0x10 /* IPV6 */ +#define IB_MAC_IOCB_RSP_IH 0x20 /* Split after IP header */ +#define IB_MAC_IOCB_RSP_DS 0x40 /* data is in small buffer */ +#define IB_MAC_IOCB_RSP_DL 0x80 /* data is in large buffer */ + __le32 data_len; /* */ + __le32 data_addr_lo; /* */ + __le32 data_addr_hi; /* */ + __le32 rss; /* */ + __le16 vlan_id; /* 12 bits */ +#define IB_MAC_IOCB_RSP_C 0x1000 /* VLAN CFI bit */ +#define IB_MAC_IOCB_RSP_COS_SHIFT 12 /* class of service value */ + + __le16 reserved1; + __le32 reserved2[6]; + __le32 flags4; +#define IB_MAC_IOCB_RSP_HV 0x20000000 /* */ +#define IB_MAC_IOCB_RSP_HS 0x40000000 /* */ +#define IB_MAC_IOCB_RSP_HL 0x80000000 /* */ + __le32 hdr_len; /* */ + __le32 hdr_addr_lo; /* */ + __le32 hdr_addr_hi; /* */ +} __attribute((packed)); + +struct ib_ae_iocb_rsp { + u8 opcode; + u8 flags1; +#define IB_AE_IOCB_RSP_OI 0x01 +#define IB_AE_IOCB_RSP_I 0x02 + u8 event; +#define LINK_UP_EVENT 0x00 +#define LINK_DOWN_EVENT 0x01 +#define CAM_LOOKUP_ERR_EVENT 0x06 +#define SOFT_ECC_ERROR_EVENT 0x07 +#define MGMT_ERR_EVENT 0x08 +#define TEN_GIG_MAC_EVENT 0x09 +#define GPI0_H2L_EVENT 0x10 +#define GPI0_L2H_EVENT 0x20 +#define GPI1_H2L_EVENT 0x11 +#define GPI1_L2H_EVENT 0x21 +#define PCI_ERR_ANON_BUF_RD 0x40 + u8 q_id; + __le32 reserved[15]; +} __attribute((packed)); + +/* + * These three structures are for generic + * handling of ib and ob iocbs. + */ +struct ql_net_rsp_iocb { + u8 opcode; + u8 flags0; + __le16 length; + __le32 tid; + __le32 reserved[14]; +} __attribute((packed)); + +struct net_req_iocb { + u8 opcode; + u8 flags0; + __le16 flags1; + __le32 tid; + __le32 reserved1[30]; +} __attribute((packed)); + +/* + * tx ring initialization control block for chip. 
+ * It is defined as: + * "Work Queue Initialization Control Block" + */ +struct wqicb { + __le16 len; +#define Q_LEN_V (1 << 4) +#define Q_LEN_CPP_CONT 0x0000 +#define Q_LEN_CPP_16 0x0001 +#define Q_LEN_CPP_32 0x0002 +#define Q_LEN_CPP_64 0x0003 + __le16 flags; +#define Q_PRI_SHIFT 1 +#define Q_FLAGS_LC 0x1000 +#define Q_FLAGS_LB 0x2000 +#define Q_FLAGS_LI 0x4000 +#define Q_FLAGS_LO 0x8000 + __le16 cq_id_rss; +#define Q_CQ_ID_RSS_RV 0x8000 + __le16 rid; + __le32 addr_lo; + __le32 addr_hi; + __le32 cnsmr_idx_addr_lo; + __le32 cnsmr_idx_addr_hi; +} __attribute((packed)); + +/* + * rx ring initialization control block for chip. + * It is defined as: + * "Completion Queue Initialization Control Block" + */ +struct cqicb { + u8 msix_vect; + u8 reserved1; + u8 reserved2; + u8 flags; +#define FLAGS_LV 0x08 +#define FLAGS_LS 0x10 +#define FLAGS_LL 0x20 +#define FLAGS_LI 0x40 +#define FLAGS_LC 0x80 + __le16 len; +#define LEN_V (1 << 4) +#define LEN_CPP_CONT 0x0000 +#define LEN_CPP_32 0x0001 +#define LEN_CPP_64 0x0002 +#define LEN_CPP_128 0x0003 + __le16 rid; + __le32 addr_lo; + __le32 addr_hi; + __le32 prod_idx_addr_lo; + __le32 prod_idx_addr_hi; + __le16 pkt_delay; + __le16 irq_delay; + __le32 lbq_addr_lo; + __le32 lbq_addr_hi; + __le16 lbq_buf_size; + __le16 lbq_len; /* entry count */ + __le32 sbq_addr_lo; + __le32 sbq_addr_hi; + __le16 sbq_buf_size; + __le16 sbq_len; /* entry count */ +} __attribute((packed)); + +struct ricb { + u8 base_cq; +#define RSS_L4K 0x80 + u8 flags; +#define RSS_L6K 0x01 +#define RSS_LI 0x02 +#define RSS_LB 0x04 +#define RSS_LM 0x08 +#define RSS_RI4 0x10 +#define RSS_RT4 0x20 +#define RSS_RI6 0x40 +#define RSS_RT6 0x80 + __le16 mask; + __le32 hash_cq_id[256]; + __le32 ipv6_hash_key[10]; + __le32 ipv4_hash_key[4]; +} __attribute((packed)); + +/* SOFTWARE/DRIVER DATA STRUCTURES. */ + +struct oal { + struct tx_buf_desc oal[TX_DESC_PER_OAL]; +}; + +struct map_list { + DECLARE_PCI_UNMAP_ADDR(mapaddr); + DECLARE_PCI_UNMAP_LEN(maplen); +}; + +struct tx_ring_desc { + struct sk_buff *skb; + struct ob_mac_iocb_req *queue_entry; + int index; + struct oal oal; + struct map_list map[MAX_SKB_FRAGS + 1]; + int map_cnt; + struct tx_ring_desc *next; +}; + +struct bq_desc { + union { + struct page *lbq_page; + struct sk_buff *skb; + } p; + struct bq_element *bq; + int index; + DECLARE_PCI_UNMAP_ADDR(mapaddr); + DECLARE_PCI_UNMAP_LEN(maplen); +}; + +#define QL_TXQ_IDX(qdev, skb) (smp_processor_id()%(qdev->tx_ring_count)) + +struct tx_ring { + /* + * queue info. + */ + struct wqicb wqicb; /* structure used to inform chip of new queue */ + void *wq_base; /* pci_alloc:virtual addr for tx */ + dma_addr_t wq_base_dma; /* pci_alloc:dma addr for tx */ + u32 *cnsmr_idx_sh_reg; /* shadow copy of consumer idx */ + dma_addr_t cnsmr_idx_sh_reg_dma; /* dma-shadow copy of consumer */ + u32 wq_size; /* size in bytes of queue area */ + u32 wq_len; /* number of entries in queue */ + void __iomem *prod_idx_db_reg; /* doorbell area index reg at offset 0x00 */ + void __iomem *valid_db_reg; /* doorbell area valid reg at offset 0x04 */ + u16 prod_idx; /* current value for prod idx */ + u16 cq_id; /* completion (rx) queue for tx completions */ + u8 wq_id; /* queue id for this entry */ + u8 reserved1[3]; + struct tx_ring_desc *q; /* descriptor list for the queue */ + spinlock_t lock; + atomic_t tx_count; /* counts down for every outstanding IO */ + atomic_t queue_stopped; /* Turns queue off when full. */ + struct delayed_work tx_work; + struct ql_adapter *qdev; +}; + +/* + * Type of inbound queue. 
+ */ +enum { + DEFAULT_Q = 2, /* Handles slow queue and chip/MPI events. */ + TX_Q = 3, /* Handles outbound completions. */ + RX_Q = 4, /* Handles inbound completions. */ +}; + +struct rx_ring { + struct cqicb cqicb; /* The chip's completion queue init control block. */ + + /* Completion queue elements. */ + void *cq_base; + dma_addr_t cq_base_dma; + u32 cq_size; + u32 cq_len; + u16 cq_id; + u32 *prod_idx_sh_reg; /* Shadowed producer register. */ + dma_addr_t prod_idx_sh_reg_dma; + void __iomem *cnsmr_idx_db_reg; /* PCI doorbell mem area + 0 */ + u32 cnsmr_idx; /* current sw idx */ + struct ql_net_rsp_iocb *curr_entry; /* next entry on queue */ + void __iomem *valid_db_reg; /* PCI doorbell mem area + 0x04 */ + + /* Large buffer queue elements. */ + u32 lbq_len; /* entry count */ + u32 lbq_size; /* size in bytes of queue */ + u32 lbq_buf_size; + void *lbq_base; + dma_addr_t lbq_base_dma; + void *lbq_base_indirect; + dma_addr_t lbq_base_indirect_dma; + struct bq_desc *lbq; /* array of control blocks */ + void __iomem *lbq_prod_idx_db_reg; /* PCI doorbell mem area + 0x18 */ + u32 lbq_prod_idx; /* current sw prod idx */ + u32 lbq_curr_idx; /* next entry we expect */ + u32 lbq_clean_idx; /* beginning of new descs */ + u32 lbq_free_cnt; /* free buffer desc cnt */ + + /* Small buffer queue elements. */ + u32 sbq_len; /* entry count */ + u32 sbq_size; /* size in bytes of queue */ + u32 sbq_buf_size; + void *sbq_base; + dma_addr_t sbq_base_dma; + void *sbq_base_indirect; + dma_addr_t sbq_base_indirect_dma; + struct bq_desc *sbq; /* array of control blocks */ + void __iomem *sbq_prod_idx_db_reg; /* PCI doorbell mem area + 0x1c */ + u32 sbq_prod_idx; /* current sw prod idx */ + u32 sbq_curr_idx; /* next entry we expect */ + u32 sbq_clean_idx; /* beginning of new descs */ + u32 sbq_free_cnt; /* free buffer desc cnt */ + + /* Misc. handler elements. */ + u32 type; /* Type of queue, tx, rx, or default. */ + u32 irq; /* Which vector this ring is assigned. */ + u32 cpu; /* Which CPU this should run on. */ + char name[IFNAMSIZ + 5]; + struct napi_struct napi; + struct delayed_work rx_work; + u8 reserved; + struct ql_adapter *qdev; +}; + +/* + * RSS Initialization Control Block + */ +struct hash_id { + u8 value[4]; +}; + +struct nic_stats { + /* + * These stats come from offset 200h to 278h + * in the XGMAC register. + */ + u64 tx_pkts; + u64 tx_bytes; + u64 tx_mcast_pkts; + u64 tx_bcast_pkts; + u64 tx_ucast_pkts; + u64 tx_ctl_pkts; + u64 tx_pause_pkts; + u64 tx_64_pkt; + u64 tx_65_to_127_pkt; + u64 tx_128_to_255_pkt; + u64 tx_256_511_pkt; + u64 tx_512_to_1023_pkt; + u64 tx_1024_to_1518_pkt; + u64 tx_1519_to_max_pkt; + u64 tx_undersize_pkt; + u64 tx_oversize_pkt; + + /* + * These stats come from offset 300h to 3C8h + * in the XGMAC register. + */ + u64 rx_bytes; + u64 rx_bytes_ok; + u64 rx_pkts; + u64 rx_pkts_ok; + u64 rx_bcast_pkts; + u64 rx_mcast_pkts; + u64 rx_ucast_pkts; + u64 rx_undersize_pkts; + u64 rx_oversize_pkts; + u64 rx_jabber_pkts; + u64 rx_undersize_fcerr_pkts; + u64 rx_drop_events; + u64 rx_fcerr_pkts; + u64 rx_align_err; + u64 rx_symbol_err; + u64 rx_mac_err; + u64 rx_ctl_pkts; + u64 rx_pause_pkts; + u64 rx_64_pkts; + u64 rx_65_to_127_pkts; + u64 rx_128_255_pkts; + u64 rx_256_511_pkts; + u64 rx_512_to_1023_pkts; + u64 rx_1024_to_1518_pkts; + u64 rx_1519_to_max_pkts; + u64 rx_len_err_pkts; +}; + +/* + * intr_context structure is used during initialization + * to hook the interrupts. It is also used in a single + * irq environment as a context to the ISR. 
+ */ +struct intr_context { + struct ql_adapter *qdev; + u32 intr; + u32 hooked; + u32 intr_en_mask; /* value/mask used to enable this intr */ + u32 intr_dis_mask; /* value/mask used to disable this intr */ + u32 intr_read_mask; /* value/mask used to read this intr */ + char name[IFNAMSIZ * 2]; + atomic_t irq_cnt; /* irq_cnt is used in single vector + * environment. It's incremented for each + * irq handler that is scheduled. When each + * handler finishes it decrements irq_cnt and + * enables interrupts if it's zero. */ + irq_handler_t handler; +}; + +/* adapter flags definitions. */ +enum { + QL_ADAPTER_UP = (1 << 0), /* Adapter has been brought up. */ + QL_LEGACY_ENABLED = (1 << 3), + QL_MSI_ENABLED = (1 << 3), + QL_MSIX_ENABLED = (1 << 4), + QL_DMA64 = (1 << 5), + QL_PROMISCUOUS = (1 << 6), + QL_ALLMULTI = (1 << 7), +}; + +/* link_status bit definitions */ +enum { + LOOPBACK_MASK = 0x00000700, + LOOPBACK_PCS = 0x00000100, + LOOPBACK_HSS = 0x00000200, + LOOPBACK_EXT = 0x00000300, + PAUSE_MASK = 0x000000c0, + PAUSE_STD = 0x00000040, + PAUSE_PRI = 0x00000080, + SPEED_MASK = 0x00000038, + SPEED_100Mb = 0x00000000, + SPEED_1Gb = 0x00000008, + SPEED_10Gb = 0x00000010, + LINK_TYPE_MASK = 0x00000007, + LINK_TYPE_XFI = 0x00000001, + LINK_TYPE_XAUI = 0x00000002, + LINK_TYPE_XFI_BP = 0x00000003, + LINK_TYPE_XAUI_BP = 0x00000004, + LINK_TYPE_10GBASET = 0x00000005, +}; + +/* + * The main Adapter structure definition. + * This structure has all fields relevant to the hardware. + */ +struct ql_adapter { + struct ricb ricb; + unsigned long flags; + u32 wol; + + struct nic_stats nic_stats; + + struct vlan_group *vlgrp; + + /* PCI Configuration information for this device */ + struct pci_dev *pdev; + struct net_device *ndev; /* Parent NET device */ + + /* Hardware information */ + u32 chip_rev_id; + u32 func; /* PCI function for this adapter */ + + spinlock_t adapter_lock; + spinlock_t hw_lock; + spinlock_t stats_lock; + spinlock_t legacy_lock; /* used for maintaining legacy intr sync */ + + /* PCI Bus Relative Register Addresses */ + void __iomem *reg_base; + void __iomem *doorbell_area; + u32 doorbell_area_size; + + u32 msg_enable; + + /* Page for Shadow Registers */ + void *rx_ring_shadow_reg_area; + dma_addr_t rx_ring_shadow_reg_dma; + void *tx_ring_shadow_reg_area; + dma_addr_t tx_ring_shadow_reg_dma; + + u32 mailbox_in; + u32 mailbox_out; + + int tx_ring_size; + int rx_ring_size; + u32 intr_count; + struct msix_entry *msi_x_entry; + struct intr_context intr_context[MAX_RX_RINGS]; + + int (*legacy_check) (struct ql_adapter *); + + int tx_ring_count; /* One per online CPU. */ + u32 rss_ring_first_cq_id;/* index of first inbound (rss) rx_ring */ + u32 rss_ring_count; /* One per online CPU. 
*/ + /* + * rx_ring_count = + * one default queue + + * (CPU count * outbound completion rx_ring) + + * (CPU count * inbound (RSS) completion rx_ring) + */ + int rx_ring_count; + int ring_mem_size; + void *ring_mem; + struct rx_ring *rx_ring; + int rx_csum; + struct tx_ring *tx_ring; + u32 default_rx_queue; + + u16 rx_coalesce_usecs; /* cqicb->int_delay */ + u16 rx_max_coalesced_frames; /* cqicb->pkt_int_delay */ + u16 tx_coalesce_usecs; /* cqicb->int_delay */ + u16 tx_max_coalesced_frames; /* cqicb->pkt_int_delay */ + + u32 xg_sem_mask; + u32 port_link_up; + u32 port_init; + u32 link_status; + + struct flash_params flash; + + struct net_device_stats stats; + struct workqueue_struct *q_workqueue; + struct workqueue_struct *workqueue; + struct delayed_work asic_reset_work; + struct delayed_work mpi_reset_work; + struct delayed_work mpi_work; +}; + +/* + * Typical Register accessor for memory mapped device. + */ +static inline u32 ql_read32(const struct ql_adapter *qdev, int reg) +{ + return readl(qdev->reg_base + reg); +} + +/* + * Typical Register accessor for memory mapped device. + */ +static inline void ql_write32(const struct ql_adapter *qdev, int reg, u32 val) +{ + writel(val, qdev->reg_base + reg); +} + +/* + * Doorbell Registers: + * Doorbell registers are virtual registers in the PCI memory space. + * The space is allocated by the chip during PCI initialization. The + * device driver finds the doorbell address in BAR 3 in PCI config space. + * The registers are used to control outbound and inbound queues. For + * example, the producer index for an outbound queue. Each queue uses + * 1 4k chunk of memory. The lower half of the space is for outbound + * queues. The upper half is for inbound queues. + */ +static inline void ql_write_db_reg(u32 val, void __iomem *addr) +{ + writel(val, addr); + mmiowb(); +} + +/* + * Shadow Registers: + * Outbound queues have a consumer index that is maintained by the chip. + * Inbound queues have a producer index that is maintained by the chip. + * For lower overhead, these registers are "shadowed" to host memory + * which allows the device driver to track the queue progress without + * PCI reads. When an entry is placed on an inbound queue, the chip will + * update the relevant index register and then copy the value to the + * shadow register in host memory. 
+ */ +static inline unsigned int ql_read_sh_reg(const volatile void *addr) +{ + return *(volatile unsigned int __force *)addr; +} + +extern char qlge_driver_name[]; +extern const char qlge_driver_version[]; +extern const struct ethtool_ops qlge_ethtool_ops; + +extern int ql_sem_spinlock(struct ql_adapter *qdev, u32 sem_mask); +extern void ql_sem_unlock(struct ql_adapter *qdev, u32 sem_mask); +extern int ql_read_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 *data); +extern int ql_get_mac_addr_reg(struct ql_adapter *qdev, u32 type, u16 index, + u32 *value); +extern int ql_get_routing_reg(struct ql_adapter *qdev, u32 index, u32 *value); +extern int ql_write_cfg(struct ql_adapter *qdev, void *ptr, int size, u32 bit, + u16 q_id); +void ql_queue_fw_error(struct ql_adapter *qdev); +void ql_mpi_work(struct work_struct *work); +void ql_mpi_reset_work(struct work_struct *work); +int ql_wait_reg_rdy(struct ql_adapter *qdev, u32 reg, u32 bit, u32 ebit); +void ql_queue_asic_error(struct ql_adapter *qdev); +void ql_enable_completion_interrupt(struct ql_adapter *qdev, u32 intr); +void ql_set_ethtool_ops(struct net_device *ndev); +int ql_read_xgmac_reg64(struct ql_adapter *qdev, u32 reg, u64 *data); + +#if 1 +#define QL_ALL_DUMP +#define QL_REG_DUMP +#define QL_DEV_DUMP +#define QL_CB_DUMP +/* #define QL_IB_DUMP */ +/* #define QL_OB_DUMP */ +#endif + +#ifdef QL_REG_DUMP +extern void ql_dump_xgmac_control_regs(struct ql_adapter *qdev); +extern void ql_dump_routing_entries(struct ql_adapter *qdev); +extern void ql_dump_regs(struct ql_adapter *qdev); +#define QL_DUMP_REGS(qdev) ql_dump_regs(qdev) +#define QL_DUMP_ROUTE(qdev) ql_dump_routing_entries(qdev) +#define QL_DUMP_XGMAC_CONTROL_REGS(qdev) ql_dump_xgmac_control_regs(qdev) +#else +#define QL_DUMP_REGS(qdev) +#define QL_DUMP_ROUTE(qdev) +#define QL_DUMP_XGMAC_CONTROL_REGS(qdev) +#endif + +#ifdef QL_STAT_DUMP +extern void ql_dump_stat(struct ql_adapter *qdev); +#define QL_DUMP_STAT(qdev) ql_dump_stat(qdev) +#else +#define QL_DUMP_STAT(qdev) +#endif + +#ifdef QL_DEV_DUMP +extern void ql_dump_qdev(struct ql_adapter *qdev); +#define QL_DUMP_QDEV(qdev) ql_dump_qdev(qdev) +#else +#define QL_DUMP_QDEV(qdev) +#endif + +#ifdef QL_CB_DUMP +extern void ql_dump_wqicb(struct wqicb *wqicb); +extern void ql_dump_tx_ring(struct tx_ring *tx_ring); +extern void ql_dump_ricb(struct ricb *ricb); +extern void ql_dump_cqicb(struct cqicb *cqicb); +extern void ql_dump_rx_ring(struct rx_ring *rx_ring); +extern void ql_dump_hw_cb(struct ql_adapter *qdev, int size, u32 bit, u16 q_id); +#define QL_DUMP_RICB(ricb) ql_dump_ricb(ricb) +#define QL_DUMP_WQICB(wqicb) ql_dump_wqicb(wqicb) +#define QL_DUMP_TX_RING(tx_ring) ql_dump_tx_ring(tx_ring) +#define QL_DUMP_CQICB(cqicb) ql_dump_cqicb(cqicb) +#define QL_DUMP_RX_RING(rx_ring) ql_dump_rx_ring(rx_ring) +#define QL_DUMP_HW_CB(qdev, size, bit, q_id) \ + ql_dump_hw_cb(qdev, size, bit, q_id) +#else +#define QL_DUMP_RICB(ricb) +#define QL_DUMP_WQICB(wqicb) +#define QL_DUMP_TX_RING(tx_ring) +#define QL_DUMP_CQICB(cqicb) +#define QL_DUMP_RX_RING(rx_ring) +#define QL_DUMP_HW_CB(qdev, size, bit, q_id) +#endif + +#ifdef QL_OB_DUMP +extern void ql_dump_tx_desc(struct tx_buf_desc *tbd); +extern void ql_dump_ob_mac_iocb(struct ob_mac_iocb_req *ob_mac_iocb); +extern void ql_dump_ob_mac_rsp(struct ob_mac_iocb_rsp *ob_mac_rsp); +#define QL_DUMP_OB_MAC_IOCB(ob_mac_iocb) ql_dump_ob_mac_iocb(ob_mac_iocb) +#define QL_DUMP_OB_MAC_RSP(ob_mac_rsp) ql_dump_ob_mac_rsp(ob_mac_rsp) +#else +#define QL_DUMP_OB_MAC_IOCB(ob_mac_iocb) +#define 
QL_DUMP_OB_MAC_RSP(ob_mac_rsp) +#endif + +#ifdef QL_IB_DUMP +extern void ql_dump_ib_mac_rsp(struct ib_mac_iocb_rsp *ib_mac_rsp); +#define QL_DUMP_IB_MAC_RSP(ib_mac_rsp) ql_dump_ib_mac_rsp(ib_mac_rsp) +#else +#define QL_DUMP_IB_MAC_RSP(ib_mac_rsp) +#endif + +#ifdef QL_ALL_DUMP +extern void ql_dump_all(struct ql_adapter *qdev); +#define QL_DUMP_ALL(qdev) ql_dump_all(qdev) +#else +#define QL_DUMP_ALL(qdev) +#endif + +#endif /* _QLGE_H_ */ diff --git a/drivers/net/qlge/qlge_dbg.c b/drivers/net/qlge/qlge_dbg.c new file mode 100644 index 00000000000..f0392b16617 --- /dev/null +++ b/drivers/net/qlge/qlge_dbg.c @@ -0,0 +1,858 @@ +#include "qlge.h" + +#ifdef QL_REG_DUMP +static void ql_dump_intr_states(struct ql_adapter *qdev) +{ + int i; + u32 value; + for (i = 0; i < qdev->intr_count; i++) { + ql_write32(qdev, INTR_EN, qdev->intr_context[i].intr_read_mask); + value = ql_read32(qdev, INTR_EN); + printk(KERN_ERR PFX + "%s: Interrupt %d is %s.\n", + qdev->ndev->name, i, + (value & INTR_EN_EN ? "enabled" : "disabled")); + } +} + +void ql_dump_xgmac_control_regs(struct ql_adapter *qdev) +{ + u32 data; + if (ql_sem_spinlock(qdev, qdev->xg_sem_mask)) { + printk(KERN_ERR "%s: Couldn't get xgmac sem.\n", __func__); + return; + } + ql_read_xgmac_reg(qdev, PAUSE_SRC_LO, &data); + printk(KERN_ERR PFX "%s: PAUSE_SRC_LO = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, PAUSE_SRC_HI, &data); + printk(KERN_ERR PFX "%s: PAUSE_SRC_HI = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, GLOBAL_CFG, &data); + printk(KERN_ERR PFX "%s: GLOBAL_CFG = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, TX_CFG, &data); + printk(KERN_ERR PFX "%s: TX_CFG = 0x%.08x.\n", qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, RX_CFG, &data); + printk(KERN_ERR PFX "%s: RX_CFG = 0x%.08x.\n", qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, FLOW_CTL, &data); + printk(KERN_ERR PFX "%s: FLOW_CTL = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, PAUSE_OPCODE, &data); + printk(KERN_ERR PFX "%s: PAUSE_OPCODE = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, PAUSE_TIMER, &data); + printk(KERN_ERR PFX "%s: PAUSE_TIMER = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, PAUSE_FRM_DEST_LO, &data); + printk(KERN_ERR PFX "%s: PAUSE_FRM_DEST_LO = 0x%.08x.\n", + qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, PAUSE_FRM_DEST_HI, &data); + printk(KERN_ERR PFX "%s: PAUSE_FRM_DEST_HI = 0x%.08x.\n", + qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, MAC_TX_PARAMS, &data); + printk(KERN_ERR PFX "%s: MAC_TX_PARAMS = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, MAC_RX_PARAMS, &data); + printk(KERN_ERR PFX "%s: MAC_RX_PARAMS = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, MAC_SYS_INT, &data); + printk(KERN_ERR PFX "%s: MAC_SYS_INT = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, MAC_SYS_INT_MASK, &data); + printk(KERN_ERR PFX "%s: MAC_SYS_INT_MASK = 0x%.08x.\n", + qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, MAC_MGMT_INT, &data); + printk(KERN_ERR PFX "%s: MAC_MGMT_INT = 0x%.08x.\n", qdev->ndev->name, + data); + ql_read_xgmac_reg(qdev, MAC_MGMT_IN_MASK, &data); + printk(KERN_ERR PFX "%s: MAC_MGMT_IN_MASK = 0x%.08x.\n", + qdev->ndev->name, data); + ql_read_xgmac_reg(qdev, EXT_ARB_MODE, &data); + printk(KERN_ERR PFX "%s: EXT_ARB_MODE = 0x%.08x.\n", qdev->ndev->name, + data); + ql_sem_unlock(qdev, qdev->xg_sem_mask); + +} + +static void ql_dump_ets_regs(struct ql_adapter *qdev) +{ +} + 
+static void ql_dump_cam_entries(struct ql_adapter *qdev) +{ + int i; + u32 value[3]; + for (i = 0; i < 4; i++) { + if (ql_get_mac_addr_reg(qdev, MAC_ADDR_TYPE_CAM_MAC, i, value)) { + printk(KERN_ERR PFX + "%s: Failed read of mac index register.\n", + __func__); + return; + } else { + if (value[0]) + printk(KERN_ERR PFX + "%s: CAM index %d CAM Lookup Lower = 0x%.08x:%.08x, Output = 0x%.08x.\n", + qdev->ndev->name, i, value[1], value[0], + value[2]); + } + } + for (i = 0; i < 32; i++) { + if (ql_get_mac_addr_reg + (qdev, MAC_ADDR_TYPE_MULTI_MAC, i, value)) { + printk(KERN_ERR PFX + "%s: Failed read of mac index register.\n", + __func__); + return; + } else { + if (value[0]) + printk(KERN_ERR PFX + "%s: MCAST index %d CAM Lookup Lower = 0x%.08x:%.08x.\n", + qdev->ndev->name, i, value[1], value[0]); + } + } +} + +void ql_dump_routing_entries(struct ql_adapter *qdev) +{ + int i; + u32 value; + for (i = 0; i < 16; i++) { + value = 0; + if (ql_get_routing_reg(qdev, i, &value)) { + printk(KERN_ERR PFX + "%s: Failed read of routing index register.\n", + __func__); + return; + } else { + if (value) + printk(KERN_ERR PFX + "%s: Routing Mask %d = 0x%.08x.\n", + qdev->ndev->name, i, value); + } + } +} + +void ql_dump_regs(struct ql_adapter *qdev) +{ + printk(KERN_ERR PFX "reg dump for function #%d.\n", qdev->func); + printk(KERN_ERR PFX "SYS = 0x%x.\n", + ql_read32(qdev, SYS)); + printk(KERN_ERR PFX "RST_FO = 0x%x.\n", + ql_read32(qdev, RST_FO)); + printk(KERN_ERR PFX "FSC = 0x%x.\n", + ql_read32(qdev, FSC)); + printk(KERN_ERR PFX "CSR = 0x%x.\n", + ql_read32(qdev, CSR)); + printk(KERN_ERR PFX "ICB_RID = 0x%x.\n", + ql_read32(qdev, ICB_RID)); + printk(KERN_ERR PFX "ICB_L = 0x%x.\n", + ql_read32(qdev, ICB_L)); + printk(KERN_ERR PFX "ICB_H = 0x%x.\n", + ql_read32(qdev, ICB_H)); + printk(KERN_ERR PFX "CFG = 0x%x.\n", + ql_read32(qdev, CFG)); + printk(KERN_ERR PFX "BIOS_ADDR = 0x%x.\n", + ql_read32(qdev, BIOS_ADDR)); + printk(KERN_ERR PFX "STS = 0x%x.\n", + ql_read32(qdev, STS)); + printk(KERN_ERR PFX "INTR_EN = 0x%x.\n", + ql_read32(qdev, INTR_EN)); + printk(KERN_ERR PFX "INTR_MASK = 0x%x.\n", + ql_read32(qdev, INTR_MASK)); + printk(KERN_ERR PFX "ISR1 = 0x%x.\n", + ql_read32(qdev, ISR1)); + printk(KERN_ERR PFX "ISR2 = 0x%x.\n", + ql_read32(qdev, ISR2)); + printk(KERN_ERR PFX "ISR3 = 0x%x.\n", + ql_read32(qdev, ISR3)); + printk(KERN_ERR PFX "ISR4 = 0x%x.\n", + ql_read32(qdev, ISR4)); + printk(KERN_ERR PFX "REV_ID = 0x%x.\n", + ql_read32(qdev, REV_ID)); + printk(KERN_ERR PFX "FRC_ECC_ERR = 0x%x.\n", + ql_read32(qdev, FRC_ECC_ERR)); + printk(KERN_ERR PFX "ERR_STS = 0x%x.\n", + ql_read32(qdev, ERR_STS)); + printk(KERN_ERR PFX "RAM_DBG_ADDR = 0x%x.\n", + ql_read32(qdev, RAM_DBG_ADDR)); + printk(KERN_ERR PFX "RAM_DBG_DATA = 0x%x.\n", + ql_read32(qdev, RAM_DBG_DATA)); + printk(KERN_ERR PFX "ECC_ERR_CNT = 0x%x.\n", + ql_read32(qdev, ECC_ERR_CNT)); + printk(KERN_ERR PFX "SEM = 0x%x.\n", + ql_read32(qdev, SEM)); + printk(KERN_ERR PFX "GPIO_1 = 0x%x.\n", + ql_read32(qdev, GPIO_1)); + printk(KERN_ERR PFX "GPIO_2 = 0x%x.\n", + ql_read32(qdev, GPIO_2)); + printk(KERN_ERR PFX "GPIO_3 = 0x%x.\n", + ql_read32(qdev, GPIO_3)); + printk(KERN_ERR PFX "XGMAC_ADDR = 0x%x.\n", + ql_read32(qdev, XGMAC_ADDR)); + printk(KERN_ERR PFX "XGMAC_DATA = 0x%x.\n", + ql_read32(qdev, XGMAC_DATA)); + printk(KERN_ERR PFX "NIC_ETS = 0x%x.\n", + ql_read32(qdev, NIC_ETS)); + printk(KERN_ERR PFX "CNA_ETS = 0x%x.\n", + ql_read32(qdev, CNA_ETS)); + printk(KERN_ERR PFX "FLASH_ADDR = 0x%x.\n", + ql_read32(qdev, FLASH_ADDR)); + printk(KERN_ERR PFX 
"FLASH_DATA = 0x%x.\n", + ql_read32(qdev, FLASH_DATA)); + printk(KERN_ERR PFX "CQ_STOP = 0x%x.\n", + ql_read32(qdev, CQ_STOP)); + printk(KERN_ERR PFX "PAGE_TBL_RID = 0x%x.\n", + ql_read32(qdev, PAGE_TBL_RID)); + printk(KERN_ERR PFX "WQ_PAGE_TBL_LO = 0x%x.\n", + ql_read32(qdev, WQ_PAGE_TBL_LO)); + printk(KERN_ERR PFX "WQ_PAGE_TBL_HI = 0x%x.\n", + ql_read32(qdev, WQ_PAGE_TBL_HI)); + printk(KERN_ERR PFX "CQ_PAGE_TBL_LO = 0x%x.\n", + ql_read32(qdev, CQ_PAGE_TBL_LO)); + printk(KERN_ERR PFX "CQ_PAGE_TBL_HI = 0x%x.\n", + ql_read32(qdev, CQ_PAGE_TBL_HI)); + printk(KERN_ERR PFX "COS_DFLT_CQ1 = 0x%x.\n", + ql_read32(qdev, COS_DFLT_CQ1)); + printk(KERN_ERR PFX "COS_DFLT_CQ2 = 0x%x.\n", + ql_read32(qdev, COS_DFLT_CQ2)); + printk(KERN_ERR PFX "SPLT_HDR = 0x%x.\n", + ql_read32(qdev, SPLT_HDR)); + printk(KERN_ERR PFX "FC_PAUSE_THRES = 0x%x.\n", + ql_read32(qdev, FC_PAUSE_THRES)); + printk(KERN_ERR PFX "NIC_PAUSE_THRES = 0x%x.\n", + ql_read32(qdev, NIC_PAUSE_THRES)); + printk(KERN_ERR PFX "FC_ETHERTYPE = 0x%x.\n", + ql_read32(qdev, FC_ETHERTYPE)); + printk(KERN_ERR PFX "FC_RCV_CFG = 0x%x.\n", + ql_read32(qdev, FC_RCV_CFG)); + printk(KERN_ERR PFX "NIC_RCV_CFG = 0x%x.\n", + ql_read32(qdev, NIC_RCV_CFG)); + printk(KERN_ERR PFX "FC_COS_TAGS = 0x%x.\n", + ql_read32(qdev, FC_COS_TAGS)); + printk(KERN_ERR PFX "NIC_COS_TAGS = 0x%x.\n", + ql_read32(qdev, NIC_COS_TAGS)); + printk(KERN_ERR PFX "MGMT_RCV_CFG = 0x%x.\n", + ql_read32(qdev, MGMT_RCV_CFG)); + printk(KERN_ERR PFX "XG_SERDES_ADDR = 0x%x.\n", + ql_read32(qdev, XG_SERDES_ADDR)); + printk(KERN_ERR PFX "XG_SERDES_DATA = 0x%x.\n", + ql_read32(qdev, XG_SERDES_DATA)); + printk(KERN_ERR PFX "PRB_MX_ADDR = 0x%x.\n", + ql_read32(qdev, PRB_MX_ADDR)); + printk(KERN_ERR PFX "PRB_MX_DATA = 0x%x.\n", + ql_read32(qdev, PRB_MX_DATA)); + ql_dump_intr_states(qdev); + ql_dump_xgmac_control_regs(qdev); + ql_dump_ets_regs(qdev); + ql_dump_cam_entries(qdev); + ql_dump_routing_entries(qdev); +} +#endif + +#ifdef QL_STAT_DUMP +void ql_dump_stat(struct ql_adapter *qdev) +{ + printk(KERN_ERR "%s: Enter.\n", __func__); + printk(KERN_ERR "tx_pkts = %ld\n", + (unsigned long)qdev->nic_stats.tx_pkts); + printk(KERN_ERR "tx_bytes = %ld\n", + (unsigned long)qdev->nic_stats.tx_bytes); + printk(KERN_ERR "tx_mcast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.tx_mcast_pkts); + printk(KERN_ERR "tx_bcast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.tx_bcast_pkts); + printk(KERN_ERR "tx_ucast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.tx_ucast_pkts); + printk(KERN_ERR "tx_ctl_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.tx_ctl_pkts); + printk(KERN_ERR "tx_pause_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.tx_pause_pkts); + printk(KERN_ERR "tx_64_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_64_pkt); + printk(KERN_ERR "tx_65_to_127_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_65_to_127_pkt); + printk(KERN_ERR "tx_128_to_255_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_128_to_255_pkt); + printk(KERN_ERR "tx_256_511_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_256_511_pkt); + printk(KERN_ERR "tx_512_to_1023_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_512_to_1023_pkt); + printk(KERN_ERR "tx_1024_to_1518_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_1024_to_1518_pkt); + printk(KERN_ERR "tx_1519_to_max_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_1519_to_max_pkt); + printk(KERN_ERR "tx_undersize_pkt = %ld.\n", + (unsigned long)qdev->nic_stats.tx_undersize_pkt); + printk(KERN_ERR "tx_oversize_pkt = %ld.\n", + (unsigned 
long)qdev->nic_stats.tx_oversize_pkt); + printk(KERN_ERR "rx_bytes = %ld.\n", + (unsigned long)qdev->nic_stats.rx_bytes); + printk(KERN_ERR "rx_bytes_ok = %ld.\n", + (unsigned long)qdev->nic_stats.rx_bytes_ok); + printk(KERN_ERR "rx_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_pkts); + printk(KERN_ERR "rx_pkts_ok = %ld.\n", + (unsigned long)qdev->nic_stats.rx_pkts_ok); + printk(KERN_ERR "rx_bcast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_bcast_pkts); + printk(KERN_ERR "rx_mcast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_mcast_pkts); + printk(KERN_ERR "rx_ucast_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_ucast_pkts); + printk(KERN_ERR "rx_undersize_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_undersize_pkts); + printk(KERN_ERR "rx_oversize_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_oversize_pkts); + printk(KERN_ERR "rx_jabber_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_jabber_pkts); + printk(KERN_ERR "rx_undersize_fcerr_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_undersize_fcerr_pkts); + printk(KERN_ERR "rx_drop_events = %ld.\n", + (unsigned long)qdev->nic_stats.rx_drop_events); + printk(KERN_ERR "rx_fcerr_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_fcerr_pkts); + printk(KERN_ERR "rx_align_err = %ld.\n", + (unsigned long)qdev->nic_stats.rx_align_err); + printk(KERN_ERR "rx_symbol_err = %ld.\n", + (unsigned long)qdev->nic_stats.rx_symbol_err); + printk(KERN_ERR "rx_mac_err = %ld.\n", + (unsigned long)qdev->nic_stats.rx_mac_err); + printk(KERN_ERR "rx_ctl_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_ctl_pkts); + printk(KERN_ERR "rx_pause_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_pause_pkts); + printk(KERN_ERR "rx_64_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_64_pkts); + printk(KERN_ERR "rx_65_to_127_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_65_to_127_pkts); + printk(KERN_ERR "rx_128_255_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_128_255_pkts); + printk(KERN_ERR "rx_256_511_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_256_511_pkts); + printk(KERN_ERR "rx_512_to_1023_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_512_to_1023_pkts); + printk(KERN_ERR "rx_1024_to_1518_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_1024_to_1518_pkts); + printk(KERN_ERR "rx_1519_to_max_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_1519_to_max_pkts); + printk(KERN_ERR "rx_len_err_pkts = %ld.\n", + (unsigned long)qdev->nic_stats.rx_len_err_pkts); +}; +#endif + +#ifdef QL_DEV_DUMP +void ql_dump_qdev(struct ql_adapter *qdev) +{ + int i; + printk(KERN_ERR PFX "qdev->flags = %lx.\n", + qdev->flags); + printk(KERN_ERR PFX "qdev->vlgrp = %p.\n", + qdev->vlgrp); + printk(KERN_ERR PFX "qdev->pdev = %p.\n", + qdev->pdev); + printk(KERN_ERR PFX "qdev->ndev = %p.\n", + qdev->ndev); + printk(KERN_ERR PFX "qdev->chip_rev_id = %d.\n", + qdev->chip_rev_id); + printk(KERN_ERR PFX "qdev->reg_base = %p.\n", + qdev->reg_base); + printk(KERN_ERR PFX "qdev->doorbell_area = %p.\n", + qdev->doorbell_area); + printk(KERN_ERR PFX "qdev->doorbell_area_size = %d.\n", + qdev->doorbell_area_size); + printk(KERN_ERR PFX "msg_enable = %x.\n", + qdev->msg_enable); + printk(KERN_ERR PFX "qdev->rx_ring_shadow_reg_area = %p.\n", + qdev->rx_ring_shadow_reg_area); + printk(KERN_ERR PFX "qdev->rx_ring_shadow_reg_dma = %p.\n", + (void *)qdev->rx_ring_shadow_reg_dma); + printk(KERN_ERR PFX "qdev->tx_ring_shadow_reg_area = %p.\n", + qdev->tx_ring_shadow_reg_area); + printk(KERN_ERR PFX "qdev->tx_ring_shadow_reg_dma = 
%p.\n", + (void *)qdev->tx_ring_shadow_reg_dma); + printk(KERN_ERR PFX "qdev->intr_count = %d.\n", + qdev->intr_count); + if (qdev->msi_x_entry) + for (i = 0; i < qdev->intr_count; i++) { + printk(KERN_ERR PFX + "msi_x_entry.[%d]vector = %d.\n", i, + qdev->msi_x_entry[i].vector); + printk(KERN_ERR PFX + "msi_x_entry.[%d]entry = %d.\n", i, + qdev->msi_x_entry[i].entry); + } + for (i = 0; i < qdev->intr_count; i++) { + printk(KERN_ERR PFX + "intr_context[%d].qdev = %p.\n", i, + qdev->intr_context[i].qdev); + printk(KERN_ERR PFX + "intr_context[%d].intr = %d.\n", i, + qdev->intr_context[i].intr); + printk(KERN_ERR PFX + "intr_context[%d].hooked = %d.\n", i, + qdev->intr_context[i].hooked); + printk(KERN_ERR PFX + "intr_context[%d].intr_en_mask = 0x%08x.\n", i, + qdev->intr_context[i].intr_en_mask); + printk(KERN_ERR PFX + "intr_context[%d].intr_dis_mask = 0x%08x.\n", i, + qdev->intr_context[i].intr_dis_mask); + printk(KERN_ERR PFX + "intr_context[%d].intr_read_mask = 0x%08x.\n", i, + qdev->intr_context[i].intr_read_mask); + } + printk(KERN_ERR PFX "qdev->tx_ring_count = %d.\n", qdev->tx_ring_count); + printk(KERN_ERR PFX "qdev->rx_ring_count = %d.\n", qdev->rx_ring_count); + printk(KERN_ERR PFX "qdev->ring_mem_size = %d.\n", qdev->ring_mem_size); + printk(KERN_ERR PFX "qdev->ring_mem = %p.\n", qdev->ring_mem); + printk(KERN_ERR PFX "qdev->intr_count = %d.\n", qdev->intr_count); + printk(KERN_ERR PFX "qdev->tx_ring = %p.\n", + qdev->tx_ring); + printk(KERN_ERR PFX "qdev->rss_ring_first_cq_id = %d.\n", + qdev->rss_ring_first_cq_id); + printk(KERN_ERR PFX "qdev->rss_ring_count = %d.\n", + qdev->rss_ring_count); + printk(KERN_ERR PFX "qdev->rx_ring = %p.\n", qdev->rx_ring); + printk(KERN_ERR PFX "qdev->default_rx_queue = %d.\n", + qdev->default_rx_queue); + printk(KERN_ERR PFX "qdev->xg_sem_mask = 0x%08x.\n", + qdev->xg_sem_mask); + printk(KERN_ERR PFX "qdev->port_link_up = 0x%08x.\n", + qdev->port_link_up); + printk(KERN_ERR PFX "qdev->port_init = 0x%08x.\n", + qdev->port_init); + +} +#endif + +#ifdef QL_CB_DUMP +void ql_dump_wqicb(struct wqicb *wqicb) +{ + printk(KERN_ERR PFX "Dumping wqicb stuff...\n"); + printk(KERN_ERR PFX "wqicb->len = 0x%x.\n", le16_to_cpu(wqicb->len)); + printk(KERN_ERR PFX "wqicb->flags = %x.\n", le16_to_cpu(wqicb->flags)); + printk(KERN_ERR PFX "wqicb->cq_id_rss = %d.\n", + le16_to_cpu(wqicb->cq_id_rss)); + printk(KERN_ERR PFX "wqicb->rid = 0x%x.\n", le16_to_cpu(wqicb->rid)); + printk(KERN_ERR PFX "wqicb->wq_addr_lo = 0x%.08x.\n", + le32_to_cpu(wqicb->addr_lo)); + printk(KERN_ERR PFX "wqicb->wq_addr_hi = 0x%.08x.\n", + le32_to_cpu(wqicb->addr_hi)); + printk(KERN_ERR PFX "wqicb->wq_cnsmr_idx_addr_lo = 0x%.08x.\n", + le32_to_cpu(wqicb->cnsmr_idx_addr_lo)); + printk(KERN_ERR PFX "wqicb->wq_cnsmr_idx_addr_hi = 0x%.08x.\n", + le32_to_cpu(wqicb->cnsmr_idx_addr_hi)); +} + +void ql_dump_tx_ring(struct tx_ring *tx_ring) +{ + if (tx_ring == NULL) + return; + printk(KERN_ERR PFX + "===================== Dumping tx_ring %d ===============.\n", + tx_ring->wq_id); + printk(KERN_ERR PFX "tx_ring->base = %p.\n", tx_ring->wq_base); + printk(KERN_ERR PFX "tx_ring->base_dma = 0x%llx.\n", + (u64) tx_ring->wq_base_dma); + printk(KERN_ERR PFX "tx_ring->cnsmr_idx_sh_reg = %p.\n", + tx_ring->cnsmr_idx_sh_reg); + printk(KERN_ERR PFX "tx_ring->cnsmr_idx_sh_reg_dma = 0x%llx.\n", + (u64) tx_ring->cnsmr_idx_sh_reg_dma); + printk(KERN_ERR PFX "tx_ring->size = %d.\n", tx_ring->wq_size); + printk(KERN_ERR PFX "tx_ring->len = %d.\n", tx_ring->wq_len); + printk(KERN_ERR PFX "tx_ring->prod_idx_db_reg = 
%p.\n", + tx_ring->prod_idx_db_reg); + printk(KERN_ERR PFX "tx_ring->valid_db_reg = %p.\n", + tx_ring->valid_db_reg); + printk(KERN_ERR PFX "tx_ring->prod_idx = %d.\n", tx_ring->prod_idx); + printk(KERN_ERR PFX "tx_ring->cq_id = %d.\n", tx_ring->cq_id); + printk(KERN_ERR PFX "tx_ring->wq_id = %d.\n", tx_ring->wq_id); + printk(KERN_ERR PFX "tx_ring->q = %p.\n", tx_ring->q); + printk(KERN_ERR PFX "tx_ring->tx_count = %d.\n", + atomic_read(&tx_ring->tx_count)); +} + +void ql_dump_ricb(struct ricb *ricb) +{ + int i; + printk(KERN_ERR PFX + "===================== Dumping ricb ===============.\n"); + printk(KERN_ERR PFX "Dumping ricb stuff...\n"); + + printk(KERN_ERR PFX "ricb->base_cq = %d.\n", ricb->base_cq & 0x1f); + printk(KERN_ERR PFX "ricb->flags = %s%s%s%s%s%s%s%s%s.\n", + ricb->base_cq & RSS_L4K ? "RSS_L4K " : "", + ricb->flags & RSS_L6K ? "RSS_L6K " : "", + ricb->flags & RSS_LI ? "RSS_LI " : "", + ricb->flags & RSS_LB ? "RSS_LB " : "", + ricb->flags & RSS_LM ? "RSS_LM " : "", + ricb->flags & RSS_RI4 ? "RSS_RI4 " : "", + ricb->flags & RSS_RT4 ? "RSS_RT4 " : "", + ricb->flags & RSS_RI6 ? "RSS_RI6 " : "", + ricb->flags & RSS_RT6 ? "RSS_RT6 " : ""); + printk(KERN_ERR PFX "ricb->mask = 0x%.04x.\n", le16_to_cpu(ricb->mask)); + for (i = 0; i < 16; i++) + printk(KERN_ERR PFX "ricb->hash_cq_id[%d] = 0x%.08x.\n", i, + le32_to_cpu(ricb->hash_cq_id[i])); + for (i = 0; i < 10; i++) + printk(KERN_ERR PFX "ricb->ipv6_hash_key[%d] = 0x%.08x.\n", i, + le32_to_cpu(ricb->ipv6_hash_key[i])); + for (i = 0; i < 4; i++) + printk(KERN_ERR PFX "ricb->ipv4_hash_key[%d] = 0x%.08x.\n", i, + le32_to_cpu(ricb->ipv4_hash_key[i])); +} + +void ql_dump_cqicb(struct cqicb *cqicb) +{ + printk(KERN_ERR PFX "Dumping cqicb stuff...\n"); + + printk(KERN_ERR PFX "cqicb->msix_vect = %d.\n", cqicb->msix_vect); + printk(KERN_ERR PFX "cqicb->flags = %x.\n", cqicb->flags); + printk(KERN_ERR PFX "cqicb->len = %d.\n", le16_to_cpu(cqicb->len)); + printk(KERN_ERR PFX "cqicb->addr_lo = %x.\n", + le32_to_cpu(cqicb->addr_lo)); + printk(KERN_ERR PFX "cqicb->addr_hi = %x.\n", + le32_to_cpu(cqicb->addr_hi)); + printk(KERN_ERR PFX "cqicb->prod_idx_addr_lo = %x.\n", + le32_to_cpu(cqicb->prod_idx_addr_lo)); + printk(KERN_ERR PFX "cqicb->prod_idx_addr_hi = %x.\n", + le32_to_cpu(cqicb->prod_idx_addr_hi)); + printk(KERN_ERR PFX "cqicb->pkt_delay = 0x%.04x.\n", + le16_to_cpu(cqicb->pkt_delay)); + printk(KERN_ERR PFX "cqicb->irq_delay = 0x%.04x.\n", + le16_to_cpu(cqicb->irq_delay)); + printk(KERN_ERR PFX "cqicb->lbq_addr_lo = %x.\n", + le32_to_cpu(cqicb->lbq_addr_lo)); + printk(KERN_ERR PFX "cqicb->lbq_addr_hi = %x.\n", + le32_to_cpu(cqicb->lbq_addr_hi)); + printk(KERN_ERR PFX "cqicb->lbq_buf_size = 0x%.04x.\n", + le16_to_cpu(cqicb->lbq_buf_size)); + printk(KERN_ERR PFX "cqicb->lbq_len = 0x%.04x.\n", + le16_to_cpu(cqicb->lbq_len)); + printk(KERN_ERR PFX "cqicb->sbq_addr_lo = %x.\n", + le32_to_cpu(cqicb->sbq_addr_lo)); + printk(KERN_ERR PFX "cqicb->sbq_addr_hi = %x.\n", + le32_to_cpu(cqicb->sbq_addr_hi)); + printk(KERN_ERR PFX "cqicb->sbq_buf_size = 0x%.04x.\n", + le16_to_cpu(cqicb->sbq_buf_size)); + printk(KERN_ERR PFX "cqicb->sbq_len = 0x%.04x.\n", + le16_to_cpu(cqicb->sbq_len)); +} + +void ql_dump_rx_ring(struct rx_ring *rx_ring) +{ + if (rx_ring == NULL) + return; + printk(KERN_ERR PFX + "===================== Dumping rx_ring %d ===============.\n", + rx_ring->cq_id); + printk(KERN_ERR PFX "Dumping rx_ring %d, type = %s%s%s.\n", + rx_ring->cq_id, rx_ring->type == DEFAULT_Q ? "DEFAULT" : "", + rx_ring->type == TX_Q ? 
"OUTBOUND COMPLETIONS" : "", + rx_ring->type == RX_Q ? "INBOUND_COMPLETIONS" : ""); + printk(KERN_ERR PFX "rx_ring->cqicb = %p.\n", &rx_ring->cqicb); + printk(KERN_ERR PFX "rx_ring->cq_base = %p.\n", rx_ring->cq_base); + printk(KERN_ERR PFX "rx_ring->cq_base_dma = %llx.\n", + (u64) rx_ring->cq_base_dma); + printk(KERN_ERR PFX "rx_ring->cq_size = %d.\n", rx_ring->cq_size); + printk(KERN_ERR PFX "rx_ring->cq_len = %d.\n", rx_ring->cq_len); + printk(KERN_ERR PFX + "rx_ring->prod_idx_sh_reg, addr = %p, value = %d.\n", + rx_ring->prod_idx_sh_reg, + rx_ring->prod_idx_sh_reg ? *(rx_ring->prod_idx_sh_reg) : 0); + printk(KERN_ERR PFX "rx_ring->prod_idx_sh_reg_dma = %llx.\n", + (u64) rx_ring->prod_idx_sh_reg_dma); + printk(KERN_ERR PFX "rx_ring->cnsmr_idx_db_reg = %p.\n", + rx_ring->cnsmr_idx_db_reg); + printk(KERN_ERR PFX "rx_ring->cnsmr_idx = %d.\n", rx_ring->cnsmr_idx); + printk(KERN_ERR PFX "rx_ring->curr_entry = %p.\n", rx_ring->curr_entry); + printk(KERN_ERR PFX "rx_ring->valid_db_reg = %p.\n", + rx_ring->valid_db_reg); + + printk(KERN_ERR PFX "rx_ring->lbq_base = %p.\n", rx_ring->lbq_base); + printk(KERN_ERR PFX "rx_ring->lbq_base_dma = %llx.\n", + (u64) rx_ring->lbq_base_dma); + printk(KERN_ERR PFX "rx_ring->lbq_base_indirect = %p.\n", + rx_ring->lbq_base_indirect); + printk(KERN_ERR PFX "rx_ring->lbq_base_indirect_dma = %llx.\n", + (u64) rx_ring->lbq_base_indirect_dma); + printk(KERN_ERR PFX "rx_ring->lbq = %p.\n", rx_ring->lbq); + printk(KERN_ERR PFX "rx_ring->lbq_len = %d.\n", rx_ring->lbq_len); + printk(KERN_ERR PFX "rx_ring->lbq_size = %d.\n", rx_ring->lbq_size); + printk(KERN_ERR PFX "rx_ring->lbq_prod_idx_db_reg = %p.\n", + rx_ring->lbq_prod_idx_db_reg); + printk(KERN_ERR PFX "rx_ring->lbq_prod_idx = %d.\n", + rx_ring->lbq_prod_idx); + printk(KERN_ERR PFX "rx_ring->lbq_curr_idx = %d.\n", + rx_ring->lbq_curr_idx); + printk(KERN_ERR PFX "rx_ring->lbq_clean_idx = %d.\n", + rx_ring->lbq_clean_idx); + printk(KERN_ERR PFX "rx_ring->lbq_free_cnt = %d.\n", + rx_ring->lbq_free_cnt); + printk(KERN_ERR PFX "rx_ring->lbq_buf_size = %d.\n", + rx_ring->lbq_buf_size); + + printk(KERN_ERR PFX "rx_ring->sbq_base = %p.\n", rx_ring->sbq_base); + printk(KERN_ERR PFX "rx_ring->sbq_base_dma = %llx.\n", + (u64) rx_ring->sbq_base_dma); + printk(KERN_ERR PFX "rx_ring->sbq_base_indirect = %p.\n", + rx_ring->sbq_base_indirect); + printk(KERN_ERR PFX "rx_ring->sbq_base_indirect_dma = %llx.\n", + (u64) rx_ring->sbq_base_indirect_dma); + printk(KERN_ERR PFX "rx_ring->sbq = %p.\n", rx_ring->sbq); + printk(KERN_ERR PFX "rx_ring->sbq_len = %d.\n", rx_ring->sbq_len); + printk(KERN_ERR PFX "rx_ring->sbq_size = %d.\n", rx_ring->sbq_size); + printk(KERN_ERR PFX "rx_ring->sbq_prod_idx_db_reg addr = %p.\n", + rx_ring->sbq_prod_idx_db_reg); + printk(KERN_ERR PFX "rx_ring->sbq_prod_idx = %d.\n", + rx_ring->sbq_prod_idx); + printk(KERN_ERR PFX "rx_ring->sbq_curr_idx = %d.\n", + rx_ring->sbq_curr_idx); + printk(KERN_ERR PFX "rx_ring->sbq_clean_idx = %d.\n", + rx_ring->sbq_clean_idx); + printk(KERN_ERR PFX "rx_ring->sbq_free_cnt = %d.\n", + rx_ring->sbq_free_cnt); + printk(KERN_ERR PFX "rx_ring->sbq_buf_size = %d.\n", + rx_ring->sbq_buf_size); + printk(KERN_ERR PFX "rx_ring->cq_id = %d.\n", rx_ring->cq_id); + printk(KERN_ERR PFX "rx_ring->irq = %d.\n", rx_ring->irq); + printk(KERN_ERR PFX "rx_ring->cpu = %d.\n", rx_ring->cpu); + printk(KERN_ERR PFX "rx_ring->qdev = %p.\n", rx_ring->qdev); +} + +void ql_dump_hw_cb(struct ql_adapter *qdev, int size, u32 bit, u16 q_id) +{ + void *ptr; + + printk(KERN_ERR PFX "%s: Enter.\n", 
__func__); + + ptr = kmalloc(size, GFP_ATOMIC); + if (ptr == NULL) { + printk(KERN_ERR PFX "%s: Couldn't allocate a buffer.\n", + __func__); + return; + } + + if (ql_write_cfg(qdev, ptr, size, bit, q_id)) { + printk(KERN_ERR "%s: Failed to upload control block!\n", + __func__); + goto fail_it; + } + switch (bit) { + case CFG_DRQ: + ql_dump_wqicb((struct wqicb *)ptr); + break; + case CFG_DCQ: + ql_dump_cqicb((struct cqicb *)ptr); + break; + case CFG_DR: + ql_dump_ricb((struct ricb *)ptr); + break; + default: + printk(KERN_ERR PFX "%s: Invalid bit value = %x.\n", + __func__, bit); + break; + } +fail_it: + kfree(ptr); +} +#endif + +#ifdef QL_OB_DUMP +void ql_dump_tx_desc(struct tx_buf_desc *tbd) +{ + printk(KERN_ERR PFX "tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + printk(KERN_ERR PFX "tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + printk(KERN_ERR PFX "tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? "E" : "."); + tbd++; + printk(KERN_ERR PFX "tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + printk(KERN_ERR PFX "tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + printk(KERN_ERR PFX "tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? "E" : "."); + tbd++; + printk(KERN_ERR PFX "tbd->addr = 0x%llx\n", + le64_to_cpu((u64) tbd->addr)); + printk(KERN_ERR PFX "tbd->len = %d\n", + le32_to_cpu(tbd->len & TX_DESC_LEN_MASK)); + printk(KERN_ERR PFX "tbd->flags = %s %s\n", + tbd->len & TX_DESC_C ? "C" : ".", + tbd->len & TX_DESC_E ? "E" : "."); + +} + +void ql_dump_ob_mac_iocb(struct ob_mac_iocb_req *ob_mac_iocb) +{ + struct ob_mac_tso_iocb_req *ob_mac_tso_iocb = + (struct ob_mac_tso_iocb_req *)ob_mac_iocb; + struct tx_buf_desc *tbd; + u16 frame_len; + + printk(KERN_ERR PFX "%s\n", __func__); + printk(KERN_ERR PFX "opcode = %s\n", + (ob_mac_iocb->opcode == OPCODE_OB_MAC_IOCB) ? "MAC" : "TSO"); + printk(KERN_ERR PFX "flags1 = %s %s %s %s %s\n", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_OI ? "OI" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_I ? "I" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_D ? "D" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_IP4 ? "IP4" : "", + ob_mac_tso_iocb->flags1 & OB_MAC_TSO_IOCB_IP6 ? "IP6" : ""); + printk(KERN_ERR PFX "flags2 = %s %s %s\n", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_LSO ? "LSO" : "", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_UC ? "UC" : "", + ob_mac_tso_iocb->flags2 & OB_MAC_TSO_IOCB_TC ? "TC" : ""); + printk(KERN_ERR PFX "flags3 = %s %s %s \n", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_IC ? "IC" : "", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_DFP ? "DFP" : "", + ob_mac_tso_iocb->flags3 & OB_MAC_TSO_IOCB_V ? 
"V" : ""); + printk(KERN_ERR PFX "tid = %x\n", ob_mac_iocb->tid); + printk(KERN_ERR PFX "txq_idx = %d\n", ob_mac_iocb->txq_idx); + printk(KERN_ERR PFX "vlan_tci = %x\n", ob_mac_tso_iocb->vlan_tci); + if (ob_mac_iocb->opcode == OPCODE_OB_MAC_TSO_IOCB) { + printk(KERN_ERR PFX "frame_len = %d\n", + le32_to_cpu(ob_mac_tso_iocb->frame_len)); + printk(KERN_ERR PFX "mss = %d\n", + le16_to_cpu(ob_mac_tso_iocb->mss)); + printk(KERN_ERR PFX "prot_hdr_len = %d\n", + le16_to_cpu(ob_mac_tso_iocb->total_hdrs_len)); + printk(KERN_ERR PFX "hdr_offset = 0x%.04x\n", + le16_to_cpu(ob_mac_tso_iocb->net_trans_offset)); + frame_len = le32_to_cpu(ob_mac_tso_iocb->frame_len); + } else { + printk(KERN_ERR PFX "frame_len = %d\n", + le16_to_cpu(ob_mac_iocb->frame_len)); + frame_len = le16_to_cpu(ob_mac_iocb->frame_len); + } + tbd = &ob_mac_iocb->tbd[0]; + ql_dump_tx_desc(tbd); +} + +void ql_dump_ob_mac_rsp(struct ob_mac_iocb_rsp *ob_mac_rsp) +{ + printk(KERN_ERR PFX "%s\n", __func__); + printk(KERN_ERR PFX "opcode = %d\n", ob_mac_rsp->opcode); + printk(KERN_ERR PFX "flags = %s %s %s %s %s %s %s\n", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_OI ? "OI" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_I ? "I" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_E ? "E" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_S ? "S" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_L ? "L" : ".", + ob_mac_rsp->flags1 & OB_MAC_IOCB_RSP_P ? "P" : ".", + ob_mac_rsp->flags2 & OB_MAC_IOCB_RSP_B ? "B" : "."); + printk(KERN_ERR PFX "tid = %x\n", ob_mac_rsp->tid); +} +#endif + +#ifdef QL_IB_DUMP +void ql_dump_ib_mac_rsp(struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + printk(KERN_ERR PFX "%s\n", __func__); + printk(KERN_ERR PFX "opcode = 0x%x\n", ib_mac_rsp->opcode); + printk(KERN_ERR PFX "flags1 = %s%s%s%s%s%s\n", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_OI ? "OI " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_I ? "I " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_TE ? "TE " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_NU ? "NU " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_IE ? "IE " : "", + ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_B ? "B " : ""); + + if (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) + printk(KERN_ERR PFX "%s%s%s Multicast.\n", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_HASH ? "Hash" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_REG ? "Registered" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_PROM ? "Promiscuous" : ""); + + printk(KERN_ERR PFX "flags2 = %s%s%s%s%s\n", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_P) ? "P " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) ? "V " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) ? "U " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) ? "T " : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_FO) ? "FO " : ""); + + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) + printk(KERN_ERR PFX "%s%s%s%s%s error.\n", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_OVERSIZE ? "oversize" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_UNDERSIZE ? "undersize" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_PREAMBLE ? "preamble" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_FRAME_LEN ? "frame length" : "", + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_ERR_MASK) == + IB_MAC_IOCB_RSP_ERR_CRC ? "CRC" : ""); + + printk(KERN_ERR PFX "flags3 = %s%s.\n", + ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DS ? 
"DS " : "", + ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL ? "DL " : ""); + + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) + printk(KERN_ERR PFX "RSS flags = %s%s%s%s.\n", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_IPV4) ? "IPv4 RSS" : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_IPV6) ? "IPv6 RSS " : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_TCP_V4) ? "TCP/IPv4 RSS" : "", + ((ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) == + IB_MAC_IOCB_RSP_M_TCP_V6) ? "TCP/IPv6 RSS" : ""); + + printk(KERN_ERR PFX "data_len = %d\n", + le32_to_cpu(ib_mac_rsp->data_len)); + printk(KERN_ERR PFX "data_addr_hi = 0x%x\n", + le32_to_cpu(ib_mac_rsp->data_addr_hi)); + printk(KERN_ERR PFX "data_addr_lo = 0x%x\n", + le32_to_cpu(ib_mac_rsp->data_addr_lo)); + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_RSS_MASK) + printk(KERN_ERR PFX "rss = %x\n", + le32_to_cpu(ib_mac_rsp->rss)); + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) + printk(KERN_ERR PFX "vlan_id = %x\n", + le16_to_cpu(ib_mac_rsp->vlan_id)); + + printk(KERN_ERR PFX "flags4 = %s%s%s.\n", + le32_to_cpu(ib_mac_rsp-> + flags4) & IB_MAC_IOCB_RSP_HV ? "HV " : "", + le32_to_cpu(ib_mac_rsp-> + flags4) & IB_MAC_IOCB_RSP_HS ? "HS " : "", + le32_to_cpu(ib_mac_rsp-> + flags4) & IB_MAC_IOCB_RSP_HL ? "HL " : ""); + + if (le32_to_cpu(ib_mac_rsp->flags4) & IB_MAC_IOCB_RSP_HV) { + printk(KERN_ERR PFX "hdr length = %d.\n", + le32_to_cpu(ib_mac_rsp->hdr_len)); + printk(KERN_ERR PFX "hdr addr_hi = 0x%x.\n", + le32_to_cpu(ib_mac_rsp->hdr_addr_hi)); + printk(KERN_ERR PFX "hdr addr_lo = 0x%x.\n", + le32_to_cpu(ib_mac_rsp->hdr_addr_lo)); + } +} +#endif + +#ifdef QL_ALL_DUMP +void ql_dump_all(struct ql_adapter *qdev) +{ + int i; + + QL_DUMP_REGS(qdev); + QL_DUMP_QDEV(qdev); + for (i = 0; i < qdev->tx_ring_count; i++) { + QL_DUMP_TX_RING(&qdev->tx_ring[i]); + QL_DUMP_WQICB((struct wqicb *)&qdev->tx_ring[i]); + } + for (i = 0; i < qdev->rx_ring_count; i++) { + QL_DUMP_RX_RING(&qdev->rx_ring[i]); + QL_DUMP_CQICB((struct cqicb *)&qdev->rx_ring[i]); + } +} +#endif diff --git a/drivers/net/qlge/qlge_ethtool.c b/drivers/net/qlge/qlge_ethtool.c new file mode 100644 index 00000000000..6457f8c4fda --- /dev/null +++ b/drivers/net/qlge/qlge_ethtool.c @@ -0,0 +1,415 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "qlge.h" + +static int ql_update_ring_coalescing(struct ql_adapter *qdev) +{ + int i, status = 0; + struct rx_ring *rx_ring; + struct cqicb *cqicb; + + if (!netif_running(qdev->ndev)) + return status; + + spin_lock(&qdev->hw_lock); + /* Skip the default queue, and update the outbound handler + * queues if they changed. 
+ */ + cqicb = (struct cqicb *)&qdev->rx_ring[1]; + if (le16_to_cpu(cqicb->irq_delay) != qdev->tx_coalesce_usecs || + le16_to_cpu(cqicb->pkt_delay) != qdev->tx_max_coalesced_frames) { + for (i = 1; i < qdev->rss_ring_first_cq_id; i++, rx_ring++) { + rx_ring = &qdev->rx_ring[i]; + cqicb = (struct cqicb *)rx_ring; + cqicb->irq_delay = le16_to_cpu(qdev->tx_coalesce_usecs); + cqicb->pkt_delay = + le16_to_cpu(qdev->tx_max_coalesced_frames); + cqicb->flags = FLAGS_LI; + status = ql_write_cfg(qdev, cqicb, sizeof(cqicb), + CFG_LCQ, rx_ring->cq_id); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to load CQICB.\n"); + goto exit; + } + } + } + + /* Update the inbound (RSS) handler queues if they changed. */ + cqicb = (struct cqicb *)&qdev->rx_ring[qdev->rss_ring_first_cq_id]; + if (le16_to_cpu(cqicb->irq_delay) != qdev->rx_coalesce_usecs || + le16_to_cpu(cqicb->pkt_delay) != qdev->rx_max_coalesced_frames) { + for (i = qdev->rss_ring_first_cq_id; + i <= qdev->rss_ring_first_cq_id + qdev->rss_ring_count; + i++) { + rx_ring = &qdev->rx_ring[i]; + cqicb = (struct cqicb *)rx_ring; + cqicb->irq_delay = le16_to_cpu(qdev->rx_coalesce_usecs); + cqicb->pkt_delay = + le16_to_cpu(qdev->rx_max_coalesced_frames); + cqicb->flags = FLAGS_LI; + status = ql_write_cfg(qdev, cqicb, sizeof(cqicb), + CFG_LCQ, rx_ring->cq_id); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to load CQICB.\n"); + goto exit; + } + } + } +exit: + spin_unlock(&qdev->hw_lock); + return status; +} + +void ql_update_stats(struct ql_adapter *qdev) +{ + u32 i; + u64 data; + u64 *iter = &qdev->nic_stats.tx_pkts; + + spin_lock(&qdev->stats_lock); + if (ql_sem_spinlock(qdev, qdev->xg_sem_mask)) { + QPRINTK(qdev, DRV, ERR, + "Couldn't get xgmac sem.\n"); + goto quit; + } + /* + * Get TX statistics. + */ + for (i = 0x200; i < 0x280; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + QPRINTK(qdev, DRV, ERR, + "Error reading status register 0x%.04x.\n", i); + goto end; + } else + *iter = data; + iter++; + } + + /* + * Get RX statistics. 
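+	 * The XGMAC RX counters live at register offsets 0x300-0x3c8 and are
+	 * read 64 bits at a time; the walk assumes struct nic_stats lays out
+	 * its rx_* fields in the same order as the register block, since each
+	 * value is stored through the advancing 'iter' pointer.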
+ */ + for (i = 0x300; i < 0x3d0; i += 8) { + if (ql_read_xgmac_reg64(qdev, i, &data)) { + QPRINTK(qdev, DRV, ERR, + "Error reading status register 0x%.04x.\n", i); + goto end; + } else + *iter = data; + iter++; + } + +end: + ql_sem_unlock(qdev, qdev->xg_sem_mask); +quit: + spin_unlock(&qdev->stats_lock); + + QL_DUMP_STAT(qdev); + + return; +} + +static char ql_stats_str_arr[][ETH_GSTRING_LEN] = { + {"tx_pkts"}, + {"tx_bytes"}, + {"tx_mcast_pkts"}, + {"tx_bcast_pkts"}, + {"tx_ucast_pkts"}, + {"tx_ctl_pkts"}, + {"tx_pause_pkts"}, + {"tx_64_pkts"}, + {"tx_65_to_127_pkts"}, + {"tx_128_to_255_pkts"}, + {"tx_256_511_pkts"}, + {"tx_512_to_1023_pkts"}, + {"tx_1024_to_1518_pkts"}, + {"tx_1519_to_max_pkts"}, + {"tx_undersize_pkts"}, + {"tx_oversize_pkts"}, + {"rx_bytes"}, + {"rx_bytes_ok"}, + {"rx_pkts"}, + {"rx_pkts_ok"}, + {"rx_bcast_pkts"}, + {"rx_mcast_pkts"}, + {"rx_ucast_pkts"}, + {"rx_undersize_pkts"}, + {"rx_oversize_pkts"}, + {"rx_jabber_pkts"}, + {"rx_undersize_fcerr_pkts"}, + {"rx_drop_events"}, + {"rx_fcerr_pkts"}, + {"rx_align_err"}, + {"rx_symbol_err"}, + {"rx_mac_err"}, + {"rx_ctl_pkts"}, + {"rx_pause_pkts"}, + {"rx_64_pkts"}, + {"rx_65_to_127_pkts"}, + {"rx_128_255_pkts"}, + {"rx_256_511_pkts"}, + {"rx_512_to_1023_pkts"}, + {"rx_1024_to_1518_pkts"}, + {"rx_1519_to_max_pkts"}, + {"rx_len_err_pkts"}, +}; + +static void ql_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + switch (stringset) { + case ETH_SS_STATS: + memcpy(buf, ql_stats_str_arr, sizeof(ql_stats_str_arr)); + break; + } +} + +static int ql_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(ql_stats_str_arr); + default: + return -EOPNOTSUPP; + } +} + +static void +ql_get_ethtool_stats(struct net_device *ndev, + struct ethtool_stats *stats, u64 *data) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + struct nic_stats *s = &qdev->nic_stats; + + ql_update_stats(qdev); + + *data++ = s->tx_pkts; + *data++ = s->tx_bytes; + *data++ = s->tx_mcast_pkts; + *data++ = s->tx_bcast_pkts; + *data++ = s->tx_ucast_pkts; + *data++ = s->tx_ctl_pkts; + *data++ = s->tx_pause_pkts; + *data++ = s->tx_64_pkt; + *data++ = s->tx_65_to_127_pkt; + *data++ = s->tx_128_to_255_pkt; + *data++ = s->tx_256_511_pkt; + *data++ = s->tx_512_to_1023_pkt; + *data++ = s->tx_1024_to_1518_pkt; + *data++ = s->tx_1519_to_max_pkt; + *data++ = s->tx_undersize_pkt; + *data++ = s->tx_oversize_pkt; + *data++ = s->rx_bytes; + *data++ = s->rx_bytes_ok; + *data++ = s->rx_pkts; + *data++ = s->rx_pkts_ok; + *data++ = s->rx_bcast_pkts; + *data++ = s->rx_mcast_pkts; + *data++ = s->rx_ucast_pkts; + *data++ = s->rx_undersize_pkts; + *data++ = s->rx_oversize_pkts; + *data++ = s->rx_jabber_pkts; + *data++ = s->rx_undersize_fcerr_pkts; + *data++ = s->rx_drop_events; + *data++ = s->rx_fcerr_pkts; + *data++ = s->rx_align_err; + *data++ = s->rx_symbol_err; + *data++ = s->rx_mac_err; + *data++ = s->rx_ctl_pkts; + *data++ = s->rx_pause_pkts; + *data++ = s->rx_64_pkts; + *data++ = s->rx_65_to_127_pkts; + *data++ = s->rx_128_255_pkts; + *data++ = s->rx_256_511_pkts; + *data++ = s->rx_512_to_1023_pkts; + *data++ = s->rx_1024_to_1518_pkts; + *data++ = s->rx_1519_to_max_pkts; + *data++ = s->rx_len_err_pkts; +} + +static int ql_get_settings(struct net_device *ndev, + struct ethtool_cmd *ecmd) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + ecmd->supported = SUPPORTED_10000baseT_Full; + ecmd->advertising = ADVERTISED_10000baseT_Full; + ecmd->autoneg = AUTONEG_ENABLE; + ecmd->transceiver = XCVR_EXTERNAL; + if 
((qdev->link_status & LINK_TYPE_MASK) == LINK_TYPE_10GBASET) { + ecmd->supported |= (SUPPORTED_TP | SUPPORTED_Autoneg); + ecmd->advertising |= (ADVERTISED_TP | ADVERTISED_Autoneg); + ecmd->port = PORT_TP; + } else { + ecmd->supported |= SUPPORTED_FIBRE; + ecmd->advertising |= ADVERTISED_FIBRE; + ecmd->port = PORT_FIBRE; + } + + ecmd->speed = SPEED_10000; + ecmd->duplex = DUPLEX_FULL; + + return 0; +} + +static void ql_get_drvinfo(struct net_device *ndev, + struct ethtool_drvinfo *drvinfo) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + strncpy(drvinfo->driver, qlge_driver_name, 32); + strncpy(drvinfo->version, qlge_driver_version, 32); + strncpy(drvinfo->fw_version, "N/A", 32); + strncpy(drvinfo->bus_info, pci_name(qdev->pdev), 32); + drvinfo->n_stats = 0; + drvinfo->testinfo_len = 0; + drvinfo->regdump_len = 0; + drvinfo->eedump_len = 0; +} + +static int ql_get_coalesce(struct net_device *dev, struct ethtool_coalesce *c) +{ + struct ql_adapter *qdev = netdev_priv(dev); + + c->rx_coalesce_usecs = qdev->rx_coalesce_usecs; + c->tx_coalesce_usecs = qdev->tx_coalesce_usecs; + + /* This chip coalesces as follows: + * If a packet arrives, hold off interrupts until + * cqicb->int_delay expires, but if no other packets arrive don't + * wait longer than cqicb->pkt_int_delay. But ethtool doesn't use a + * timer to coalesce on a frame basis. So, we have to take ethtool's + * max_coalesced_frames value and convert it to a delay in microseconds. + * We do this by using a basic thoughput of 1,000,000 frames per + * second @ (1024 bytes). This means one frame per usec. So it's a + * simple one to one ratio. + */ + c->rx_max_coalesced_frames = qdev->rx_max_coalesced_frames; + c->tx_max_coalesced_frames = qdev->tx_max_coalesced_frames; + + return 0; +} + +static int ql_set_coalesce(struct net_device *ndev, struct ethtool_coalesce *c) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + /* Validate user parameters. */ + if (c->rx_coalesce_usecs > qdev->rx_ring_size / 2) + return -EINVAL; + /* Don't wait more than 10 usec. */ + if (c->rx_max_coalesced_frames > MAX_INTER_FRAME_WAIT) + return -EINVAL; + if (c->tx_coalesce_usecs > qdev->tx_ring_size / 2) + return -EINVAL; + if (c->tx_max_coalesced_frames > MAX_INTER_FRAME_WAIT) + return -EINVAL; + + /* Verify a change took place before updating the hardware. 
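+	 * If none of the four coalescing parameters changed we return 0
+	 * early, without reloading any CQICBs through
+	 * ql_update_ring_coalescing().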
*/ + if (qdev->rx_coalesce_usecs == c->rx_coalesce_usecs && + qdev->tx_coalesce_usecs == c->tx_coalesce_usecs && + qdev->rx_max_coalesced_frames == c->rx_max_coalesced_frames && + qdev->tx_max_coalesced_frames == c->tx_max_coalesced_frames) + return 0; + + qdev->rx_coalesce_usecs = c->rx_coalesce_usecs; + qdev->tx_coalesce_usecs = c->tx_coalesce_usecs; + qdev->rx_max_coalesced_frames = c->rx_max_coalesced_frames; + qdev->tx_max_coalesced_frames = c->tx_max_coalesced_frames; + + return ql_update_ring_coalescing(qdev); +} + +static u32 ql_get_rx_csum(struct net_device *netdev) +{ + struct ql_adapter *qdev = netdev_priv(netdev); + return qdev->rx_csum; +} + +static int ql_set_rx_csum(struct net_device *netdev, uint32_t data) +{ + struct ql_adapter *qdev = netdev_priv(netdev); + qdev->rx_csum = data; + return 0; +} + +static int ql_set_tso(struct net_device *ndev, uint32_t data) +{ + + if (data) { + ndev->features |= NETIF_F_TSO; + ndev->features |= NETIF_F_TSO6; + } else { + ndev->features &= ~NETIF_F_TSO; + ndev->features &= ~NETIF_F_TSO6; + } + return 0; +} + +static u32 ql_get_msglevel(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + return qdev->msg_enable; +} + +static void ql_set_msglevel(struct net_device *ndev, u32 value) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + qdev->msg_enable = value; +} + +const struct ethtool_ops qlge_ethtool_ops = { + .get_settings = ql_get_settings, + .get_drvinfo = ql_get_drvinfo, + .get_msglevel = ql_get_msglevel, + .set_msglevel = ql_set_msglevel, + .get_link = ethtool_op_get_link, + .get_rx_csum = ql_get_rx_csum, + .set_rx_csum = ql_set_rx_csum, + .get_tx_csum = ethtool_op_get_tx_csum, + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, + .get_tso = ethtool_op_get_tso, + .set_tso = ql_set_tso, + .get_coalesce = ql_get_coalesce, + .set_coalesce = ql_set_coalesce, + .get_sset_count = ql_get_sset_count, + .get_strings = ql_get_strings, + .get_ethtool_stats = ql_get_ethtool_stats, +}; + diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c new file mode 100644 index 00000000000..ad878e2b9de --- /dev/null +++ b/drivers/net/qlge/qlge_main.c @@ -0,0 +1,3954 @@ +/* + * QLogic qlge NIC HBA Driver + * Copyright (c) 2003-2008 QLogic Corporation + * See LICENSE.qlge for copyright and licensing details. 
+ * Author: Linux qlge network device driver by + * Ron Mercer + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qlge.h" + +char qlge_driver_name[] = DRV_NAME; +const char qlge_driver_version[] = DRV_VERSION; + +MODULE_AUTHOR("Ron Mercer "); +MODULE_DESCRIPTION(DRV_STRING " "); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg = + NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK | +/* NETIF_MSG_TIMER | */ + NETIF_MSG_IFDOWN | + NETIF_MSG_IFUP | + NETIF_MSG_RX_ERR | + NETIF_MSG_TX_ERR | + NETIF_MSG_TX_QUEUED | + NETIF_MSG_INTR | NETIF_MSG_TX_DONE | NETIF_MSG_RX_STATUS | +/* NETIF_MSG_PKTDATA | */ + NETIF_MSG_HW | NETIF_MSG_WOL | 0; + +static int debug = 0x00007fff; /* defaults above */ +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +#define MSIX_IRQ 0 +#define MSI_IRQ 1 +#define LEG_IRQ 2 +static int irq_type = MSIX_IRQ; +module_param(irq_type, int, MSIX_IRQ); +MODULE_PARM_DESC(irq_type, "0 = MSI-X, 1 = MSI, 2 = Legacy."); + +static struct pci_device_id qlge_pci_tbl[] __devinitdata = { + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QLGE_DEVICE_ID)}, + {PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, QLGE_DEVICE_ID1)}, + /* required last entry */ + {0,} +}; + +MODULE_DEVICE_TABLE(pci, qlge_pci_tbl); + +/* This hardware semaphore causes exclusive access to + * resources shared between the NIC driver, MPI firmware, + * FCOE firmware and the FC driver. + */ +static int ql_sem_trylock(struct ql_adapter *qdev, u32 sem_mask) +{ + u32 sem_bits = 0; + + switch (sem_mask) { + case SEM_XGMAC0_MASK: + sem_bits = SEM_SET << SEM_XGMAC0_SHIFT; + break; + case SEM_XGMAC1_MASK: + sem_bits = SEM_SET << SEM_XGMAC1_SHIFT; + break; + case SEM_ICB_MASK: + sem_bits = SEM_SET << SEM_ICB_SHIFT; + break; + case SEM_MAC_ADDR_MASK: + sem_bits = SEM_SET << SEM_MAC_ADDR_SHIFT; + break; + case SEM_FLASH_MASK: + sem_bits = SEM_SET << SEM_FLASH_SHIFT; + break; + case SEM_PROBE_MASK: + sem_bits = SEM_SET << SEM_PROBE_SHIFT; + break; + case SEM_RT_IDX_MASK: + sem_bits = SEM_SET << SEM_RT_IDX_SHIFT; + break; + case SEM_PROC_REG_MASK: + sem_bits = SEM_SET << SEM_PROC_REG_SHIFT; + break; + default: + QPRINTK(qdev, PROBE, ALERT, "Bad Semaphore mask!.\n"); + return -EINVAL; + } + + ql_write32(qdev, SEM, sem_bits | sem_mask); + return !(ql_read32(qdev, SEM) & sem_bits); +} + +int ql_sem_spinlock(struct ql_adapter *qdev, u32 sem_mask) +{ + unsigned int seconds = 3; + do { + if (!ql_sem_trylock(qdev, sem_mask)) + return 0; + ssleep(1); + } while (--seconds); + return -ETIMEDOUT; +} + +void ql_sem_unlock(struct ql_adapter *qdev, u32 sem_mask) +{ + ql_write32(qdev, SEM, sem_mask); + ql_read32(qdev, SEM); /* flush */ +} + +/* This function waits for a specific bit to come ready + * in a given register. It is used mostly by the initialize + * process, but is also used in kernel thread API such as + * netdev->set_multi, netdev->set_mac_address, netdev->vlan_rx_add_vid. 
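+ *
+ * The register is polled up to UDELAY_COUNT times, UDELAY_DELAY
+ * microseconds apart: it returns 0 once 'bit' is set, -EIO if 'err_bit'
+ * comes up first, and -ETIMEDOUT if the polling budget is exhausted.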
+ */ +int ql_wait_reg_rdy(struct ql_adapter *qdev, u32 reg, u32 bit, u32 err_bit) +{ + u32 temp; + int count = UDELAY_COUNT; + + while (count) { + temp = ql_read32(qdev, reg); + + /* check for errors */ + if (temp & err_bit) { + QPRINTK(qdev, PROBE, ALERT, + "register 0x%.08x access error, value = 0x%.08x!.\n", + reg, temp); + return -EIO; + } else if (temp & bit) + return 0; + udelay(UDELAY_DELAY); + count--; + } + QPRINTK(qdev, PROBE, ALERT, + "Timed out waiting for reg %x to come ready.\n", reg); + return -ETIMEDOUT; +} + +/* The CFG register is used to download TX and RX control blocks + * to the chip. This function waits for an operation to complete. + */ +static int ql_wait_cfg(struct ql_adapter *qdev, u32 bit) +{ + int count = UDELAY_COUNT; + u32 temp; + + while (count) { + temp = ql_read32(qdev, CFG); + if (temp & CFG_LE) + return -EIO; + if (!(temp & bit)) + return 0; + udelay(UDELAY_DELAY); + count--; + } + return -ETIMEDOUT; +} + + +/* Used to issue init control blocks to hw. Maps control block, + * sets address, triggers download, waits for completion. + */ +int ql_write_cfg(struct ql_adapter *qdev, void *ptr, int size, u32 bit, + u16 q_id) +{ + u64 map; + int status = 0; + int direction; + u32 mask; + u32 value; + + direction = + (bit & (CFG_LRQ | CFG_LR | CFG_LCQ)) ? PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE; + + map = pci_map_single(qdev->pdev, ptr, size, direction); + if (pci_dma_mapping_error(qdev->pdev, map)) { + QPRINTK(qdev, IFUP, ERR, "Couldn't map DMA area.\n"); + return -ENOMEM; + } + + status = ql_wait_cfg(qdev, bit); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Timed out waiting for CFG to come ready.\n"); + goto exit; + } + + status = ql_sem_spinlock(qdev, SEM_ICB_MASK); + if (status) + goto exit; + ql_write32(qdev, ICB_L, (u32) map); + ql_write32(qdev, ICB_H, (u32) (map >> 32)); + ql_sem_unlock(qdev, SEM_ICB_MASK); /* does flush too */ + + mask = CFG_Q_MASK | (bit << 16); + value = bit | (q_id << CFG_Q_SHIFT); + ql_write32(qdev, CFG, (mask | value)); + + /* + * Wait for the bit to clear after signaling hw. + */ + status = ql_wait_cfg(qdev, bit); +exit: + pci_unmap_single(qdev->pdev, map, size, direction); + return status; +} + +/* Get a specific MAC address from the CAM. Used for debug and reg dump. 
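+ * For MULTI_MAC/CAM_MAC types the address is read back as two 32-bit
+ * words (lower half, then upper half); CAM entries return a third word
+ * holding the output/routing value. Each access is gated on the
+ * MAC_ADDR_MW/MAC_ADDR_MR ready bits.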
*/ +int ql_get_mac_addr_reg(struct ql_adapter *qdev, u32 type, u16 index, + u32 *value) +{ + u32 offset = 0; + int status; + + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + switch (type) { + case MAC_ADDR_TYPE_MULTI_MAC: + case MAC_ADDR_TYPE_CAM_MAC: + { + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MR, MAC_ADDR_E); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MR, MAC_ADDR_E); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + if (type == MAC_ADDR_TYPE_CAM_MAC) { + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + MAC_ADDR_ADR | MAC_ADDR_RS | type); /* type */ + status = + ql_wait_reg_rdy(qdev, MAC_ADDR_IDX, + MAC_ADDR_MR, MAC_ADDR_E); + if (status) + goto exit; + *value++ = ql_read32(qdev, MAC_ADDR_DATA); + } + break; + } + case MAC_ADDR_TYPE_VLAN: + case MAC_ADDR_TYPE_MULTI_FLTR: + default: + QPRINTK(qdev, IFUP, CRIT, + "Address type %d not yet supported.\n", type); + status = -EPERM; + } +exit: + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + return status; +} + +/* Set up a MAC, multicast or VLAN address for the + * inbound frame matching. + */ +static int ql_set_mac_addr_reg(struct ql_adapter *qdev, u8 *addr, u32 type, + u16 index) +{ + u32 offset = 0; + int status = 0; + + status = ql_sem_spinlock(qdev, SEM_MAC_ADDR_MASK); + if (status) + return status; + switch (type) { + case MAC_ADDR_TYPE_MULTI_MAC: + case MAC_ADDR_TYPE_CAM_MAC: + { + u32 cam_output; + u32 upper = (addr[0] << 8) | addr[1]; + u32 lower = + (addr[2] << 24) | (addr[3] << 16) | (addr[4] << 8) | + (addr[5]); + + QPRINTK(qdev, IFUP, INFO, + "Adding %s address %02x:%02x:%02x:%02x:%02x:%02x" + " at index %d in the CAM.\n", + ((type == + MAC_ADDR_TYPE_MULTI_MAC) ? "MULTICAST" : + "UNICAST"), addr[0], addr[1], addr[2], addr[3], + addr[4], addr[5], index); + + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + ql_write32(qdev, MAC_ADDR_DATA, lower); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset++) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + ql_write32(qdev, MAC_ADDR_DATA, upper); + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, (offset) | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type); /* type */ + /* This field should also include the queue id + and possibly the function id. Right now we hardcode + the route field to NIC core. 
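+			 * The value written below combines CAM_OUT_ROUTE_NIC,
+			 * the function number shifted into CAM_OUT_FUNC_SHIFT
+			 * and the first RSS completion queue id shifted into
+			 * CAM_OUT_CQ_ID_SHIFT, plus CAM_OUT_RV when a VLAN
+			 * group is registered.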
+ */ + if (type == MAC_ADDR_TYPE_CAM_MAC) { + cam_output = (CAM_OUT_ROUTE_NIC | + (qdev-> + func << CAM_OUT_FUNC_SHIFT) | + (qdev-> + rss_ring_first_cq_id << + CAM_OUT_CQ_ID_SHIFT)); + if (qdev->vlgrp) + cam_output |= CAM_OUT_RV; + /* route to NIC core */ + ql_write32(qdev, MAC_ADDR_DATA, cam_output); + } + break; + } + case MAC_ADDR_TYPE_VLAN: + { + u32 enable_bit = *((u32 *) &addr[0]); + /* For VLAN, the addr actually holds a bit that + * either enables or disables the vlan id we are + * addressing. It's either MAC_ADDR_E on or off. + * That's bit-27 we're talking about. + */ + QPRINTK(qdev, IFUP, INFO, "%s VLAN ID %d %s the CAM.\n", + (enable_bit ? "Adding" : "Removing"), + index, (enable_bit ? "to" : "from")); + + status = + ql_wait_reg_rdy(qdev, + MAC_ADDR_IDX, MAC_ADDR_MW, MAC_ADDR_E); + if (status) + goto exit; + ql_write32(qdev, MAC_ADDR_IDX, offset | /* offset */ + (index << MAC_ADDR_IDX_SHIFT) | /* index */ + type | /* type */ + enable_bit); /* enable/disable */ + break; + } + case MAC_ADDR_TYPE_MULTI_FLTR: + default: + QPRINTK(qdev, IFUP, CRIT, + "Address type %d not yet supported.\n", type); + status = -EPERM; + } +exit: + ql_sem_unlock(qdev, SEM_MAC_ADDR_MASK); + return status; +} + +/* Get a specific frame routing value from the CAM. + * Used for debug and reg dump. + */ +int ql_get_routing_reg(struct ql_adapter *qdev, u32 index, u32 *value) +{ + int status = 0; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + goto exit; + + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MW, RT_IDX_E); + if (status) + goto exit; + + ql_write32(qdev, RT_IDX, + RT_IDX_TYPE_NICQ | RT_IDX_RS | (index << RT_IDX_IDX_SHIFT)); + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MR, RT_IDX_E); + if (status) + goto exit; + *value = ql_read32(qdev, RT_DATA); +exit: + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); + return status; +} + +/* The NIC function for this chip has 16 routing indexes. Each one can be used + * to route different frame types to various inbound queues. We send broadcast/ + * multicast/error frames to the default queue for slow handling, + * and CAM hit/RSS frames to the fast handling queues. + */ +static int ql_set_routing_reg(struct ql_adapter *qdev, u32 index, u32 mask, + int enable) +{ + int status; + u32 value = 0; + + status = ql_sem_spinlock(qdev, SEM_RT_IDX_MASK); + if (status) + return status; + + QPRINTK(qdev, IFUP, DEBUG, + "%s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s mask %s the routing reg.\n", + (enable ? "Adding" : "Removing"), + ((index == RT_IDX_ALL_ERR_SLOT) ? "MAC ERROR/ALL ERROR" : ""), + ((index == RT_IDX_IP_CSUM_ERR_SLOT) ? "IP CSUM ERROR" : ""), + ((index == + RT_IDX_TCP_UDP_CSUM_ERR_SLOT) ? "TCP/UDP CSUM ERROR" : ""), + ((index == RT_IDX_BCAST_SLOT) ? "BROADCAST" : ""), + ((index == RT_IDX_MCAST_MATCH_SLOT) ? "MULTICAST MATCH" : ""), + ((index == RT_IDX_ALLMULTI_SLOT) ? "ALL MULTICAST MATCH" : ""), + ((index == RT_IDX_UNUSED6_SLOT) ? "UNUSED6" : ""), + ((index == RT_IDX_UNUSED7_SLOT) ? "UNUSED7" : ""), + ((index == RT_IDX_RSS_MATCH_SLOT) ? "RSS ALL/IPV4 MATCH" : ""), + ((index == RT_IDX_RSS_IPV6_SLOT) ? "RSS IPV6" : ""), + ((index == RT_IDX_RSS_TCP4_SLOT) ? "RSS TCP4" : ""), + ((index == RT_IDX_RSS_TCP6_SLOT) ? "RSS TCP6" : ""), + ((index == RT_IDX_CAM_HIT_SLOT) ? "CAM HIT" : ""), + ((index == RT_IDX_UNUSED013) ? "UNUSED13" : ""), + ((index == RT_IDX_UNUSED014) ? "UNUSED14" : ""), + ((index == RT_IDX_PROMISCUOUS_SLOT) ? "PROMISCUOUS" : ""), + (enable ? 
"to" : "from")); + + switch (mask) { + case RT_IDX_CAM_HIT: + { + value = RT_IDX_DST_CAM_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_CAM_HIT_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_VALID: /* Promiscuous Mode frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_PROMISCUOUS_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_ERR: /* Pass up MAC,IP,TCP/UDP error frames. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_ALL_ERR_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_BCAST: /* Pass up Broadcast frames to default Q. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_BCAST_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_MCAST: /* Pass up All Multicast frames. */ + { + value = RT_IDX_DST_CAM_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_ALLMULTI_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_MCAST_MATCH: /* Pass up matched Multicast frames. */ + { + value = RT_IDX_DST_CAM_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_MCAST_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case RT_IDX_RSS_MATCH: /* Pass up matched RSS frames. */ + { + value = RT_IDX_DST_RSS | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (RT_IDX_RSS_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */ + break; + } + case 0: /* Clear the E-bit on an entry. */ + { + value = RT_IDX_DST_DFLT_Q | /* dest */ + RT_IDX_TYPE_NICQ | /* type */ + (index << RT_IDX_IDX_SHIFT);/* index */ + break; + } + default: + QPRINTK(qdev, IFUP, ERR, "Mask type %d not yet supported.\n", + mask); + status = -EPERM; + goto exit; + } + + if (value) { + status = ql_wait_reg_rdy(qdev, RT_IDX, RT_IDX_MW, 0); + if (status) + goto exit; + value |= (enable ? RT_IDX_E : 0); + ql_write32(qdev, RT_IDX, value); + ql_write32(qdev, RT_DATA, enable ? mask : 0); + } +exit: + ql_sem_unlock(qdev, SEM_RT_IDX_MASK); + return status; +} + +static void ql_enable_interrupts(struct ql_adapter *qdev) +{ + ql_write32(qdev, INTR_EN, (INTR_EN_EI << 16) | INTR_EN_EI); +} + +static void ql_disable_interrupts(struct ql_adapter *qdev) +{ + ql_write32(qdev, INTR_EN, (INTR_EN_EI << 16)); +} + +/* If we're running with multiple MSI-X vectors then we enable on the fly. + * Otherwise, we may have multiple outstanding workers and don't want to + * enable until the last one finishes. In this case, the irq_cnt gets + * incremented everytime we queue a worker and decremented everytime + * a worker finishes. Once it hits zero we enable the interrupt. 
+ */ +void ql_enable_completion_interrupt(struct ql_adapter *qdev, u32 intr) +{ + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) + ql_write32(qdev, INTR_EN, + qdev->intr_context[intr].intr_en_mask); + else { + if (qdev->legacy_check) + spin_lock(&qdev->legacy_lock); + if (atomic_dec_and_test(&qdev->intr_context[intr].irq_cnt)) { + QPRINTK(qdev, INTR, ERR, "Enabling interrupt %d.\n", + intr); + ql_write32(qdev, INTR_EN, + qdev->intr_context[intr].intr_en_mask); + } else { + QPRINTK(qdev, INTR, ERR, + "Skip enable, other queue(s) are active.\n"); + } + if (qdev->legacy_check) + spin_unlock(&qdev->legacy_lock); + } +} + +static u32 ql_disable_completion_interrupt(struct ql_adapter *qdev, u32 intr) +{ + u32 var = 0; + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) + goto exit; + else if (!atomic_read(&qdev->intr_context[intr].irq_cnt)) { + ql_write32(qdev, INTR_EN, + qdev->intr_context[intr].intr_dis_mask); + var = ql_read32(qdev, STS); + } + atomic_inc(&qdev->intr_context[intr].irq_cnt); +exit: + return var; +} + +static void ql_enable_all_completion_interrupts(struct ql_adapter *qdev) +{ + int i; + for (i = 0; i < qdev->intr_count; i++) { + /* The enable call does a atomic_dec_and_test + * and enables only if the result is zero. + * So we precharge it here. + */ + atomic_set(&qdev->intr_context[i].irq_cnt, 1); + ql_enable_completion_interrupt(qdev, i); + } + +} + +int ql_read_flash_word(struct ql_adapter *qdev, int offset, u32 *data) +{ + int status = 0; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + FLASH_ADDR, FLASH_ADDR_RDY, FLASH_ADDR_ERR); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, FLASH_ADDR, FLASH_ADDR_R | offset); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + FLASH_ADDR, FLASH_ADDR_RDY, FLASH_ADDR_ERR); + if (status) + goto exit; + /* get the data */ + *data = ql_read32(qdev, FLASH_DATA); +exit: + return status; +} + +static int ql_get_flash_params(struct ql_adapter *qdev) +{ + int i; + int status; + u32 *p = (u32 *)&qdev->flash; + + if (ql_sem_spinlock(qdev, SEM_FLASH_MASK)) + return -ETIMEDOUT; + + for (i = 0; i < sizeof(qdev->flash) / sizeof(u32); i++, p++) { + status = ql_read_flash_word(qdev, i, p); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Error reading flash.\n"); + goto exit; + } + + } +exit: + ql_sem_unlock(qdev, SEM_FLASH_MASK); + return status; +} + +/* xgmac register are located behind the xgmac_addr and xgmac_data + * register pair. Each read/write requires us to wait for the ready + * bit before reading/writing the data. + */ +static int ql_write_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 data) +{ + int status; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + return status; + /* write the data to the data reg */ + ql_write32(qdev, XGMAC_DATA, data); + /* trigger the write */ + ql_write32(qdev, XGMAC_ADDR, reg); + return status; +} + +/* xgmac register are located behind the xgmac_addr and xgmac_data + * register pair. Each read/write requires us to wait for the ready + * bit before reading/writing the data. 
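+ *
+ * Reads wait for XGMAC_ADDR_RDY, write the register number with
+ * XGMAC_ADDR_R set, wait for ready again and then pick the result up
+ * from XGMAC_DATA; writes (ql_write_xgmac_reg) store the value in
+ * XGMAC_DATA first and then write the register number to trigger it.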
+ */ +int ql_read_xgmac_reg(struct ql_adapter *qdev, u32 reg, u32 *data) +{ + int status = 0; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, XGMAC_ADDR, reg | XGMAC_ADDR_R); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, + XGMAC_ADDR, XGMAC_ADDR_RDY, XGMAC_ADDR_XME); + if (status) + goto exit; + /* get the data */ + *data = ql_read32(qdev, XGMAC_DATA); +exit: + return status; +} + +/* This is used for reading the 64-bit statistics regs. */ +int ql_read_xgmac_reg64(struct ql_adapter *qdev, u32 reg, u64 *data) +{ + int status = 0; + u32 hi = 0; + u32 lo = 0; + + status = ql_read_xgmac_reg(qdev, reg, &lo); + if (status) + goto exit; + + status = ql_read_xgmac_reg(qdev, reg + 4, &hi); + if (status) + goto exit; + + *data = (u64) lo | ((u64) hi << 32); + +exit: + return status; +} + +/* Take the MAC Core out of reset. + * Enable statistics counting. + * Take the transmitter/receiver out of reset. + * This functionality may be done in the MPI firmware at a + * later date. + */ +static int ql_port_initialize(struct ql_adapter *qdev) +{ + int status = 0; + u32 data; + + if (ql_sem_trylock(qdev, qdev->xg_sem_mask)) { + /* Another function has the semaphore, so + * wait for the port init bit to come ready. + */ + QPRINTK(qdev, LINK, INFO, + "Another function has the semaphore, so wait for the port init bit to come ready.\n"); + status = ql_wait_reg_rdy(qdev, STS, qdev->port_init, 0); + if (status) { + QPRINTK(qdev, LINK, CRIT, + "Port initialize timed out.\n"); + } + return status; + } + + QPRINTK(qdev, LINK, INFO, "Got xgmac semaphore!.\n"); + /* Set the core reset. */ + status = ql_read_xgmac_reg(qdev, GLOBAL_CFG, &data); + if (status) + goto end; + data |= GLOBAL_CFG_RESET; + status = ql_write_xgmac_reg(qdev, GLOBAL_CFG, data); + if (status) + goto end; + + /* Clear the core reset and turn on jumbo for receiver. */ + data &= ~GLOBAL_CFG_RESET; /* Clear core reset. */ + data |= GLOBAL_CFG_JUMBO; /* Turn on jumbo. */ + data |= GLOBAL_CFG_TX_STAT_EN; + data |= GLOBAL_CFG_RX_STAT_EN; + status = ql_write_xgmac_reg(qdev, GLOBAL_CFG, data); + if (status) + goto end; + + /* Enable transmitter, and clear it's reset. */ + status = ql_read_xgmac_reg(qdev, TX_CFG, &data); + if (status) + goto end; + data &= ~TX_CFG_RESET; /* Clear the TX MAC reset. */ + data |= TX_CFG_EN; /* Enable the transmitter. */ + status = ql_write_xgmac_reg(qdev, TX_CFG, data); + if (status) + goto end; + + /* Enable receiver and clear it's reset. */ + status = ql_read_xgmac_reg(qdev, RX_CFG, &data); + if (status) + goto end; + data &= ~RX_CFG_RESET; /* Clear the RX MAC reset. */ + data |= RX_CFG_EN; /* Enable the receiver. */ + status = ql_write_xgmac_reg(qdev, RX_CFG, data); + if (status) + goto end; + + /* Turn on jumbo. */ + status = + ql_write_xgmac_reg(qdev, MAC_TX_PARAMS, MAC_TX_PARAMS_JUMBO | (0x2580 << 16)); + if (status) + goto end; + status = + ql_write_xgmac_reg(qdev, MAC_RX_PARAMS, 0x2580); + if (status) + goto end; + + /* Signal to the world that the port is enabled. */ + ql_write32(qdev, STS, ((qdev->port_init << 16) | qdev->port_init)); +end: + ql_sem_unlock(qdev, qdev->xg_sem_mask); + return status; +} + +/* Get the next large buffer. 
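+ * The consumer index wraps at lbq_len, and every buffer handed out
+ * bumps lbq_free_cnt so ql_update_lbq() knows how many to replenish.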
*/ +struct bq_desc *ql_get_curr_lbuf(struct rx_ring *rx_ring) +{ + struct bq_desc *lbq_desc = &rx_ring->lbq[rx_ring->lbq_curr_idx]; + rx_ring->lbq_curr_idx++; + if (rx_ring->lbq_curr_idx == rx_ring->lbq_len) + rx_ring->lbq_curr_idx = 0; + rx_ring->lbq_free_cnt++; + return lbq_desc; +} + +/* Get the next small buffer. */ +struct bq_desc *ql_get_curr_sbuf(struct rx_ring *rx_ring) +{ + struct bq_desc *sbq_desc = &rx_ring->sbq[rx_ring->sbq_curr_idx]; + rx_ring->sbq_curr_idx++; + if (rx_ring->sbq_curr_idx == rx_ring->sbq_len) + rx_ring->sbq_curr_idx = 0; + rx_ring->sbq_free_cnt++; + return sbq_desc; +} + +/* Update an rx ring index. */ +static void ql_update_cq(struct rx_ring *rx_ring) +{ + rx_ring->cnsmr_idx++; + rx_ring->curr_entry++; + if (unlikely(rx_ring->cnsmr_idx == rx_ring->cq_len)) { + rx_ring->cnsmr_idx = 0; + rx_ring->curr_entry = rx_ring->cq_base; + } +} + +static void ql_write_cq_idx(struct rx_ring *rx_ring) +{ + ql_write_db_reg(rx_ring->cnsmr_idx, rx_ring->cnsmr_idx_db_reg); +} + +/* Process (refill) a large buffer queue. */ +static void ql_update_lbq(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + int clean_idx = rx_ring->lbq_clean_idx; + struct bq_desc *lbq_desc; + struct bq_element *bq; + u64 map; + int i; + + while (rx_ring->lbq_free_cnt > 16) { + for (i = 0; i < 16; i++) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "lbq: try cleaning clean_idx = %d.\n", + clean_idx); + lbq_desc = &rx_ring->lbq[clean_idx]; + bq = lbq_desc->bq; + if (lbq_desc->p.lbq_page == NULL) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "lbq: getting new page for index %d.\n", + lbq_desc->index); + lbq_desc->p.lbq_page = alloc_page(GFP_ATOMIC); + if (lbq_desc->p.lbq_page == NULL) { + QPRINTK(qdev, RX_STATUS, ERR, + "Couldn't get a page.\n"); + return; + } + map = pci_map_page(qdev->pdev, + lbq_desc->p.lbq_page, + 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(qdev->pdev, map)) { + QPRINTK(qdev, RX_STATUS, ERR, + "PCI mapping failed.\n"); + return; + } + pci_unmap_addr_set(lbq_desc, mapaddr, map); + pci_unmap_len_set(lbq_desc, maplen, PAGE_SIZE); + bq->addr_lo = /*lbq_desc->addr_lo = */ + cpu_to_le32(map); + bq->addr_hi = /*lbq_desc->addr_hi = */ + cpu_to_le32(map >> 32); + } + clean_idx++; + if (clean_idx == rx_ring->lbq_len) + clean_idx = 0; + } + + rx_ring->lbq_clean_idx = clean_idx; + rx_ring->lbq_prod_idx += 16; + if (rx_ring->lbq_prod_idx == rx_ring->lbq_len) + rx_ring->lbq_prod_idx = 0; + QPRINTK(qdev, RX_STATUS, DEBUG, + "lbq: updating prod idx = %d.\n", + rx_ring->lbq_prod_idx); + ql_write_db_reg(rx_ring->lbq_prod_idx, + rx_ring->lbq_prod_idx_db_reg); + rx_ring->lbq_free_cnt -= 16; + } +} + +/* Process (refill) a small buffer queue. 
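+ * Refills happen in batches of 16: every empty slot gets a fresh skb of
+ * sbq_buf_size bytes (with QLGE_SB_PAD reserved), is DMA-mapped and has
+ * its bus address written into the bq element, after which the producer
+ * index is advanced by 16 and pushed to the hardware doorbell.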
*/ +static void ql_update_sbq(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + int clean_idx = rx_ring->sbq_clean_idx; + struct bq_desc *sbq_desc; + struct bq_element *bq; + u64 map; + int i; + + while (rx_ring->sbq_free_cnt > 16) { + for (i = 0; i < 16; i++) { + sbq_desc = &rx_ring->sbq[clean_idx]; + QPRINTK(qdev, RX_STATUS, DEBUG, + "sbq: try cleaning clean_idx = %d.\n", + clean_idx); + bq = sbq_desc->bq; + if (sbq_desc->p.skb == NULL) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "sbq: getting new skb for index %d.\n", + sbq_desc->index); + sbq_desc->p.skb = + netdev_alloc_skb(qdev->ndev, + rx_ring->sbq_buf_size); + if (sbq_desc->p.skb == NULL) { + QPRINTK(qdev, PROBE, ERR, + "Couldn't get an skb.\n"); + rx_ring->sbq_clean_idx = clean_idx; + return; + } + skb_reserve(sbq_desc->p.skb, QLGE_SB_PAD); + map = pci_map_single(qdev->pdev, + sbq_desc->p.skb->data, + rx_ring->sbq_buf_size / + 2, PCI_DMA_FROMDEVICE); + pci_unmap_addr_set(sbq_desc, mapaddr, map); + pci_unmap_len_set(sbq_desc, maplen, + rx_ring->sbq_buf_size / 2); + bq->addr_lo = cpu_to_le32(map); + bq->addr_hi = cpu_to_le32(map >> 32); + } + + clean_idx++; + if (clean_idx == rx_ring->sbq_len) + clean_idx = 0; + } + rx_ring->sbq_clean_idx = clean_idx; + rx_ring->sbq_prod_idx += 16; + if (rx_ring->sbq_prod_idx == rx_ring->sbq_len) + rx_ring->sbq_prod_idx = 0; + QPRINTK(qdev, RX_STATUS, DEBUG, + "sbq: updating prod idx = %d.\n", + rx_ring->sbq_prod_idx); + ql_write_db_reg(rx_ring->sbq_prod_idx, + rx_ring->sbq_prod_idx_db_reg); + + rx_ring->sbq_free_cnt -= 16; + } +} + +static void ql_update_buffer_queues(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + ql_update_sbq(qdev, rx_ring); + ql_update_lbq(qdev, rx_ring); +} + +/* Unmaps tx buffers. Can be called from send() if a pci mapping + * fails at some stage, or from the interrupt when a tx completes. + */ +static void ql_unmap_send(struct ql_adapter *qdev, + struct tx_ring_desc *tx_ring_desc, int mapped) +{ + int i; + for (i = 0; i < mapped; i++) { + if (i == 0 || (i == 7 && mapped > 7)) { + /* + * Unmap the skb->data area, or the + * external sglist (AKA the Outbound + * Address List (OAL)). + * If its the zeroeth element, then it's + * the skb->data area. If it's the 7th + * element and there is more than 6 frags, + * then its an OAL. + */ + if (i == 7) { + QPRINTK(qdev, TX_DONE, DEBUG, + "unmapping OAL area.\n"); + } + pci_unmap_single(qdev->pdev, + pci_unmap_addr(&tx_ring_desc->map[i], + mapaddr), + pci_unmap_len(&tx_ring_desc->map[i], + maplen), + PCI_DMA_TODEVICE); + } else { + QPRINTK(qdev, TX_DONE, DEBUG, "unmapping frag %d.\n", + i); + pci_unmap_page(qdev->pdev, + pci_unmap_addr(&tx_ring_desc->map[i], + mapaddr), + pci_unmap_len(&tx_ring_desc->map[i], + maplen), PCI_DMA_TODEVICE); + } + } + +} + +/* Map the buffers for this transmit. This will return + * NETDEV_TX_BUSY or NETDEV_TX_OK based on success. + */ +static int ql_map_send(struct ql_adapter *qdev, + struct ob_mac_iocb_req *mac_iocb_ptr, + struct sk_buff *skb, struct tx_ring_desc *tx_ring_desc) +{ + int len = skb_headlen(skb); + dma_addr_t map; + int frag_idx, err, map_idx = 0; + struct tx_buf_desc *tbd = mac_iocb_ptr->tbd; + int frag_cnt = skb_shinfo(skb)->nr_frags; + + if (frag_cnt) { + QPRINTK(qdev, TX_QUEUED, DEBUG, "frag_cnt = %d.\n", frag_cnt); + } + /* + * Map the skb buffer first. 
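+	 * The linear area (skb_headlen() bytes) fills tbd[0]; up to six page
+	 * fragments follow in tbd[1..6]. With more than seven fragments the
+	 * eighth descriptor points at the external OAL list that carries the
+	 * remainder (see the layout sketch further down).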
+ */ + map = pci_map_single(qdev->pdev, skb->data, len, PCI_DMA_TODEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + QPRINTK(qdev, TX_QUEUED, ERR, + "PCI mapping failed with error: %d\n", err); + + return NETDEV_TX_BUSY; + } + + tbd->len = cpu_to_le32(len); + tbd->addr = cpu_to_le64(map); + pci_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, map); + pci_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, len); + map_idx++; + + /* + * This loop fills the remainder of the 8 address descriptors + * in the IOCB. If there are more than 7 fragments, then the + * eighth address desc will point to an external list (OAL). + * When this happens, the remainder of the frags will be stored + * in this list. + */ + for (frag_idx = 0; frag_idx < frag_cnt; frag_idx++, map_idx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[frag_idx]; + tbd++; + if (frag_idx == 6 && frag_cnt > 7) { + /* Let's tack on an sglist. + * Our control block will now + * look like this: + * iocb->seg[0] = skb->data + * iocb->seg[1] = frag[0] + * iocb->seg[2] = frag[1] + * iocb->seg[3] = frag[2] + * iocb->seg[4] = frag[3] + * iocb->seg[5] = frag[4] + * iocb->seg[6] = frag[5] + * iocb->seg[7] = ptr to OAL (external sglist) + * oal->seg[0] = frag[6] + * oal->seg[1] = frag[7] + * oal->seg[2] = frag[8] + * oal->seg[3] = frag[9] + * oal->seg[4] = frag[10] + * etc... + */ + /* Tack on the OAL in the eighth segment of IOCB. */ + map = pci_map_single(qdev->pdev, &tx_ring_desc->oal, + sizeof(struct oal), + PCI_DMA_TODEVICE); + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + QPRINTK(qdev, TX_QUEUED, ERR, + "PCI mapping outbound address list with error: %d\n", + err); + goto map_error; + } + + tbd->addr = cpu_to_le64(map); + /* + * The length is the number of fragments + * that remain to be mapped times the length + * of our sglist (OAL). + */ + tbd->len = + cpu_to_le32((sizeof(struct tx_buf_desc) * + (frag_cnt - frag_idx)) | TX_DESC_C); + pci_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, + map); + pci_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, + sizeof(struct oal)); + tbd = (struct tx_buf_desc *)&tx_ring_desc->oal; + map_idx++; + } + + map = + pci_map_page(qdev->pdev, frag->page, + frag->page_offset, frag->size, + PCI_DMA_TODEVICE); + + err = pci_dma_mapping_error(qdev->pdev, map); + if (err) { + QPRINTK(qdev, TX_QUEUED, ERR, + "PCI mapping frags failed with error: %d.\n", + err); + goto map_error; + } + + tbd->addr = cpu_to_le64(map); + tbd->len = cpu_to_le32(frag->size); + pci_unmap_addr_set(&tx_ring_desc->map[map_idx], mapaddr, map); + pci_unmap_len_set(&tx_ring_desc->map[map_idx], maplen, + frag->size); + + } + /* Save the number of segments we've mapped. */ + tx_ring_desc->map_cnt = map_idx; + /* Terminate the last segment. */ + tbd->len = cpu_to_le32(le32_to_cpu(tbd->len) | TX_DESC_E); + return NETDEV_TX_OK; + +map_error: + /* + * If the first frag mapping failed, then i will be zero. + * This causes the unmap of the skb->data area. Otherwise + * we pass in the number of frags that mapped successfully + * so they can be umapped. + */ + ql_unmap_send(qdev, tx_ring_desc, map_idx); + return NETDEV_TX_BUSY; +} + +void ql_realign_skb(struct sk_buff *skb, int len) +{ + void *temp_addr = skb->data; + + /* Undo the skb_reserve(skb,32) we did before + * giving to hardware, and realign data on + * a 2-byte boundary. 
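+	 * The small-buffer skbs were given QLGE_SB_PAD bytes of headroom in
+	 * ql_alloc_sbq_buffers()/ql_update_sbq(); pulling data and tail back
+	 * by (QLGE_SB_PAD - NET_IP_ALIGN) and copying the payload down
+	 * leaves the Ethernet header NET_IP_ALIGN (normally 2) bytes into
+	 * the buffer, so the IP header that follows ends up 4-byte aligned.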
+ */ + skb->data -= QLGE_SB_PAD - NET_IP_ALIGN; + skb->tail -= QLGE_SB_PAD - NET_IP_ALIGN; + skb_copy_to_linear_data(skb, temp_addr, + (unsigned int)len); +} + +/* + * This function builds an skb for the given inbound + * completion. It will be rewritten for readability in the near + * future, but for not it works well. + */ +static struct sk_buff *ql_build_rx_skb(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + struct bq_desc *lbq_desc; + struct bq_desc *sbq_desc; + struct sk_buff *skb = NULL; + u32 length = le32_to_cpu(ib_mac_rsp->data_len); + u32 hdr_len = le32_to_cpu(ib_mac_rsp->hdr_len); + + /* + * Handle the header buffer if present. + */ + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HV && + ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + QPRINTK(qdev, RX_STATUS, DEBUG, "Header of %d bytes in small buffer.\n", hdr_len); + /* + * Headers fit nicely into a small buffer. + */ + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_unmap_single(qdev->pdev, + pci_unmap_addr(sbq_desc, mapaddr), + pci_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + skb = sbq_desc->p.skb; + ql_realign_skb(skb, hdr_len); + skb_put(skb, hdr_len); + sbq_desc->p.skb = NULL; + } + + /* + * Handle the data buffer(s). + */ + if (unlikely(!length)) { /* Is there data too? */ + QPRINTK(qdev, RX_STATUS, DEBUG, + "No Data buffer in this packet.\n"); + return skb; + } + + if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DS) { + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Headers in small, data of %d bytes in small, combine them.\n", length); + /* + * Data is less than small buffer size so it's + * stuffed in a small buffer. + * For this case we append the data + * from the "data" small buffer to the "header" small + * buffer. + */ + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_dma_sync_single_for_cpu(qdev->pdev, + pci_unmap_addr + (sbq_desc, mapaddr), + pci_unmap_len + (sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + memcpy(skb_put(skb, length), + sbq_desc->p.skb->data, length); + pci_dma_sync_single_for_device(qdev->pdev, + pci_unmap_addr + (sbq_desc, + mapaddr), + pci_unmap_len + (sbq_desc, + maplen), + PCI_DMA_FROMDEVICE); + } else { + QPRINTK(qdev, RX_STATUS, DEBUG, + "%d bytes in a single small buffer.\n", length); + sbq_desc = ql_get_curr_sbuf(rx_ring); + skb = sbq_desc->p.skb; + ql_realign_skb(skb, length); + skb_put(skb, length); + pci_unmap_single(qdev->pdev, + pci_unmap_addr(sbq_desc, + mapaddr), + pci_unmap_len(sbq_desc, + maplen), + PCI_DMA_FROMDEVICE); + sbq_desc->p.skb = NULL; + } + } else if (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_DL) { + if (ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Header in small, %d bytes in large. Chain large to small!\n", length); + /* + * The data is in a single large buffer. We + * chain it to the header buffer's skb and let + * it rip. + */ + lbq_desc = ql_get_curr_lbuf(rx_ring); + pci_unmap_page(qdev->pdev, + pci_unmap_addr(lbq_desc, + mapaddr), + pci_unmap_len(lbq_desc, maplen), + PCI_DMA_FROMDEVICE); + QPRINTK(qdev, RX_STATUS, DEBUG, + "Chaining page to skb.\n"); + skb_fill_page_desc(skb, 0, lbq_desc->p.lbq_page, + 0, length); + skb->len += length; + skb->data_len += length; + skb->truesize += length; + lbq_desc->p.lbq_page = NULL; + } else { + /* + * The headers and data are in a single large buffer. We + * copy it to a new skb and let it go. This can happen with + * jumbo mtu on a non-TCP/UDP frame. 
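+			 * The new skb only needs NET_IP_ALIGN of headroom:
+			 * the page is attached as a fragment and the
+			 * __pskb_pull_tail() below copies just the Ethernet
+			 * (or VLAN) header into the linear area.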
+ */ + lbq_desc = ql_get_curr_lbuf(rx_ring); + skb = netdev_alloc_skb(qdev->ndev, length); + if (skb == NULL) { + QPRINTK(qdev, PROBE, DEBUG, + "No skb available, drop the packet.\n"); + return NULL; + } + skb_reserve(skb, NET_IP_ALIGN); + QPRINTK(qdev, RX_STATUS, DEBUG, + "%d bytes of headers and data in large. Chain page to new skb and pull tail.\n", length); + skb_fill_page_desc(skb, 0, lbq_desc->p.lbq_page, + 0, length); + skb->len += length; + skb->data_len += length; + skb->truesize += length; + length -= length; + lbq_desc->p.lbq_page = NULL; + __pskb_pull_tail(skb, + (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) ? + VLAN_ETH_HLEN : ETH_HLEN); + } + } else { + /* + * The data is in a chain of large buffers + * pointed to by a small buffer. We loop + * thru and chain them to the our small header + * buffer's skb. + * frags: There are 18 max frags and our small + * buffer will hold 32 of them. The thing is, + * we'll use 3 max for our 9000 byte jumbo + * frames. If the MTU goes up we could + * eventually be in trouble. + */ + int size, offset, i = 0; + struct bq_element *bq, bq_array[8]; + sbq_desc = ql_get_curr_sbuf(rx_ring); + pci_unmap_single(qdev->pdev, + pci_unmap_addr(sbq_desc, mapaddr), + pci_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + if (!(ib_mac_rsp->flags4 & IB_MAC_IOCB_RSP_HS)) { + /* + * This is an non TCP/UDP IP frame, so + * the headers aren't split into a small + * buffer. We have to use the small buffer + * that contains our sg list as our skb to + * send upstairs. Copy the sg list here to + * a local buffer and use it to find the + * pages to chain. + */ + QPRINTK(qdev, RX_STATUS, DEBUG, + "%d bytes of headers & data in chain of large.\n", length); + skb = sbq_desc->p.skb; + bq = &bq_array[0]; + memcpy(bq, skb->data, sizeof(bq_array)); + sbq_desc->p.skb = NULL; + skb_reserve(skb, NET_IP_ALIGN); + } else { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Headers in small, %d bytes of data in chain of large.\n", length); + bq = (struct bq_element *)sbq_desc->p.skb->data; + } + while (length > 0) { + lbq_desc = ql_get_curr_lbuf(rx_ring); + if ((bq->addr_lo & ~BQ_MASK) != lbq_desc->bq->addr_lo) { + QPRINTK(qdev, RX_STATUS, ERR, + "Panic!!! bad large buffer address, expected 0x%.08x, got 0x%.08x.\n", + lbq_desc->bq->addr_lo, bq->addr_lo); + return NULL; + } + pci_unmap_page(qdev->pdev, + pci_unmap_addr(lbq_desc, + mapaddr), + pci_unmap_len(lbq_desc, + maplen), + PCI_DMA_FROMDEVICE); + size = (length < PAGE_SIZE) ? length : PAGE_SIZE; + offset = 0; + + QPRINTK(qdev, RX_STATUS, DEBUG, + "Adding page %d to skb for %d bytes.\n", + i, size); + skb_fill_page_desc(skb, i, lbq_desc->p.lbq_page, + offset, size); + skb->len += size; + skb->data_len += size; + skb->truesize += size; + length -= size; + lbq_desc->p.lbq_page = NULL; + bq++; + i++; + } + __pskb_pull_tail(skb, (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V) ? + VLAN_ETH_HLEN : ETH_HLEN); + } + return skb; +} + +/* Process an inbound completion from an rx ring. 
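+ * Builds an skb from the buffer(s) described by the IOCB, sets the
+ * checksum verdict (an IE/TE error flag forces CHECKSUM_NONE; a good TCP
+ * frame, or a UDP frame without the "no checksum" bit, gets
+ * CHECKSUM_UNNECESSARY when rx_csum is on) and hands the frame to the
+ * stack via vlan_hwaccel_rx() or netif_rx().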
*/ +static void ql_process_mac_rx_intr(struct ql_adapter *qdev, + struct rx_ring *rx_ring, + struct ib_mac_iocb_rsp *ib_mac_rsp) +{ + struct net_device *ndev = qdev->ndev; + struct sk_buff *skb = NULL; + + QL_DUMP_IB_MAC_RSP(ib_mac_rsp); + + skb = ql_build_rx_skb(qdev, rx_ring, ib_mac_rsp); + if (unlikely(!skb)) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "No skb available, drop packet.\n"); + return; + } + + prefetch(skb->data); + skb->dev = ndev; + if (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) { + QPRINTK(qdev, RX_STATUS, DEBUG, "%s%s%s Multicast.\n", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_HASH ? "Hash" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_REG ? "Registered" : "", + (ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_M_MASK) == + IB_MAC_IOCB_RSP_M_PROM ? "Promiscuous" : ""); + } + if (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_P) { + QPRINTK(qdev, RX_STATUS, DEBUG, "Promiscuous Packet.\n"); + } + if (ib_mac_rsp->flags1 & (IB_MAC_IOCB_RSP_IE | IB_MAC_IOCB_RSP_TE)) { + QPRINTK(qdev, RX_STATUS, ERR, + "Bad checksum for this %s packet.\n", + ((ib_mac_rsp-> + flags2 & IB_MAC_IOCB_RSP_T) ? "TCP" : "UDP")); + skb->ip_summed = CHECKSUM_NONE; + } else if (qdev->rx_csum && + ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_T) || + ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) && + !(ib_mac_rsp->flags1 & IB_MAC_IOCB_RSP_NU)))) { + QPRINTK(qdev, RX_STATUS, DEBUG, "RX checksum done!\n"); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + qdev->stats.rx_packets++; + qdev->stats.rx_bytes += skb->len; + skb->protocol = eth_type_trans(skb, ndev); + if (qdev->vlgrp && (ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_V)) { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Passing a VLAN packet upstream.\n"); + vlan_hwaccel_rx(skb, qdev->vlgrp, + le16_to_cpu(ib_mac_rsp->vlan_id)); + } else { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Passing a normal packet upstream.\n"); + netif_rx(skb); + } + ndev->last_rx = jiffies; +} + +/* Process an outbound completion from an rx ring. */ +static void ql_process_mac_tx_intr(struct ql_adapter *qdev, + struct ob_mac_iocb_rsp *mac_rsp) +{ + struct tx_ring *tx_ring; + struct tx_ring_desc *tx_ring_desc; + + QL_DUMP_OB_MAC_RSP(mac_rsp); + tx_ring = &qdev->tx_ring[mac_rsp->txq_idx]; + tx_ring_desc = &tx_ring->q[mac_rsp->tid]; + ql_unmap_send(qdev, tx_ring_desc, tx_ring_desc->map_cnt); + qdev->stats.tx_bytes += tx_ring_desc->map_cnt; + qdev->stats.tx_packets++; + dev_kfree_skb(tx_ring_desc->skb); + tx_ring_desc->skb = NULL; + + if (unlikely(mac_rsp->flags1 & (OB_MAC_IOCB_RSP_E | + OB_MAC_IOCB_RSP_S | + OB_MAC_IOCB_RSP_L | + OB_MAC_IOCB_RSP_P | OB_MAC_IOCB_RSP_B))) { + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_E) { + QPRINTK(qdev, TX_DONE, WARNING, + "Total descriptor length did not match transfer length.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_S) { + QPRINTK(qdev, TX_DONE, WARNING, + "Frame too short to be legal, not sent.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_L) { + QPRINTK(qdev, TX_DONE, WARNING, + "Frame too long, but sent anyway.\n"); + } + if (mac_rsp->flags1 & OB_MAC_IOCB_RSP_B) { + QPRINTK(qdev, TX_DONE, WARNING, + "PCI backplane error. Frame not sent.\n"); + } + } + atomic_inc(&tx_ring->tx_count); +} + +/* Fire up a handler to reset the MPI processor. 
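+ * Fatal firmware and ASIC errors are handled from process context: the
+ * queue is stopped and the carrier turned off first, so no new transmits
+ * are attempted while the delayed work performs the reset.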
*/ +void ql_queue_fw_error(struct ql_adapter *qdev) +{ + netif_stop_queue(qdev->ndev); + netif_carrier_off(qdev->ndev); + queue_delayed_work(qdev->workqueue, &qdev->mpi_reset_work, 0); +} + +void ql_queue_asic_error(struct ql_adapter *qdev) +{ + netif_stop_queue(qdev->ndev); + netif_carrier_off(qdev->ndev); + ql_disable_interrupts(qdev); + queue_delayed_work(qdev->workqueue, &qdev->asic_reset_work, 0); +} + +static void ql_process_chip_ae_intr(struct ql_adapter *qdev, + struct ib_ae_iocb_rsp *ib_ae_rsp) +{ + switch (ib_ae_rsp->event) { + case MGMT_ERR_EVENT: + QPRINTK(qdev, RX_ERR, ERR, + "Management Processor Fatal Error.\n"); + ql_queue_fw_error(qdev); + return; + + case CAM_LOOKUP_ERR_EVENT: + QPRINTK(qdev, LINK, ERR, + "Multiple CAM hits lookup occurred.\n"); + QPRINTK(qdev, DRV, ERR, "This event shouldn't occur.\n"); + ql_queue_asic_error(qdev); + return; + + case SOFT_ECC_ERROR_EVENT: + QPRINTK(qdev, RX_ERR, ERR, "Soft ECC error detected.\n"); + ql_queue_asic_error(qdev); + break; + + case PCI_ERR_ANON_BUF_RD: + QPRINTK(qdev, RX_ERR, ERR, + "PCI error occurred when reading anonymous buffers from rx_ring %d.\n", + ib_ae_rsp->q_id); + ql_queue_asic_error(qdev); + break; + + default: + QPRINTK(qdev, DRV, ERR, "Unexpected event %d.\n", + ib_ae_rsp->event); + ql_queue_asic_error(qdev); + break; + } +} + +static int ql_clean_outbound_rx_ring(struct rx_ring *rx_ring) +{ + struct ql_adapter *qdev = rx_ring->qdev; + u32 prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + struct ob_mac_iocb_rsp *net_rsp = NULL; + int count = 0; + + /* While there are entries in the completion queue. */ + while (prod != rx_ring->cnsmr_idx) { + + QPRINTK(qdev, RX_STATUS, DEBUG, + "cq_id = %d, prod = %d, cnsmr = %d.\n.", rx_ring->cq_id, + prod, rx_ring->cnsmr_idx); + + net_rsp = (struct ob_mac_iocb_rsp *)rx_ring->curr_entry; + rmb(); + switch (net_rsp->opcode) { + + case OPCODE_OB_MAC_TSO_IOCB: + case OPCODE_OB_MAC_IOCB: + ql_process_mac_tx_intr(qdev, net_rsp); + break; + default: + QPRINTK(qdev, RX_STATUS, DEBUG, + "Hit default case, not handled! dropping the packet, opcode = %x.\n", + net_rsp->opcode); + } + count++; + ql_update_cq(rx_ring); + prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + } + ql_write_cq_idx(rx_ring); + if (netif_queue_stopped(qdev->ndev) && net_rsp != NULL) { + struct tx_ring *tx_ring = &qdev->tx_ring[net_rsp->txq_idx]; + if (atomic_read(&tx_ring->queue_stopped) && + (atomic_read(&tx_ring->tx_count) > (tx_ring->wq_len / 4))) + /* + * The queue got stopped because the tx_ring was full. + * Wake it up, because it's now at least 25% empty. + */ + netif_wake_queue(qdev->ndev); + } + + return count; +} + +static int ql_clean_inbound_rx_ring(struct rx_ring *rx_ring, int budget) +{ + struct ql_adapter *qdev = rx_ring->qdev; + u32 prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + struct ql_net_rsp_iocb *net_rsp; + int count = 0; + + /* While there are entries in the completion queue. */ + while (prod != rx_ring->cnsmr_idx) { + + QPRINTK(qdev, RX_STATUS, DEBUG, + "cq_id = %d, prod = %d, cnsmr = %d.\n.", rx_ring->cq_id, + prod, rx_ring->cnsmr_idx); + + net_rsp = rx_ring->curr_entry; + rmb(); + switch (net_rsp->opcode) { + case OPCODE_IB_MAC_IOCB: + ql_process_mac_rx_intr(qdev, rx_ring, + (struct ib_mac_iocb_rsp *) + net_rsp); + break; + + case OPCODE_IB_AE_IOCB: + ql_process_chip_ae_intr(qdev, (struct ib_ae_iocb_rsp *) + net_rsp); + break; + default: + { + QPRINTK(qdev, RX_STATUS, DEBUG, + "Hit default case, not handled! 
dropping the packet, opcode = %x.\n", + net_rsp->opcode); + } + } + count++; + ql_update_cq(rx_ring); + prod = ql_read_sh_reg(rx_ring->prod_idx_sh_reg); + if (count == budget) + break; + } + ql_update_buffer_queues(qdev, rx_ring); + ql_write_cq_idx(rx_ring); + return count; +} + +static int ql_napi_poll_msix(struct napi_struct *napi, int budget) +{ + struct rx_ring *rx_ring = container_of(napi, struct rx_ring, napi); + struct ql_adapter *qdev = rx_ring->qdev; + int work_done = ql_clean_inbound_rx_ring(rx_ring, budget); + + QPRINTK(qdev, RX_STATUS, DEBUG, "Enter, NAPI POLL cq_id = %d.\n", + rx_ring->cq_id); + + if (work_done < budget) { + __netif_rx_complete(qdev->ndev, napi); + ql_enable_completion_interrupt(qdev, rx_ring->irq); + } + return work_done; +} + +static void ql_vlan_rx_register(struct net_device *ndev, struct vlan_group *grp) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + qdev->vlgrp = grp; + if (grp) { + QPRINTK(qdev, IFUP, DEBUG, "Turning on VLAN in NIC_RCV_CFG.\n"); + ql_write32(qdev, NIC_RCV_CFG, NIC_RCV_CFG_VLAN_MASK | + NIC_RCV_CFG_VLAN_MATCH_AND_NON); + } else { + QPRINTK(qdev, IFUP, DEBUG, + "Turning off VLAN in NIC_RCV_CFG.\n"); + ql_write32(qdev, NIC_RCV_CFG, NIC_RCV_CFG_VLAN_MASK); + } +} + +static void ql_vlan_rx_add_vid(struct net_device *ndev, u16 vid) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + u32 enable_bit = MAC_ADDR_E; + + spin_lock(&qdev->hw_lock); + if (ql_set_mac_addr_reg + (qdev, (u8 *) &enable_bit, MAC_ADDR_TYPE_VLAN, vid)) { + QPRINTK(qdev, IFUP, ERR, "Failed to init vlan address.\n"); + } + spin_unlock(&qdev->hw_lock); +} + +static void ql_vlan_rx_kill_vid(struct net_device *ndev, u16 vid) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + u32 enable_bit = 0; + + spin_lock(&qdev->hw_lock); + if (ql_set_mac_addr_reg + (qdev, (u8 *) &enable_bit, MAC_ADDR_TYPE_VLAN, vid)) { + QPRINTK(qdev, IFUP, ERR, "Failed to clear vlan address.\n"); + } + spin_unlock(&qdev->hw_lock); + +} + +/* Worker thread to process a given rx_ring that is dedicated + * to outbound completions. + */ +static void ql_tx_clean(struct work_struct *work) +{ + struct rx_ring *rx_ring = + container_of(work, struct rx_ring, rx_work.work); + ql_clean_outbound_rx_ring(rx_ring); + ql_enable_completion_interrupt(rx_ring->qdev, rx_ring->irq); + +} + +/* Worker thread to process a given rx_ring that is dedicated + * to inbound completions. + */ +static void ql_rx_clean(struct work_struct *work) +{ + struct rx_ring *rx_ring = + container_of(work, struct rx_ring, rx_work.work); + ql_clean_inbound_rx_ring(rx_ring, 64); + ql_enable_completion_interrupt(rx_ring->qdev, rx_ring->irq); +} + +/* MSI-X Multiple Vector Interrupt Handler for outbound completions. */ +static irqreturn_t qlge_msix_tx_isr(int irq, void *dev_id) +{ + struct rx_ring *rx_ring = dev_id; + queue_delayed_work_on(rx_ring->cpu, rx_ring->qdev->q_workqueue, + &rx_ring->rx_work, 0); + return IRQ_HANDLED; +} + +/* MSI-X Multiple Vector Interrupt Handler for inbound completions. */ +static irqreturn_t qlge_msix_rx_isr(int irq, void *dev_id) +{ + struct rx_ring *rx_ring = dev_id; + struct ql_adapter *qdev = rx_ring->qdev; + netif_rx_schedule(qdev->ndev, &rx_ring->napi); + return IRQ_HANDLED; +} + +/* We check here to see if we're already handling a legacy + * interrupt. If we are, then it must belong to another + * chip with which we're sharing the interrupt line. 
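+ * On a shared INTx line qlge_isr() must return IRQ_NONE for interrupts
+ * that are not ours.  Under legacy_lock this reads the outstanding
+ * irq_cnt for vector 0; a non-zero count means this device is already
+ * being serviced, so the new interrupt is assumed to belong to the other
+ * device sharing the line.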
+ */ +int ql_legacy_check(struct ql_adapter *qdev) +{ + int err; + spin_lock(&qdev->legacy_lock); + err = atomic_read(&qdev->intr_context[0].irq_cnt); + spin_unlock(&qdev->legacy_lock); + return err; +} + +/* This handles a fatal error, MPI activity, and the default + * rx_ring in an MSI-X multiple vector environment. + * In MSI/Legacy environment it also process the rest of + * the rx_rings. + */ +static irqreturn_t qlge_isr(int irq, void *dev_id) +{ + struct rx_ring *rx_ring = dev_id; + struct ql_adapter *qdev = rx_ring->qdev; + struct intr_context *intr_context = &qdev->intr_context[0]; + u32 var; + int i; + int work_done = 0; + + if (qdev->legacy_check && qdev->legacy_check(qdev)) { + QPRINTK(qdev, INTR, INFO, "Already busy, not our interrupt.\n"); + return IRQ_NONE; /* Not our interrupt */ + } + + var = ql_read32(qdev, STS); + + /* + * Check for fatal error. + */ + if (var & STS_FE) { + ql_queue_asic_error(qdev); + QPRINTK(qdev, INTR, ERR, "Got fatal error, STS = %x.\n", var); + var = ql_read32(qdev, ERR_STS); + QPRINTK(qdev, INTR, ERR, + "Resetting chip. Error Status Register = 0x%x\n", var); + return IRQ_HANDLED; + } + + /* + * Check MPI processor activity. + */ + if (var & STS_PI) { + /* + * We've got an async event or mailbox completion. + * Handle it and clear the source of the interrupt. + */ + QPRINTK(qdev, INTR, ERR, "Got MPI processor interrupt.\n"); + ql_disable_completion_interrupt(qdev, intr_context->intr); + queue_delayed_work_on(smp_processor_id(), qdev->workqueue, + &qdev->mpi_work, 0); + work_done++; + } + + /* + * Check the default queue and wake handler if active. + */ + rx_ring = &qdev->rx_ring[0]; + if (ql_read_sh_reg(rx_ring->prod_idx_sh_reg) != rx_ring->cnsmr_idx) { + QPRINTK(qdev, INTR, INFO, "Waking handler for rx_ring[0].\n"); + ql_disable_completion_interrupt(qdev, intr_context->intr); + queue_delayed_work_on(smp_processor_id(), qdev->q_workqueue, + &rx_ring->rx_work, 0); + work_done++; + } + + if (!test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + /* + * Start the DPC for each active queue. + */ + for (i = 1; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + if (ql_read_sh_reg(rx_ring->prod_idx_sh_reg) != + rx_ring->cnsmr_idx) { + QPRINTK(qdev, INTR, INFO, + "Waking handler for rx_ring[%d].\n", i); + ql_disable_completion_interrupt(qdev, + intr_context-> + intr); + if (i < qdev->rss_ring_first_cq_id) + queue_delayed_work_on(rx_ring->cpu, + qdev->q_workqueue, + &rx_ring->rx_work, + 0); + else + netif_rx_schedule(qdev->ndev, + &rx_ring->napi); + work_done++; + } + } + } + return work_done ? 
IRQ_HANDLED : IRQ_NONE; +} + +static int ql_tso(struct sk_buff *skb, struct ob_mac_tso_iocb_req *mac_iocb_ptr) +{ + + if (skb_is_gso(skb)) { + int err; + if (skb_header_cloned(skb)) { + err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (err) + return err; + } + + mac_iocb_ptr->opcode = OPCODE_OB_MAC_TSO_IOCB; + mac_iocb_ptr->flags3 |= OB_MAC_TSO_IOCB_IC; + mac_iocb_ptr->frame_len = cpu_to_le32((u32) skb->len); + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + tcp_hdrlen(skb)); + mac_iocb_ptr->net_trans_offset = + cpu_to_le16(skb_network_offset(skb) | + skb_transport_offset(skb) + << OB_MAC_TRANSPORT_HDR_SHIFT); + mac_iocb_ptr->mss = cpu_to_le16(skb_shinfo(skb)->gso_size); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_LSO; + if (likely(skb->protocol == htons(ETH_P_IP))) { + struct iphdr *iph = ip_hdr(skb); + iph->check = 0; + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4; + tcp_hdr(skb)->check = ~csum_tcpudp_magic(iph->saddr, + iph->daddr, 0, + IPPROTO_TCP, + 0); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP6; + tcp_hdr(skb)->check = + ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + 0, IPPROTO_TCP, 0); + } + return 1; + } + return 0; +} + +static void ql_hw_csum_setup(struct sk_buff *skb, + struct ob_mac_tso_iocb_req *mac_iocb_ptr) +{ + int len; + struct iphdr *iph = ip_hdr(skb); + u16 *check; + mac_iocb_ptr->opcode = OPCODE_OB_MAC_TSO_IOCB; + mac_iocb_ptr->frame_len = cpu_to_le32((u32) skb->len); + mac_iocb_ptr->net_trans_offset = + cpu_to_le16(skb_network_offset(skb) | + skb_transport_offset(skb) << OB_MAC_TRANSPORT_HDR_SHIFT); + + mac_iocb_ptr->flags1 |= OB_MAC_TSO_IOCB_IP4; + len = (ntohs(iph->tot_len) - (iph->ihl << 2)); + if (likely(iph->protocol == IPPROTO_TCP)) { + check = &(tcp_hdr(skb)->check); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_TC; + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + + (tcp_hdr(skb)->doff << 2)); + } else { + check = &(udp_hdr(skb)->check); + mac_iocb_ptr->flags2 |= OB_MAC_TSO_IOCB_UC; + mac_iocb_ptr->total_hdrs_len = + cpu_to_le16(skb_transport_offset(skb) + + sizeof(struct udphdr)); + } + *check = ~csum_tcpudp_magic(iph->saddr, + iph->daddr, len, iph->protocol, 0); +} + +static int qlge_send(struct sk_buff *skb, struct net_device *ndev) +{ + struct tx_ring_desc *tx_ring_desc; + struct ob_mac_iocb_req *mac_iocb_ptr; + struct ql_adapter *qdev = netdev_priv(ndev); + int tso; + struct tx_ring *tx_ring; + u32 tx_ring_idx = (u32) QL_TXQ_IDX(qdev, skb); + + tx_ring = &qdev->tx_ring[tx_ring_idx]; + + if (unlikely(atomic_read(&tx_ring->tx_count) < 2)) { + QPRINTK(qdev, TX_QUEUED, INFO, + "%s: shutting down tx queue %d du to lack of resources.\n", + __func__, tx_ring_idx); + netif_stop_queue(ndev); + atomic_inc(&tx_ring->queue_stopped); + return NETDEV_TX_BUSY; + } + tx_ring_desc = &tx_ring->q[tx_ring->prod_idx]; + mac_iocb_ptr = tx_ring_desc->queue_entry; + memset((void *)mac_iocb_ptr, 0, sizeof(mac_iocb_ptr)); + if (ql_map_send(qdev, mac_iocb_ptr, skb, tx_ring_desc) != NETDEV_TX_OK) { + QPRINTK(qdev, TX_QUEUED, ERR, "Could not map the segments.\n"); + return NETDEV_TX_BUSY; + } + + mac_iocb_ptr->opcode = OPCODE_OB_MAC_IOCB; + mac_iocb_ptr->tid = tx_ring_desc->index; + /* We use the upper 32-bits to store the tx queue for this IO. + * When we get the completion we can use it to establish the context. 
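+	 * ql_process_mac_tx_intr() above uses the echoed txq_idx and tid
+	 * from the OB_MAC completion to find the tx_ring and the
+	 * tx_ring_desc whose mappings and skb need to be released.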
+ */ + mac_iocb_ptr->txq_idx = tx_ring_idx; + tx_ring_desc->skb = skb; + + mac_iocb_ptr->frame_len = cpu_to_le16((u16) skb->len); + + if (qdev->vlgrp && vlan_tx_tag_present(skb)) { + QPRINTK(qdev, TX_QUEUED, DEBUG, "Adding a vlan tag %d.\n", + vlan_tx_tag_get(skb)); + mac_iocb_ptr->flags3 |= OB_MAC_IOCB_V; + mac_iocb_ptr->vlan_tci = cpu_to_le16(vlan_tx_tag_get(skb)); + } + tso = ql_tso(skb, (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); + if (tso < 0) { + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } else if (unlikely(!tso) && (skb->ip_summed == CHECKSUM_PARTIAL)) { + ql_hw_csum_setup(skb, + (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); + } + QL_DUMP_OB_MAC_IOCB(mac_iocb_ptr); + tx_ring->prod_idx++; + if (tx_ring->prod_idx == tx_ring->wq_len) + tx_ring->prod_idx = 0; + wmb(); + + ql_write_db_reg(tx_ring->prod_idx, tx_ring->prod_idx_db_reg); + ndev->trans_start = jiffies; + QPRINTK(qdev, TX_QUEUED, DEBUG, "tx queued, slot %d, len %d\n", + tx_ring->prod_idx, skb->len); + + atomic_dec(&tx_ring->tx_count); + return NETDEV_TX_OK; +} + +static void ql_free_shadow_space(struct ql_adapter *qdev) +{ + if (qdev->rx_ring_shadow_reg_area) { + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->rx_ring_shadow_reg_area, + qdev->rx_ring_shadow_reg_dma); + qdev->rx_ring_shadow_reg_area = NULL; + } + if (qdev->tx_ring_shadow_reg_area) { + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->tx_ring_shadow_reg_area, + qdev->tx_ring_shadow_reg_dma); + qdev->tx_ring_shadow_reg_area = NULL; + } +} + +static int ql_alloc_shadow_space(struct ql_adapter *qdev) +{ + qdev->rx_ring_shadow_reg_area = + pci_alloc_consistent(qdev->pdev, + PAGE_SIZE, &qdev->rx_ring_shadow_reg_dma); + if (qdev->rx_ring_shadow_reg_area == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Allocation of RX shadow space failed.\n"); + return -ENOMEM; + } + qdev->tx_ring_shadow_reg_area = + pci_alloc_consistent(qdev->pdev, PAGE_SIZE, + &qdev->tx_ring_shadow_reg_dma); + if (qdev->tx_ring_shadow_reg_area == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Allocation of TX shadow space failed.\n"); + goto err_wqp_sh_area; + } + return 0; + +err_wqp_sh_area: + pci_free_consistent(qdev->pdev, + PAGE_SIZE, + qdev->rx_ring_shadow_reg_area, + qdev->rx_ring_shadow_reg_dma); + return -ENOMEM; +} + +static void ql_init_tx_ring(struct ql_adapter *qdev, struct tx_ring *tx_ring) +{ + struct tx_ring_desc *tx_ring_desc; + int i; + struct ob_mac_iocb_req *mac_iocb_ptr; + + mac_iocb_ptr = tx_ring->wq_base; + tx_ring_desc = tx_ring->q; + for (i = 0; i < tx_ring->wq_len; i++) { + tx_ring_desc->index = i; + tx_ring_desc->skb = NULL; + tx_ring_desc->queue_entry = mac_iocb_ptr; + mac_iocb_ptr++; + tx_ring_desc++; + } + atomic_set(&tx_ring->tx_count, tx_ring->wq_len); + atomic_set(&tx_ring->queue_stopped, 0); +} + +static void ql_free_tx_resources(struct ql_adapter *qdev, + struct tx_ring *tx_ring) +{ + if (tx_ring->wq_base) { + pci_free_consistent(qdev->pdev, tx_ring->wq_size, + tx_ring->wq_base, tx_ring->wq_base_dma); + tx_ring->wq_base = NULL; + } + kfree(tx_ring->q); + tx_ring->q = NULL; +} + +static int ql_alloc_tx_resources(struct ql_adapter *qdev, + struct tx_ring *tx_ring) +{ + tx_ring->wq_base = + pci_alloc_consistent(qdev->pdev, tx_ring->wq_size, + &tx_ring->wq_base_dma); + + if ((tx_ring->wq_base == NULL) + || tx_ring->wq_base_dma & (tx_ring->wq_size - 1)) { + QPRINTK(qdev, IFUP, ERR, "tx_ring alloc failed.\n"); + return -ENOMEM; + } + tx_ring->q = + kmalloc(tx_ring->wq_len * sizeof(struct tx_ring_desc), GFP_KERNEL); + if (tx_ring->q == NULL) + goto err; + + return 0; +err: 
+ pci_free_consistent(qdev->pdev, tx_ring->wq_size, + tx_ring->wq_base, tx_ring->wq_base_dma); + return -ENOMEM; +} + +void ql_free_lbq_buffers(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *lbq_desc; + + for (i = 0; i < rx_ring->lbq_len; i++) { + lbq_desc = &rx_ring->lbq[i]; + if (lbq_desc->p.lbq_page) { + pci_unmap_page(qdev->pdev, + pci_unmap_addr(lbq_desc, mapaddr), + pci_unmap_len(lbq_desc, maplen), + PCI_DMA_FROMDEVICE); + + put_page(lbq_desc->p.lbq_page); + lbq_desc->p.lbq_page = NULL; + } + lbq_desc->bq->addr_lo = 0; + lbq_desc->bq->addr_hi = 0; + } +} + +/* + * Allocate and map a page for each element of the lbq. + */ +static int ql_alloc_lbq_buffers(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *lbq_desc; + u64 map; + struct bq_element *bq = rx_ring->lbq_base; + + for (i = 0; i < rx_ring->lbq_len; i++) { + lbq_desc = &rx_ring->lbq[i]; + memset(lbq_desc, 0, sizeof(lbq_desc)); + lbq_desc->bq = bq; + lbq_desc->index = i; + lbq_desc->p.lbq_page = alloc_page(GFP_ATOMIC); + if (unlikely(!lbq_desc->p.lbq_page)) { + QPRINTK(qdev, IFUP, ERR, "failed alloc_page().\n"); + goto mem_error; + } else { + map = pci_map_page(qdev->pdev, + lbq_desc->p.lbq_page, + 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(qdev->pdev, map)) { + QPRINTK(qdev, IFUP, ERR, + "PCI mapping failed.\n"); + goto mem_error; + } + pci_unmap_addr_set(lbq_desc, mapaddr, map); + pci_unmap_len_set(lbq_desc, maplen, PAGE_SIZE); + bq->addr_lo = cpu_to_le32(map); + bq->addr_hi = cpu_to_le32(map >> 32); + } + bq++; + } + return 0; +mem_error: + ql_free_lbq_buffers(qdev, rx_ring); + return -ENOMEM; +} + +void ql_free_sbq_buffers(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *sbq_desc; + + for (i = 0; i < rx_ring->sbq_len; i++) { + sbq_desc = &rx_ring->sbq[i]; + if (sbq_desc == NULL) { + QPRINTK(qdev, IFUP, ERR, "sbq_desc %d is NULL.\n", i); + return; + } + if (sbq_desc->p.skb) { + pci_unmap_single(qdev->pdev, + pci_unmap_addr(sbq_desc, mapaddr), + pci_unmap_len(sbq_desc, maplen), + PCI_DMA_FROMDEVICE); + dev_kfree_skb(sbq_desc->p.skb); + sbq_desc->p.skb = NULL; + } + if (sbq_desc->bq == NULL) { + QPRINTK(qdev, IFUP, ERR, "sbq_desc->bq %d is NULL.\n", + i); + return; + } + sbq_desc->bq->addr_lo = 0; + sbq_desc->bq->addr_hi = 0; + } +} + +/* Allocate and map an skb for each element of the sbq. */ +static int ql_alloc_sbq_buffers(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + int i; + struct bq_desc *sbq_desc; + struct sk_buff *skb; + u64 map; + struct bq_element *bq = rx_ring->sbq_base; + + for (i = 0; i < rx_ring->sbq_len; i++) { + sbq_desc = &rx_ring->sbq[i]; + memset(sbq_desc, 0, sizeof(sbq_desc)); + sbq_desc->index = i; + sbq_desc->bq = bq; + skb = netdev_alloc_skb(qdev->ndev, rx_ring->sbq_buf_size); + if (unlikely(!skb)) { + /* Better luck next round */ + QPRINTK(qdev, IFUP, ERR, + "small buff alloc failed for %d bytes at index %d.\n", + rx_ring->sbq_buf_size, i); + goto mem_err; + } + skb_reserve(skb, QLGE_SB_PAD); + sbq_desc->p.skb = skb; + /* + * Map only half the buffer. Because the + * other half may get some data copied to it + * when the completion arrives. 
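+		 * sbq_buf_size is set to twice SMALL_BUFFER_SIZE in
+		 * ql_configure_rings(), so the device only ever DMAs into
+		 * the first half; the second half is presumably left as
+		 * room for the driver, e.g. when ql_build_rx_skb() appends
+		 * a "data" small buffer to a "header" small buffer.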
+ */ + map = pci_map_single(qdev->pdev, + skb->data, + rx_ring->sbq_buf_size / 2, + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(qdev->pdev, map)) { + QPRINTK(qdev, IFUP, ERR, "PCI mapping failed.\n"); + goto mem_err; + } + pci_unmap_addr_set(sbq_desc, mapaddr, map); + pci_unmap_len_set(sbq_desc, maplen, rx_ring->sbq_buf_size / 2); + bq->addr_lo = /*sbq_desc->addr_lo = */ + cpu_to_le32(map); + bq->addr_hi = /*sbq_desc->addr_hi = */ + cpu_to_le32(map >> 32); + bq++; + } + return 0; +mem_err: + ql_free_sbq_buffers(qdev, rx_ring); + return -ENOMEM; +} + +static void ql_free_rx_resources(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + if (rx_ring->sbq_len) + ql_free_sbq_buffers(qdev, rx_ring); + if (rx_ring->lbq_len) + ql_free_lbq_buffers(qdev, rx_ring); + + /* Free the small buffer queue. */ + if (rx_ring->sbq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->sbq_size, + rx_ring->sbq_base, rx_ring->sbq_base_dma); + rx_ring->sbq_base = NULL; + } + + /* Free the small buffer queue control blocks. */ + kfree(rx_ring->sbq); + rx_ring->sbq = NULL; + + /* Free the large buffer queue. */ + if (rx_ring->lbq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->lbq_size, + rx_ring->lbq_base, rx_ring->lbq_base_dma); + rx_ring->lbq_base = NULL; + } + + /* Free the large buffer queue control blocks. */ + kfree(rx_ring->lbq); + rx_ring->lbq = NULL; + + /* Free the rx queue. */ + if (rx_ring->cq_base) { + pci_free_consistent(qdev->pdev, + rx_ring->cq_size, + rx_ring->cq_base, rx_ring->cq_base_dma); + rx_ring->cq_base = NULL; + } +} + +/* Allocate queues and buffers for this completions queue based + * on the values in the parameter structure. */ +static int ql_alloc_rx_resources(struct ql_adapter *qdev, + struct rx_ring *rx_ring) +{ + + /* + * Allocate the completion queue for this rx_ring. + */ + rx_ring->cq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->cq_size, + &rx_ring->cq_base_dma); + + if (rx_ring->cq_base == NULL) { + QPRINTK(qdev, IFUP, ERR, "rx_ring alloc failed.\n"); + return -ENOMEM; + } + + if (rx_ring->sbq_len) { + /* + * Allocate small buffer queue. + */ + rx_ring->sbq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->sbq_size, + &rx_ring->sbq_base_dma); + + if (rx_ring->sbq_base == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Small buffer queue allocation failed.\n"); + goto err_mem; + } + + /* + * Allocate small buffer queue control blocks. + */ + rx_ring->sbq = + kmalloc(rx_ring->sbq_len * sizeof(struct bq_desc), + GFP_KERNEL); + if (rx_ring->sbq == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Small buffer queue control block allocation failed.\n"); + goto err_mem; + } + + if (ql_alloc_sbq_buffers(qdev, rx_ring)) { + QPRINTK(qdev, IFUP, ERR, + "Small buffer allocation failed.\n"); + goto err_mem; + } + } + + if (rx_ring->lbq_len) { + /* + * Allocate large buffer queue. + */ + rx_ring->lbq_base = + pci_alloc_consistent(qdev->pdev, rx_ring->lbq_size, + &rx_ring->lbq_base_dma); + + if (rx_ring->lbq_base == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Large buffer queue allocation failed.\n"); + goto err_mem; + } + /* + * Allocate large buffer queue control blocks. + */ + rx_ring->lbq = + kmalloc(rx_ring->lbq_len * sizeof(struct bq_desc), + GFP_KERNEL); + if (rx_ring->lbq == NULL) { + QPRINTK(qdev, IFUP, ERR, + "Large buffer queue control block allocation failed.\n"); + goto err_mem; + } + + /* + * Allocate the buffers. 
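+		 * Every lbq element gets a page allocated and mapped up
+		 * front here; at run time ql_update_lbq() only re-allocates
+		 * pages for entries whose page was handed up the stack.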
+ */ + if (ql_alloc_lbq_buffers(qdev, rx_ring)) { + QPRINTK(qdev, IFUP, ERR, + "Large buffer allocation failed.\n"); + goto err_mem; + } + } + + return 0; + +err_mem: + ql_free_rx_resources(qdev, rx_ring); + return -ENOMEM; +} + +static void ql_tx_ring_clean(struct ql_adapter *qdev) +{ + struct tx_ring *tx_ring; + struct tx_ring_desc *tx_ring_desc; + int i, j; + + /* + * Loop through all queues and free + * any resources. + */ + for (j = 0; j < qdev->tx_ring_count; j++) { + tx_ring = &qdev->tx_ring[j]; + for (i = 0; i < tx_ring->wq_len; i++) { + tx_ring_desc = &tx_ring->q[i]; + if (tx_ring_desc && tx_ring_desc->skb) { + QPRINTK(qdev, IFDOWN, ERR, + "Freeing lost SKB %p, from queue %d, index %d.\n", + tx_ring_desc->skb, j, + tx_ring_desc->index); + ql_unmap_send(qdev, tx_ring_desc, + tx_ring_desc->map_cnt); + dev_kfree_skb(tx_ring_desc->skb); + tx_ring_desc->skb = NULL; + } + } + } +} + +static void ql_free_ring_cb(struct ql_adapter *qdev) +{ + kfree(qdev->ring_mem); +} + +static int ql_alloc_ring_cb(struct ql_adapter *qdev) +{ + /* Allocate space for tx/rx ring control blocks. */ + qdev->ring_mem_size = + (qdev->tx_ring_count * sizeof(struct tx_ring)) + + (qdev->rx_ring_count * sizeof(struct rx_ring)); + qdev->ring_mem = kmalloc(qdev->ring_mem_size, GFP_KERNEL); + if (qdev->ring_mem == NULL) { + return -ENOMEM; + } else { + qdev->rx_ring = qdev->ring_mem; + qdev->tx_ring = qdev->ring_mem + + (qdev->rx_ring_count * sizeof(struct rx_ring)); + } + return 0; +} + +static void ql_free_mem_resources(struct ql_adapter *qdev) +{ + int i; + + for (i = 0; i < qdev->tx_ring_count; i++) + ql_free_tx_resources(qdev, &qdev->tx_ring[i]); + for (i = 0; i < qdev->rx_ring_count; i++) + ql_free_rx_resources(qdev, &qdev->rx_ring[i]); + ql_free_shadow_space(qdev); +} + +static int ql_alloc_mem_resources(struct ql_adapter *qdev) +{ + int i; + + /* Allocate space for our shadow registers and such. */ + if (ql_alloc_shadow_space(qdev)) + return -ENOMEM; + + for (i = 0; i < qdev->rx_ring_count; i++) { + if (ql_alloc_rx_resources(qdev, &qdev->rx_ring[i]) != 0) { + QPRINTK(qdev, IFUP, ERR, + "RX resource allocation failed.\n"); + goto err_mem; + } + } + /* Allocate tx queue resources */ + for (i = 0; i < qdev->tx_ring_count; i++) { + if (ql_alloc_tx_resources(qdev, &qdev->tx_ring[i]) != 0) { + QPRINTK(qdev, IFUP, ERR, + "TX resource allocation failed.\n"); + goto err_mem; + } + } + return 0; + +err_mem: + ql_free_mem_resources(qdev); + return -ENOMEM; +} + +/* Set up the rx ring control block and pass it to the chip. + * The control block is defined as + * "Completion Queue Initialization Control Block", or cqicb. + */ +static int ql_start_rx_ring(struct ql_adapter *qdev, struct rx_ring *rx_ring) +{ + struct cqicb *cqicb = &rx_ring->cqicb; + void *shadow_reg = qdev->rx_ring_shadow_reg_area + + (rx_ring->cq_id * sizeof(u64) * 4); + u64 shadow_reg_dma = qdev->rx_ring_shadow_reg_dma + + (rx_ring->cq_id * sizeof(u64) * 4); + void __iomem *doorbell_area = + qdev->doorbell_area + (DB_PAGE_SIZE * (128 + rx_ring->cq_id)); + int err = 0; + u16 bq_len; + + /* Set up the shadow registers for this ring. 
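+	 * Each completion queue owns four u64 slots in the shadow page set
+	 * up by ql_alloc_shadow_space():
+	 *
+	 *	slot 0: completion queue producer index, written by the chip
+	 *	slot 1: indirect pointer to the lbq base address
+	 *	slot 2: indirect pointer to the sbq base address
+	 *	slot 3: apparently spare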
*/ + rx_ring->prod_idx_sh_reg = shadow_reg; + rx_ring->prod_idx_sh_reg_dma = shadow_reg_dma; + shadow_reg += sizeof(u64); + shadow_reg_dma += sizeof(u64); + rx_ring->lbq_base_indirect = shadow_reg; + rx_ring->lbq_base_indirect_dma = shadow_reg_dma; + shadow_reg += sizeof(u64); + shadow_reg_dma += sizeof(u64); + rx_ring->sbq_base_indirect = shadow_reg; + rx_ring->sbq_base_indirect_dma = shadow_reg_dma; + + /* PCI doorbell mem area + 0x00 for consumer index register */ + rx_ring->cnsmr_idx_db_reg = (u32 *) doorbell_area; + rx_ring->cnsmr_idx = 0; + rx_ring->curr_entry = rx_ring->cq_base; + + /* PCI doorbell mem area + 0x04 for valid register */ + rx_ring->valid_db_reg = doorbell_area + 0x04; + + /* PCI doorbell mem area + 0x18 for large buffer consumer */ + rx_ring->lbq_prod_idx_db_reg = (u32 *) (doorbell_area + 0x18); + + /* PCI doorbell mem area + 0x1c */ + rx_ring->sbq_prod_idx_db_reg = (u32 *) (doorbell_area + 0x1c); + + memset((void *)cqicb, 0, sizeof(struct cqicb)); + cqicb->msix_vect = rx_ring->irq; + + cqicb->len = cpu_to_le16(rx_ring->cq_len | LEN_V | LEN_CPP_CONT); + + cqicb->addr_lo = cpu_to_le32(rx_ring->cq_base_dma); + cqicb->addr_hi = cpu_to_le32((u64) rx_ring->cq_base_dma >> 32); + + cqicb->prod_idx_addr_lo = cpu_to_le32(rx_ring->prod_idx_sh_reg_dma); + cqicb->prod_idx_addr_hi = + cpu_to_le32((u64) rx_ring->prod_idx_sh_reg_dma >> 32); + + /* + * Set up the control block load flags. + */ + cqicb->flags = FLAGS_LC | /* Load queue base address */ + FLAGS_LV | /* Load MSI-X vector */ + FLAGS_LI; /* Load irq delay values */ + if (rx_ring->lbq_len) { + cqicb->flags |= FLAGS_LL; /* Load lbq values */ + *((u64 *) rx_ring->lbq_base_indirect) = rx_ring->lbq_base_dma; + cqicb->lbq_addr_lo = + cpu_to_le32(rx_ring->lbq_base_indirect_dma); + cqicb->lbq_addr_hi = + cpu_to_le32((u64) rx_ring->lbq_base_indirect_dma >> 32); + cqicb->lbq_buf_size = cpu_to_le32(rx_ring->lbq_buf_size); + bq_len = (u16) rx_ring->lbq_len; + cqicb->lbq_len = cpu_to_le16(bq_len); + rx_ring->lbq_prod_idx = rx_ring->lbq_len - 16; + rx_ring->lbq_curr_idx = 0; + rx_ring->lbq_clean_idx = rx_ring->lbq_prod_idx; + rx_ring->lbq_free_cnt = 16; + } + if (rx_ring->sbq_len) { + cqicb->flags |= FLAGS_LS; /* Load sbq values */ + *((u64 *) rx_ring->sbq_base_indirect) = rx_ring->sbq_base_dma; + cqicb->sbq_addr_lo = + cpu_to_le32(rx_ring->sbq_base_indirect_dma); + cqicb->sbq_addr_hi = + cpu_to_le32((u64) rx_ring->sbq_base_indirect_dma >> 32); + cqicb->sbq_buf_size = + cpu_to_le16(((rx_ring->sbq_buf_size / 2) + 8) & 0xfffffff8); + bq_len = (u16) rx_ring->sbq_len; + cqicb->sbq_len = cpu_to_le16(bq_len); + rx_ring->sbq_prod_idx = rx_ring->sbq_len - 16; + rx_ring->sbq_curr_idx = 0; + rx_ring->sbq_clean_idx = rx_ring->sbq_prod_idx; + rx_ring->sbq_free_cnt = 16; + } + switch (rx_ring->type) { + case TX_Q: + /* If there's only one interrupt, then we use + * worker threads to process the outbound + * completion handling rx_rings. We do this so + * they can be run on multiple CPUs. There is + * room to play with this more where we would only + * run in a worker if there are more than x number + * of outbound completions on the queue and more + * than one queue active. Some threshold that + * would indicate a benefit in spite of the cost + * of a context switch. + * If there's more than one interrupt, then the + * outbound completions are processed in the ISR. 
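+		 * As coded below, both branches currently use the
+		 * ql_tx_clean() delayed work (see the WARN_ON note), so only
+		 * the RX_Q (RSS) rings run their completions in NAPI
+		 * context.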
+ */ + if (!test_bit(QL_MSIX_ENABLED, &qdev->flags)) + INIT_DELAYED_WORK(&rx_ring->rx_work, ql_tx_clean); + else { + /* With all debug warnings on we see a WARN_ON message + * when we free the skb in the interrupt context. + */ + INIT_DELAYED_WORK(&rx_ring->rx_work, ql_tx_clean); + } + cqicb->irq_delay = cpu_to_le16(qdev->tx_coalesce_usecs); + cqicb->pkt_delay = cpu_to_le16(qdev->tx_max_coalesced_frames); + break; + case DEFAULT_Q: + INIT_DELAYED_WORK(&rx_ring->rx_work, ql_rx_clean); + cqicb->irq_delay = 0; + cqicb->pkt_delay = 0; + break; + case RX_Q: + /* Inbound completion handling rx_rings run in + * separate NAPI contexts. + */ + netif_napi_add(qdev->ndev, &rx_ring->napi, ql_napi_poll_msix, + 64); + cqicb->irq_delay = cpu_to_le16(qdev->rx_coalesce_usecs); + cqicb->pkt_delay = cpu_to_le16(qdev->rx_max_coalesced_frames); + break; + default: + QPRINTK(qdev, IFUP, DEBUG, "Invalid rx_ring->type = %d.\n", + rx_ring->type); + } + QPRINTK(qdev, IFUP, INFO, "Initializing rx work queue.\n"); + err = ql_write_cfg(qdev, cqicb, sizeof(struct cqicb), + CFG_LCQ, rx_ring->cq_id); + if (err) { + QPRINTK(qdev, IFUP, ERR, "Failed to load CQICB.\n"); + return err; + } + QPRINTK(qdev, IFUP, INFO, "Successfully loaded CQICB.\n"); + /* + * Advance the producer index for the buffer queues. + */ + wmb(); + if (rx_ring->lbq_len) + ql_write_db_reg(rx_ring->lbq_prod_idx, + rx_ring->lbq_prod_idx_db_reg); + if (rx_ring->sbq_len) + ql_write_db_reg(rx_ring->sbq_prod_idx, + rx_ring->sbq_prod_idx_db_reg); + return err; +} + +static int ql_start_tx_ring(struct ql_adapter *qdev, struct tx_ring *tx_ring) +{ + struct wqicb *wqicb = (struct wqicb *)tx_ring; + void __iomem *doorbell_area = + qdev->doorbell_area + (DB_PAGE_SIZE * tx_ring->wq_id); + void *shadow_reg = qdev->tx_ring_shadow_reg_area + + (tx_ring->wq_id * sizeof(u64)); + u64 shadow_reg_dma = qdev->tx_ring_shadow_reg_dma + + (tx_ring->wq_id * sizeof(u64)); + int err = 0; + + /* + * Assign doorbell registers for this tx_ring. + */ + /* TX PCI doorbell mem area for tx producer index */ + tx_ring->prod_idx_db_reg = (u32 *) doorbell_area; + tx_ring->prod_idx = 0; + /* TX PCI doorbell mem area + 0x04 */ + tx_ring->valid_db_reg = doorbell_area + 0x04; + + /* + * Assign shadow registers for this tx_ring. 
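+	 * Each tx_ring gets a single u64 in the tx shadow page where the
+	 * chip writes back its consumer index, the outbound counterpart of
+	 * the rx producer-index shadow set up in ql_start_rx_ring().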
+ */ + tx_ring->cnsmr_idx_sh_reg = shadow_reg; + tx_ring->cnsmr_idx_sh_reg_dma = shadow_reg_dma; + + wqicb->len = cpu_to_le16(tx_ring->wq_len | Q_LEN_V | Q_LEN_CPP_CONT); + wqicb->flags = cpu_to_le16(Q_FLAGS_LC | + Q_FLAGS_LB | Q_FLAGS_LI | Q_FLAGS_LO); + wqicb->cq_id_rss = cpu_to_le16(tx_ring->cq_id); + wqicb->rid = 0; + wqicb->addr_lo = cpu_to_le32(tx_ring->wq_base_dma); + wqicb->addr_hi = cpu_to_le32((u64) tx_ring->wq_base_dma >> 32); + + wqicb->cnsmr_idx_addr_lo = cpu_to_le32(tx_ring->cnsmr_idx_sh_reg_dma); + wqicb->cnsmr_idx_addr_hi = + cpu_to_le32((u64) tx_ring->cnsmr_idx_sh_reg_dma >> 32); + + ql_init_tx_ring(qdev, tx_ring); + + err = ql_write_cfg(qdev, wqicb, sizeof(wqicb), CFG_LRQ, + (u16) tx_ring->wq_id); + if (err) { + QPRINTK(qdev, IFUP, ERR, "Failed to load tx_ring.\n"); + return err; + } + QPRINTK(qdev, IFUP, INFO, "Successfully loaded WQICB.\n"); + return err; +} + +static void ql_disable_msix(struct ql_adapter *qdev) +{ + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + pci_disable_msix(qdev->pdev); + clear_bit(QL_MSIX_ENABLED, &qdev->flags); + kfree(qdev->msi_x_entry); + qdev->msi_x_entry = NULL; + } else if (test_bit(QL_MSI_ENABLED, &qdev->flags)) { + pci_disable_msi(qdev->pdev); + clear_bit(QL_MSI_ENABLED, &qdev->flags); + } +} + +static void ql_enable_msix(struct ql_adapter *qdev) +{ + int i; + + qdev->intr_count = 1; + /* Get the MSIX vectors. */ + if (irq_type == MSIX_IRQ) { + /* Try to alloc space for the msix struct, + * if it fails then go to MSI/legacy. + */ + qdev->msi_x_entry = kcalloc(qdev->rx_ring_count, + sizeof(struct msix_entry), + GFP_KERNEL); + if (!qdev->msi_x_entry) { + irq_type = MSI_IRQ; + goto msi; + } + + for (i = 0; i < qdev->rx_ring_count; i++) + qdev->msi_x_entry[i].entry = i; + + if (!pci_enable_msix + (qdev->pdev, qdev->msi_x_entry, qdev->rx_ring_count)) { + set_bit(QL_MSIX_ENABLED, &qdev->flags); + qdev->intr_count = qdev->rx_ring_count; + QPRINTK(qdev, IFUP, INFO, + "MSI-X Enabled, got %d vectors.\n", + qdev->intr_count); + return; + } else { + kfree(qdev->msi_x_entry); + qdev->msi_x_entry = NULL; + QPRINTK(qdev, IFUP, WARNING, + "MSI-X Enable failed, trying MSI.\n"); + irq_type = MSI_IRQ; + } + } +msi: + if (irq_type == MSI_IRQ) { + if (!pci_enable_msi(qdev->pdev)) { + set_bit(QL_MSI_ENABLED, &qdev->flags); + QPRINTK(qdev, IFUP, INFO, + "Running with MSI interrupts.\n"); + return; + } + } + irq_type = LEG_IRQ; + spin_lock_init(&qdev->legacy_lock); + qdev->legacy_check = ql_legacy_check; + QPRINTK(qdev, IFUP, DEBUG, "Running with legacy interrupts.\n"); +} + +/* + * Here we build the intr_context structures based on + * our rx_ring count and intr vector count. + * The intr_context structure is used to hook each vector + * to possibly different handlers. + */ +static void ql_resolve_queues_to_irqs(struct ql_adapter *qdev) +{ + int i = 0; + struct intr_context *intr_context = &qdev->intr_context[0]; + + ql_enable_msix(qdev); + + if (likely(test_bit(QL_MSIX_ENABLED, &qdev->flags))) { + /* Each rx_ring has it's + * own intr_context since we have separate + * vectors for each queue. + * This only true when MSI-X is enabled. + */ + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + qdev->rx_ring[i].irq = i; + intr_context->intr = i; + intr_context->qdev = qdev; + /* + * We set up each vectors enable/disable/read bits so + * there's no bit/mask calculations in the critical path. 
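+			 * Each precomputed mask already carries the vector
+			 * number i and the IHD bits, so the enable/disable
+			 * helpers can write intr_en_mask/intr_dis_mask
+			 * straight to the hardware without recomputing them.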
+ */ + intr_context->intr_en_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_ENABLE | INTR_EN_IHD_MASK | INTR_EN_IHD + | i; + intr_context->intr_dis_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_DISABLE | INTR_EN_IHD_MASK | + INTR_EN_IHD | i; + intr_context->intr_read_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_READ | INTR_EN_IHD_MASK | INTR_EN_IHD | + i; + + if (i == 0) { + /* + * Default queue handles bcast/mcast plus + * async events. Needs buffers. + */ + intr_context->handler = qlge_isr; + sprintf(intr_context->name, "%s-default-queue", + qdev->ndev->name); + } else if (i < qdev->rss_ring_first_cq_id) { + /* + * Outbound queue is for outbound completions only. + */ + intr_context->handler = qlge_msix_tx_isr; + sprintf(intr_context->name, "%s-txq-%d", + qdev->ndev->name, i); + } else { + /* + * Inbound queues handle unicast frames only. + */ + intr_context->handler = qlge_msix_rx_isr; + sprintf(intr_context->name, "%s-rxq-%d", + qdev->ndev->name, i); + } + } + } else { + /* + * All rx_rings use the same intr_context since + * there is only one vector. + */ + intr_context->intr = 0; + intr_context->qdev = qdev; + /* + * We set up each vectors enable/disable/read bits so + * there's no bit/mask calculations in the critical path. + */ + intr_context->intr_en_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_ENABLE; + intr_context->intr_dis_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | + INTR_EN_TYPE_DISABLE; + intr_context->intr_read_mask = + INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_READ; + /* + * Single interrupt means one handler for all rings. + */ + intr_context->handler = qlge_isr; + sprintf(intr_context->name, "%s-single_irq", qdev->ndev->name); + for (i = 0; i < qdev->rx_ring_count; i++) + qdev->rx_ring[i].irq = 0; + } +} + +static void ql_free_irq(struct ql_adapter *qdev) +{ + int i; + struct intr_context *intr_context = &qdev->intr_context[0]; + + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + if (intr_context->hooked) { + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + free_irq(qdev->msi_x_entry[i].vector, + &qdev->rx_ring[i]); + QPRINTK(qdev, IFDOWN, ERR, + "freeing msix interrupt %d.\n", i); + } else { + free_irq(qdev->pdev->irq, &qdev->rx_ring[0]); + QPRINTK(qdev, IFDOWN, ERR, + "freeing msi interrupt %d.\n", i); + } + } + } + ql_disable_msix(qdev); +} + +static int ql_request_irq(struct ql_adapter *qdev) +{ + int i; + int status = 0; + struct pci_dev *pdev = qdev->pdev; + struct intr_context *intr_context = &qdev->intr_context[0]; + + ql_resolve_queues_to_irqs(qdev); + + for (i = 0; i < qdev->intr_count; i++, intr_context++) { + atomic_set(&intr_context->irq_cnt, 0); + if (test_bit(QL_MSIX_ENABLED, &qdev->flags)) { + status = request_irq(qdev->msi_x_entry[i].vector, + intr_context->handler, + 0, + intr_context->name, + &qdev->rx_ring[i]); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed request for MSIX interrupt %d.\n", + i); + goto err_irq; + } else { + QPRINTK(qdev, IFUP, INFO, + "Hooked intr %d, queue type %s%s%s, with name %s.\n", + i, + qdev->rx_ring[i].type == + DEFAULT_Q ? "DEFAULT_Q" : "", + qdev->rx_ring[i].type == + TX_Q ? "TX_Q" : "", + qdev->rx_ring[i].type == + RX_Q ? 
"RX_Q" : "", intr_context->name); + } + } else { + QPRINTK(qdev, IFUP, DEBUG, + "trying msi or legacy interrupts.\n"); + QPRINTK(qdev, IFUP, DEBUG, + "%s: irq = %d.\n", __func__, pdev->irq); + QPRINTK(qdev, IFUP, DEBUG, + "%s: context->name = %s.\n", __func__, + intr_context->name); + QPRINTK(qdev, IFUP, DEBUG, + "%s: dev_id = 0x%p.\n", __func__, + &qdev->rx_ring[0]); + status = + request_irq(pdev->irq, qlge_isr, + test_bit(QL_MSI_ENABLED, + &qdev-> + flags) ? 0 : IRQF_SHARED, + intr_context->name, &qdev->rx_ring[0]); + if (status) + goto err_irq; + + QPRINTK(qdev, IFUP, ERR, + "Hooked intr %d, queue type %s%s%s, with name %s.\n", + i, + qdev->rx_ring[0].type == + DEFAULT_Q ? "DEFAULT_Q" : "", + qdev->rx_ring[0].type == TX_Q ? "TX_Q" : "", + qdev->rx_ring[0].type == RX_Q ? "RX_Q" : "", + intr_context->name); + } + intr_context->hooked = 1; + } + return status; +err_irq: + QPRINTK(qdev, IFUP, ERR, "Failed to get the interrupts!!!/n"); + ql_free_irq(qdev); + return status; +} + +static int ql_start_rss(struct ql_adapter *qdev) +{ + struct ricb *ricb = &qdev->ricb; + int status = 0; + int i; + u8 *hash_id = (u8 *) ricb->hash_cq_id; + + memset((void *)ricb, 0, sizeof(ricb)); + + ricb->base_cq = qdev->rss_ring_first_cq_id | RSS_L4K; + ricb->flags = + (RSS_L6K | RSS_LI | RSS_LB | RSS_LM | RSS_RI4 | RSS_RI6 | RSS_RT4 | + RSS_RT6); + ricb->mask = cpu_to_le16(qdev->rss_ring_count - 1); + + /* + * Fill out the Indirection Table. + */ + for (i = 0; i < 32; i++) + hash_id[i] = i & 1; + + /* + * Random values for the IPv6 and IPv4 Hash Keys. + */ + get_random_bytes((void *)&ricb->ipv6_hash_key[0], 40); + get_random_bytes((void *)&ricb->ipv4_hash_key[0], 16); + + QPRINTK(qdev, IFUP, INFO, "Initializing RSS.\n"); + + status = ql_write_cfg(qdev, ricb, sizeof(ricb), CFG_LR, 0); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Failed to load RICB.\n"); + return status; + } + QPRINTK(qdev, IFUP, INFO, "Successfully loaded RICB.\n"); + return status; +} + +/* Initialize the frame-to-queue routing. */ +static int ql_route_initialize(struct ql_adapter *qdev) +{ + int status = 0; + int i; + + /* Clear all the entries in the routing table. */ + for (i = 0; i < 16; i++) { + status = ql_set_routing_reg(qdev, i, 0, 0); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to init routing register for CAM packets.\n"); + return status; + } + } + + status = ql_set_routing_reg(qdev, RT_IDX_ALL_ERR_SLOT, RT_IDX_ERR, 1); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to init routing register for error packets.\n"); + return status; + } + status = ql_set_routing_reg(qdev, RT_IDX_BCAST_SLOT, RT_IDX_BCAST, 1); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to init routing register for broadcast packets.\n"); + return status; + } + /* If we have more than one inbound queue, then turn on RSS in the + * routing block. + */ + if (qdev->rss_ring_count > 1) { + status = ql_set_routing_reg(qdev, RT_IDX_RSS_MATCH_SLOT, + RT_IDX_RSS_MATCH, 1); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to init routing register for MATCH RSS packets.\n"); + return status; + } + } + + status = ql_set_routing_reg(qdev, RT_IDX_CAM_HIT_SLOT, + RT_IDX_CAM_HIT, 1); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to init routing register for CAM packets.\n"); + return status; + } + return status; +} + +static int ql_adapter_initialize(struct ql_adapter *qdev) +{ + u32 value, mask; + int i; + int status = 0; + + /* + * Set up the System register to halt on errors. 
+ */ + value = SYS_EFE | SYS_FAE; + mask = value << 16; + ql_write32(qdev, SYS, mask | value); + + /* Set the default queue. */ + value = NIC_RCV_CFG_DFQ; + mask = NIC_RCV_CFG_DFQ_MASK; + ql_write32(qdev, NIC_RCV_CFG, (mask | value)); + + /* Set the MPI interrupt to enabled. */ + ql_write32(qdev, INTR_MASK, (INTR_MASK_PI << 16) | INTR_MASK_PI); + + /* Enable the function, set pagesize, enable error checking. */ + value = FSC_FE | FSC_EPC_INBOUND | FSC_EPC_OUTBOUND | + FSC_EC | FSC_VM_PAGE_4K | FSC_SH; + + /* Set/clear header splitting. */ + mask = FSC_VM_PAGESIZE_MASK | + FSC_DBL_MASK | FSC_DBRST_MASK | (value << 16); + ql_write32(qdev, FSC, mask | value); + + ql_write32(qdev, SPLT_HDR, SPLT_HDR_EP | + min(SMALL_BUFFER_SIZE, MAX_SPLIT_SIZE)); + + /* Start up the rx queues. */ + for (i = 0; i < qdev->rx_ring_count; i++) { + status = ql_start_rx_ring(qdev, &qdev->rx_ring[i]); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to start rx ring[%d].\n", i); + return status; + } + } + + /* If there is more than one inbound completion queue + * then download a RICB to configure RSS. + */ + if (qdev->rss_ring_count > 1) { + status = ql_start_rss(qdev); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Failed to start RSS.\n"); + return status; + } + } + + /* Start up the tx queues. */ + for (i = 0; i < qdev->tx_ring_count; i++) { + status = ql_start_tx_ring(qdev, &qdev->tx_ring[i]); + if (status) { + QPRINTK(qdev, IFUP, ERR, + "Failed to start tx ring[%d].\n", i); + return status; + } + } + + status = ql_port_initialize(qdev); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Failed to start port.\n"); + return status; + } + + status = ql_set_mac_addr_reg(qdev, (u8 *) qdev->ndev->perm_addr, + MAC_ADDR_TYPE_CAM_MAC, qdev->func); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Failed to init mac address.\n"); + return status; + } + + status = ql_route_initialize(qdev); + if (status) { + QPRINTK(qdev, IFUP, ERR, "Failed to init routing table.\n"); + return status; + } + + /* Start NAPI for the RSS queues. */ + for (i = qdev->rss_ring_first_cq_id; i < qdev->rx_ring_count; i++) { + QPRINTK(qdev, IFUP, INFO, "Enabling NAPI for rx_ring[%d].\n", + i); + napi_enable(&qdev->rx_ring[i].napi); + } + + return status; +} + +/* Issue soft reset to chip. */ +static int ql_adapter_reset(struct ql_adapter *qdev) +{ + u32 value; + int max_wait_time; + int status = 0; + int resetCnt = 0; + +#define MAX_RESET_CNT 1 +issueReset: + resetCnt++; + QPRINTK(qdev, IFDOWN, DEBUG, "Issue soft reset to chip.\n"); + ql_write32(qdev, RST_FO, (RST_FO_FR << 16) | RST_FO_FR); + /* Wait for reset to complete. */ + max_wait_time = 3; + QPRINTK(qdev, IFDOWN, DEBUG, "Wait %d seconds for reset to complete.\n", + max_wait_time); + do { + value = ql_read32(qdev, RST_FO); + if ((value & RST_FO_FR) == 0) + break; + + ssleep(1); + } while ((--max_wait_time)); + if (value & RST_FO_FR) { + QPRINTK(qdev, IFDOWN, ERR, + "Stuck in SoftReset: FSC_SR:0x%08x\n", value); + if (resetCnt < MAX_RESET_CNT) + goto issueReset; + } + if (max_wait_time == 0) { + status = -ETIMEDOUT; + QPRINTK(qdev, IFDOWN, ERR, + "ETIMEOUT!!! 
errored out of resetting the chip!\n"); + } + + return status; +} + +static void ql_display_dev_info(struct net_device *ndev) +{ + struct ql_adapter *qdev = (struct ql_adapter *)netdev_priv(ndev); + + QPRINTK(qdev, PROBE, INFO, + "Function #%d, NIC Roll %d, NIC Rev = %d, " + "XG Roll = %d, XG Rev = %d.\n", + qdev->func, + qdev->chip_rev_id & 0x0000000f, + qdev->chip_rev_id >> 4 & 0x0000000f, + qdev->chip_rev_id >> 8 & 0x0000000f, + qdev->chip_rev_id >> 12 & 0x0000000f); + QPRINTK(qdev, PROBE, INFO, + "MAC address %02x:%02x:%02x:%02x:%02x:%02x\n", + ndev->dev_addr[0], ndev->dev_addr[1], + ndev->dev_addr[2], ndev->dev_addr[3], ndev->dev_addr[4], + ndev->dev_addr[5]); +} + +static int ql_adapter_down(struct ql_adapter *qdev) +{ + struct net_device *ndev = qdev->ndev; + int i, status = 0; + struct rx_ring *rx_ring; + + netif_stop_queue(ndev); + netif_carrier_off(ndev); + + cancel_delayed_work_sync(&qdev->asic_reset_work); + cancel_delayed_work_sync(&qdev->mpi_reset_work); + cancel_delayed_work_sync(&qdev->mpi_work); + + /* The default queue at index 0 is always processed in + * a workqueue. + */ + cancel_delayed_work_sync(&qdev->rx_ring[0].rx_work); + + /* The rest of the rx_rings are processed in + * a workqueue only if it's a single interrupt + * environment (MSI/Legacy). + */ + for (i = 1; i > qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + /* Only the RSS rings use NAPI on multi irq + * environment. Outbound completion processing + * is done in interrupt context. + */ + if (i >= qdev->rss_ring_first_cq_id) { + napi_disable(&rx_ring->napi); + } else { + cancel_delayed_work_sync(&rx_ring->rx_work); + } + } + + clear_bit(QL_ADAPTER_UP, &qdev->flags); + + ql_disable_interrupts(qdev); + + ql_tx_ring_clean(qdev); + + spin_lock(&qdev->hw_lock); + status = ql_adapter_reset(qdev); + if (status) + QPRINTK(qdev, IFDOWN, ERR, "reset(func #%d) FAILED!\n", + qdev->func); + spin_unlock(&qdev->hw_lock); + return status; +} + +static int ql_adapter_up(struct ql_adapter *qdev) +{ + int err = 0; + + spin_lock(&qdev->hw_lock); + err = ql_adapter_initialize(qdev); + if (err) { + QPRINTK(qdev, IFUP, INFO, "Unable to initialize adapter.\n"); + spin_unlock(&qdev->hw_lock); + goto err_init; + } + spin_unlock(&qdev->hw_lock); + set_bit(QL_ADAPTER_UP, &qdev->flags); + ql_enable_interrupts(qdev); + ql_enable_all_completion_interrupts(qdev); + if ((ql_read32(qdev, STS) & qdev->port_init)) { + netif_carrier_on(qdev->ndev); + netif_start_queue(qdev->ndev); + } + + return 0; +err_init: + ql_adapter_reset(qdev); + return err; +} + +static int ql_cycle_adapter(struct ql_adapter *qdev) +{ + int status; + + status = ql_adapter_down(qdev); + if (status) + goto error; + + status = ql_adapter_up(qdev); + if (status) + goto error; + + return status; +error: + QPRINTK(qdev, IFUP, ALERT, + "Driver up/down cycle failed, closing device\n"); + rtnl_lock(); + dev_close(qdev->ndev); + rtnl_unlock(); + return status; +} + +static void ql_release_adapter_resources(struct ql_adapter *qdev) +{ + ql_free_mem_resources(qdev); + ql_free_irq(qdev); +} + +static int ql_get_adapter_resources(struct ql_adapter *qdev) +{ + int status = 0; + + if (ql_alloc_mem_resources(qdev)) { + QPRINTK(qdev, IFUP, ERR, "Unable to allocate memory.\n"); + return -ENOMEM; + } + status = ql_request_irq(qdev); + if (status) + goto err_irq; + return status; +err_irq: + ql_free_mem_resources(qdev); + return status; +} + +static int qlge_close(struct net_device *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + /* + * Wait for device to recover 
from a reset. + * (Rarely happens, but possible.) + */ + while (!test_bit(QL_ADAPTER_UP, &qdev->flags)) + msleep(1); + ql_adapter_down(qdev); + ql_release_adapter_resources(qdev); + ql_free_ring_cb(qdev); + return 0; +} + +static int ql_configure_rings(struct ql_adapter *qdev) +{ + int i; + struct rx_ring *rx_ring; + struct tx_ring *tx_ring; + int cpu_cnt = num_online_cpus(); + + /* + * For each processor present we allocate one + * rx_ring for outbound completions, and one + * rx_ring for inbound completions. Plus there is + * always the one default queue. For the CPU + * counts we end up with the following rx_rings: + * rx_ring count = + * one default queue + + * (CPU count * outbound completion rx_ring) + + * (CPU count * inbound (RSS) completion rx_ring) + * To keep it simple we limit the total number of + * queues to < 32, so we truncate CPU to 8. + * This limitation can be removed when requested. + */ + + if (cpu_cnt > 8) + cpu_cnt = 8; + + /* + * rx_ring[0] is always the default queue. + */ + /* Allocate outbound completion ring for each CPU. */ + qdev->tx_ring_count = cpu_cnt; + /* Allocate inbound completion (RSS) ring for each CPU. */ + qdev->rss_ring_count = cpu_cnt; + /* cq_id for the first inbound ring handler. */ + qdev->rss_ring_first_cq_id = cpu_cnt + 1; + /* + * qdev->rx_ring_count: + * Total number of rx_rings. This includes the one + * default queue, a number of outbound completion + * handler rx_rings, and the number of inbound + * completion handler rx_rings. + */ + qdev->rx_ring_count = qdev->tx_ring_count + qdev->rss_ring_count + 1; + + if (ql_alloc_ring_cb(qdev)) + return -ENOMEM; + + for (i = 0; i < qdev->tx_ring_count; i++) { + tx_ring = &qdev->tx_ring[i]; + memset((void *)tx_ring, 0, sizeof(tx_ring)); + tx_ring->qdev = qdev; + tx_ring->wq_id = i; + tx_ring->wq_len = qdev->tx_ring_size; + tx_ring->wq_size = + tx_ring->wq_len * sizeof(struct ob_mac_iocb_req); + + /* + * The completion queue ID for the tx rings start + * immediately after the default Q ID, which is zero. + */ + tx_ring->cq_id = i + 1; + } + + for (i = 0; i < qdev->rx_ring_count; i++) { + rx_ring = &qdev->rx_ring[i]; + memset((void *)rx_ring, 0, sizeof(rx_ring)); + rx_ring->qdev = qdev; + rx_ring->cq_id = i; + rx_ring->cpu = i % cpu_cnt; /* CPU to run handler on. */ + if (i == 0) { /* Default queue at index 0. */ + /* + * Default queue handles bcast/mcast plus + * async events. Needs buffers. + */ + rx_ring->cq_len = qdev->rx_ring_size; + rx_ring->cq_size = + rx_ring->cq_len * sizeof(struct ql_net_rsp_iocb); + rx_ring->lbq_len = NUM_LARGE_BUFFERS; + rx_ring->lbq_size = + rx_ring->lbq_len * sizeof(struct bq_element); + rx_ring->lbq_buf_size = LARGE_BUFFER_SIZE; + rx_ring->sbq_len = NUM_SMALL_BUFFERS; + rx_ring->sbq_size = + rx_ring->sbq_len * sizeof(struct bq_element); + rx_ring->sbq_buf_size = SMALL_BUFFER_SIZE * 2; + rx_ring->type = DEFAULT_Q; + } else if (i < qdev->rss_ring_first_cq_id) { + /* + * Outbound queue handles outbound completions only. + */ + /* outbound cq is same size as tx_ring it services. */ + rx_ring->cq_len = qdev->tx_ring_size; + rx_ring->cq_size = + rx_ring->cq_len * sizeof(struct ql_net_rsp_iocb); + rx_ring->lbq_len = 0; + rx_ring->lbq_size = 0; + rx_ring->lbq_buf_size = 0; + rx_ring->sbq_len = 0; + rx_ring->sbq_size = 0; + rx_ring->sbq_buf_size = 0; + rx_ring->type = TX_Q; + } else { /* Inbound completions (RSS) queues */ + /* + * Inbound queues handle unicast frames only. 
+ */ + rx_ring->cq_len = qdev->rx_ring_size; + rx_ring->cq_size = + rx_ring->cq_len * sizeof(struct ql_net_rsp_iocb); + rx_ring->lbq_len = NUM_LARGE_BUFFERS; + rx_ring->lbq_size = + rx_ring->lbq_len * sizeof(struct bq_element); + rx_ring->lbq_buf_size = LARGE_BUFFER_SIZE; + rx_ring->sbq_len = NUM_SMALL_BUFFERS; + rx_ring->sbq_size = + rx_ring->sbq_len * sizeof(struct bq_element); + rx_ring->sbq_buf_size = SMALL_BUFFER_SIZE * 2; + rx_ring->type = RX_Q; + } + } + return 0; +} + +static int qlge_open(struct net_device *ndev) +{ + int err = 0; + struct ql_adapter *qdev = netdev_priv(ndev); + + err = ql_configure_rings(qdev); + if (err) + return err; + + err = ql_get_adapter_resources(qdev); + if (err) + goto error_up; + + err = ql_adapter_up(qdev); + if (err) + goto error_up; + + return err; + +error_up: + ql_release_adapter_resources(qdev); + ql_free_ring_cb(qdev); + return err; +} + +static int qlge_change_mtu(struct net_device *ndev, int new_mtu) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + + if (ndev->mtu == 1500 && new_mtu == 9000) { + QPRINTK(qdev, IFUP, ERR, "Changing to jumbo MTU.\n"); + } else if (ndev->mtu == 9000 && new_mtu == 1500) { + QPRINTK(qdev, IFUP, ERR, "Changing to normal MTU.\n"); + } else if ((ndev->mtu == 1500 && new_mtu == 1500) || + (ndev->mtu == 9000 && new_mtu == 9000)) { + return 0; + } else + return -EINVAL; + ndev->mtu = new_mtu; + return 0; +} + +static struct net_device_stats *qlge_get_stats(struct net_device + *ndev) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + return &qdev->stats; +} + +static void qlge_set_multicast_list(struct net_device *ndev) +{ + struct ql_adapter *qdev = (struct ql_adapter *)netdev_priv(ndev); + struct dev_mc_list *mc_ptr; + int i; + + spin_lock(&qdev->hw_lock); + /* + * Set or clear promiscuous mode if a + * transition is taking place. + */ + if (ndev->flags & IFF_PROMISC) { + if (!test_bit(QL_PROMISCUOUS, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 1)) { + QPRINTK(qdev, HW, ERR, + "Failed to set promiscous mode.\n"); + } else { + set_bit(QL_PROMISCUOUS, &qdev->flags); + } + } + } else { + if (test_bit(QL_PROMISCUOUS, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 0)) { + QPRINTK(qdev, HW, ERR, + "Failed to clear promiscous mode.\n"); + } else { + clear_bit(QL_PROMISCUOUS, &qdev->flags); + } + } + } + + /* + * Set or clear all multicast mode if a + * transition is taking place. 
+ */ + if ((ndev->flags & IFF_ALLMULTI) || + (ndev->mc_count > MAX_MULTICAST_ENTRIES)) { + if (!test_bit(QL_ALLMULTI, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 1)) { + QPRINTK(qdev, HW, ERR, + "Failed to set all-multi mode.\n"); + } else { + set_bit(QL_ALLMULTI, &qdev->flags); + } + } + } else { + if (test_bit(QL_ALLMULTI, &qdev->flags)) { + if (ql_set_routing_reg + (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 0)) { + QPRINTK(qdev, HW, ERR, + "Failed to clear all-multi mode.\n"); + } else { + clear_bit(QL_ALLMULTI, &qdev->flags); + } + } + } + + if (ndev->mc_count) { + for (i = 0, mc_ptr = ndev->mc_list; mc_ptr; + i++, mc_ptr = mc_ptr->next) + if (ql_set_mac_addr_reg(qdev, (u8 *) mc_ptr->dmi_addr, + MAC_ADDR_TYPE_MULTI_MAC, i)) { + QPRINTK(qdev, HW, ERR, + "Failed to loadmulticast address.\n"); + goto exit; + } + if (ql_set_routing_reg + (qdev, RT_IDX_MCAST_MATCH_SLOT, RT_IDX_MCAST_MATCH, 1)) { + QPRINTK(qdev, HW, ERR, + "Failed to set multicast match mode.\n"); + } else { + set_bit(QL_ALLMULTI, &qdev->flags); + } + } +exit: + spin_unlock(&qdev->hw_lock); +} + +static int qlge_set_mac_address(struct net_device *ndev, void *p) +{ + struct ql_adapter *qdev = (struct ql_adapter *)netdev_priv(ndev); + struct sockaddr *addr = p; + + if (netif_running(ndev)) + return -EBUSY; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); + + spin_lock(&qdev->hw_lock); + if (ql_set_mac_addr_reg(qdev, (u8 *) ndev->dev_addr, + MAC_ADDR_TYPE_CAM_MAC, qdev->func)) {/* Unicast */ + QPRINTK(qdev, HW, ERR, "Failed to load MAC address.\n"); + return -1; + } + spin_unlock(&qdev->hw_lock); + + return 0; +} + +static void qlge_tx_timeout(struct net_device *ndev) +{ + struct ql_adapter *qdev = (struct ql_adapter *)netdev_priv(ndev); + queue_delayed_work(qdev->workqueue, &qdev->asic_reset_work, 0); +} + +static void ql_asic_reset_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, asic_reset_work.work); + ql_cycle_adapter(qdev); +} + +static void ql_get_board_info(struct ql_adapter *qdev) +{ + qdev->func = + (ql_read32(qdev, STS) & STS_FUNC_ID_MASK) >> STS_FUNC_ID_SHIFT; + if (qdev->func) { + qdev->xg_sem_mask = SEM_XGMAC1_MASK; + qdev->port_link_up = STS_PL1; + qdev->port_init = STS_PI1; + qdev->mailbox_in = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC2_MBI; + qdev->mailbox_out = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC2_MBO; + } else { + qdev->xg_sem_mask = SEM_XGMAC0_MASK; + qdev->port_link_up = STS_PL0; + qdev->port_init = STS_PI0; + qdev->mailbox_in = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC0_MBI; + qdev->mailbox_out = PROC_ADDR_MPI_RISC | PROC_ADDR_FUNC0_MBO; + } + qdev->chip_rev_id = ql_read32(qdev, REV_ID); +} + +static void ql_release_all(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + if (qdev->workqueue) { + destroy_workqueue(qdev->workqueue); + qdev->workqueue = NULL; + } + if (qdev->q_workqueue) { + destroy_workqueue(qdev->q_workqueue); + qdev->q_workqueue = NULL; + } + if (qdev->reg_base) + iounmap((void *)qdev->reg_base); + if (qdev->doorbell_area) + iounmap(qdev->doorbell_area); + pci_release_regions(pdev); + pci_set_drvdata(pdev, NULL); +} + +static int __devinit ql_init_device(struct pci_dev *pdev, + struct net_device *ndev, int cards_found) +{ + struct ql_adapter *qdev = netdev_priv(ndev); + int pos, err = 0; + u16 val16; + + memset((void *)qdev, 0, sizeof(qdev)); + err = 
pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "PCI device enable failed.\n"); + return err; + } + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pos <= 0) { + dev_err(&pdev->dev, PFX "Cannot find PCI Express capability, " + "aborting.\n"); + goto err_out; + } else { + pci_read_config_word(pdev, pos + PCI_EXP_DEVCTL, &val16); + val16 &= ~PCI_EXP_DEVCTL_NOSNOOP_EN; + val16 |= (PCI_EXP_DEVCTL_CERE | + PCI_EXP_DEVCTL_NFERE | + PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE); + pci_write_config_word(pdev, pos + PCI_EXP_DEVCTL, val16); + } + + err = pci_request_regions(pdev, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "PCI region request failed.\n"); + goto err_out; + } + + pci_set_master(pdev); + if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { + set_bit(QL_DMA64, &qdev->flags); + err = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); + } else { + err = pci_set_dma_mask(pdev, DMA_32BIT_MASK); + if (!err) + err = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + } + + if (err) { + dev_err(&pdev->dev, "No usable DMA configuration.\n"); + goto err_out; + } + + pci_set_drvdata(pdev, ndev); + qdev->reg_base = + ioremap_nocache(pci_resource_start(pdev, 1), + pci_resource_len(pdev, 1)); + if (!qdev->reg_base) { + dev_err(&pdev->dev, "Register mapping failed.\n"); + err = -ENOMEM; + goto err_out; + } + + qdev->doorbell_area_size = pci_resource_len(pdev, 3); + qdev->doorbell_area = + ioremap_nocache(pci_resource_start(pdev, 3), + pci_resource_len(pdev, 3)); + if (!qdev->doorbell_area) { + dev_err(&pdev->dev, "Doorbell register mapping failed.\n"); + err = -ENOMEM; + goto err_out; + } + + ql_get_board_info(qdev); + qdev->ndev = ndev; + qdev->pdev = pdev; + qdev->msg_enable = netif_msg_init(debug, default_msg); + spin_lock_init(&qdev->hw_lock); + spin_lock_init(&qdev->stats_lock); + + /* make sure the EEPROM is good */ + err = ql_get_flash_params(qdev); + if (err) { + dev_err(&pdev->dev, "Invalid FLASH.\n"); + goto err_out; + } + + if (!is_valid_ether_addr(qdev->flash.mac_addr)) + goto err_out; + + memcpy(ndev->dev_addr, qdev->flash.mac_addr, ndev->addr_len); + memcpy(ndev->perm_addr, ndev->dev_addr, ndev->addr_len); + + /* Set up the default ring sizes. */ + qdev->tx_ring_size = NUM_TX_RING_ENTRIES; + qdev->rx_ring_size = NUM_RX_RING_ENTRIES; + + /* Set up the coalescing parameters. */ + qdev->rx_coalesce_usecs = DFLT_COALESCE_WAIT; + qdev->tx_coalesce_usecs = DFLT_COALESCE_WAIT; + qdev->rx_max_coalesced_frames = DFLT_INTER_FRAME_WAIT; + qdev->tx_max_coalesced_frames = DFLT_INTER_FRAME_WAIT; + + /* + * Set up the operating parameters. 
+ */ + qdev->rx_csum = 1; + + qdev->q_workqueue = create_workqueue(ndev->name); + qdev->workqueue = create_singlethread_workqueue(ndev->name); + INIT_DELAYED_WORK(&qdev->asic_reset_work, ql_asic_reset_work); + INIT_DELAYED_WORK(&qdev->mpi_reset_work, ql_mpi_reset_work); + INIT_DELAYED_WORK(&qdev->mpi_work, ql_mpi_work); + + if (!cards_found) { + dev_info(&pdev->dev, "%s\n", DRV_STRING); + dev_info(&pdev->dev, "Driver name: %s, Version: %s.\n", + DRV_NAME, DRV_VERSION); + } + return 0; +err_out: + ql_release_all(pdev); + pci_disable_device(pdev); + return err; +} + +static int __devinit qlge_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_entry) +{ + struct net_device *ndev = NULL; + struct ql_adapter *qdev = NULL; + static int cards_found = 0; + int err = 0; + + ndev = alloc_etherdev(sizeof(struct ql_adapter)); + if (!ndev) + return -ENOMEM; + + err = ql_init_device(pdev, ndev, cards_found); + if (err < 0) { + free_netdev(ndev); + return err; + } + + qdev = netdev_priv(ndev); + SET_NETDEV_DEV(ndev, &pdev->dev); + ndev->features = (0 + | NETIF_F_IP_CSUM + | NETIF_F_SG + | NETIF_F_TSO + | NETIF_F_TSO6 + | NETIF_F_TSO_ECN + | NETIF_F_HW_VLAN_TX + | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER); + + if (test_bit(QL_DMA64, &qdev->flags)) + ndev->features |= NETIF_F_HIGHDMA; + + /* + * Set up net_device structure. + */ + ndev->tx_queue_len = qdev->tx_ring_size; + ndev->irq = pdev->irq; + ndev->open = qlge_open; + ndev->stop = qlge_close; + ndev->hard_start_xmit = qlge_send; + SET_ETHTOOL_OPS(ndev, &qlge_ethtool_ops); + ndev->change_mtu = qlge_change_mtu; + ndev->get_stats = qlge_get_stats; + ndev->set_multicast_list = qlge_set_multicast_list; + ndev->set_mac_address = qlge_set_mac_address; + ndev->tx_timeout = qlge_tx_timeout; + ndev->watchdog_timeo = 10 * HZ; + ndev->vlan_rx_register = ql_vlan_rx_register; + ndev->vlan_rx_add_vid = ql_vlan_rx_add_vid; + ndev->vlan_rx_kill_vid = ql_vlan_rx_kill_vid; + err = register_netdev(ndev); + if (err) { + dev_err(&pdev->dev, "net device registration failed.\n"); + ql_release_all(pdev); + pci_disable_device(pdev); + return err; + } + netif_carrier_off(ndev); + netif_stop_queue(ndev); + ql_display_dev_info(ndev); + cards_found++; + return 0; +} + +static void __devexit qlge_remove(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + unregister_netdev(ndev); + ql_release_all(pdev); + pci_disable_device(pdev); + free_netdev(ndev); +} + +/* + * This callback is called by the PCI subsystem whenever + * a PCI bus error is detected. + */ +static pci_ers_result_t qlge_io_error_detected(struct pci_dev *pdev, + enum pci_channel_state state) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + if (netif_running(ndev)) + ql_adapter_down(qdev); + + pci_disable_device(pdev); + + /* Request a slot reset. */ + return PCI_ERS_RESULT_NEED_RESET; +} + +/* + * This callback is called after the PCI buss has been reset. + * Basically, this tries to restart the card from scratch. + * This is a shortened version of the device probe/discovery code, + * it resembles the first-half of the () routine. 
+ */ +static pci_ers_result_t qlge_io_slot_reset(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + if (pci_enable_device(pdev)) { + QPRINTK(qdev, IFUP, ERR, + "Cannot re-enable PCI device after reset.\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + + pci_set_master(pdev); + + netif_carrier_off(ndev); + netif_stop_queue(ndev); + ql_adapter_reset(qdev); + + /* Make sure the EEPROM is good */ + memcpy(ndev->perm_addr, ndev->dev_addr, ndev->addr_len); + + if (!is_valid_ether_addr(ndev->perm_addr)) { + QPRINTK(qdev, IFUP, ERR, "After reset, invalid MAC address.\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_RECOVERED; +} + +static void qlge_io_resume(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + + pci_set_master(pdev); + + if (netif_running(ndev)) { + if (ql_adapter_up(qdev)) { + QPRINTK(qdev, IFUP, ERR, + "Device initialization failed after reset.\n"); + return; + } + } + + netif_device_attach(ndev); +} + +static struct pci_error_handlers qlge_err_handler = { + .error_detected = qlge_io_error_detected, + .slot_reset = qlge_io_slot_reset, + .resume = qlge_io_resume, +}; + +static int qlge_suspend(struct pci_dev *pdev, pm_message_t state) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + int err; + + netif_device_detach(ndev); + + if (netif_running(ndev)) { + err = ql_adapter_down(qdev); + if (!err) + return err; + } + + err = pci_save_state(pdev); + if (err) + return err; + + pci_disable_device(pdev); + + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + + return 0; +} + +static int qlge_resume(struct pci_dev *pdev) +{ + struct net_device *ndev = pci_get_drvdata(pdev); + struct ql_adapter *qdev = netdev_priv(ndev); + int err; + + pci_set_power_state(pdev, PCI_D0); + pci_restore_state(pdev); + err = pci_enable_device(pdev); + if (err) { + QPRINTK(qdev, IFUP, ERR, "Cannot enable PCI device from suspend\n"); + return err; + } + pci_set_master(pdev); + + pci_enable_wake(pdev, PCI_D3hot, 0); + pci_enable_wake(pdev, PCI_D3cold, 0); + + if (netif_running(ndev)) { + err = ql_adapter_up(qdev); + if (err) + return err; + } + + netif_device_attach(ndev); + + return 0; +} + +static void qlge_shutdown(struct pci_dev *pdev) +{ + qlge_suspend(pdev, PMSG_SUSPEND); +} + +static struct pci_driver qlge_driver = { + .name = DRV_NAME, + .id_table = qlge_pci_tbl, + .probe = qlge_probe, + .remove = __devexit_p(qlge_remove), +#ifdef CONFIG_PM + .suspend = qlge_suspend, + .resume = qlge_resume, +#endif + .shutdown = qlge_shutdown, + .err_handler = &qlge_err_handler +}; + +static int __init qlge_init_module(void) +{ + return pci_register_driver(&qlge_driver); +} + +static void __exit qlge_exit(void) +{ + pci_unregister_driver(&qlge_driver); +} + +module_init(qlge_init_module); +module_exit(qlge_exit); diff --git a/drivers/net/qlge/qlge_mpi.c b/drivers/net/qlge/qlge_mpi.c new file mode 100644 index 00000000000..24fe344bcf1 --- /dev/null +++ b/drivers/net/qlge/qlge_mpi.c @@ -0,0 +1,150 @@ +#include "qlge.h" + +static int ql_read_mbox_reg(struct ql_adapter *qdev, u32 reg, u32 *data) +{ + int status; + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; + /* set up for reg read */ + ql_write32(qdev, PROC_ADDR, reg | PROC_ADDR_R); + /* wait for reg to come ready */ + status = ql_wait_reg_rdy(qdev, 
PROC_ADDR, PROC_ADDR_RDY, PROC_ADDR_ERR); + if (status) + goto exit; + /* get the data */ + *data = ql_read32(qdev, PROC_DATA); +exit: + return status; +} + +int ql_get_mb_sts(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + int i, status; + + status = ql_sem_spinlock(qdev, SEM_PROC_REG_MASK); + if (status) + return -EBUSY; + for (i = 0; i < mbcp->out_count; i++) { + status = + ql_read_mbox_reg(qdev, qdev->mailbox_out + i, + &mbcp->mbox_out[i]); + if (status) { + QPRINTK(qdev, DRV, ERR, "Failed mailbox read.\n"); + break; + } + } + ql_sem_unlock(qdev, SEM_PROC_REG_MASK); /* does flush too */ + return status; +} + +static void ql_link_up(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + mbcp->out_count = 2; + + if (ql_get_mb_sts(qdev, mbcp)) + goto exit; + + qdev->link_status = mbcp->mbox_out[1]; + QPRINTK(qdev, DRV, ERR, "Link Up.\n"); + QPRINTK(qdev, DRV, INFO, "Link Status = 0x%.08x.\n", mbcp->mbox_out[1]); + if (!netif_carrier_ok(qdev->ndev)) { + QPRINTK(qdev, LINK, INFO, "Link is Up.\n"); + netif_carrier_on(qdev->ndev); + netif_wake_queue(qdev->ndev); + } +exit: + /* Clear the MPI firmware status. */ + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); +} + +static void ql_link_down(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + mbcp->out_count = 3; + + if (ql_get_mb_sts(qdev, mbcp)) { + QPRINTK(qdev, DRV, ERR, "Firmware did not initialize!\n"); + goto exit; + } + + if (netif_carrier_ok(qdev->ndev)) { + QPRINTK(qdev, LINK, INFO, "Link is Down.\n"); + netif_carrier_off(qdev->ndev); + netif_stop_queue(qdev->ndev); + } + QPRINTK(qdev, DRV, ERR, "Link Down.\n"); + QPRINTK(qdev, DRV, ERR, "Link Status = 0x%.08x.\n", mbcp->mbox_out[1]); +exit: + /* Clear the MPI firmware status. */ + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); +} + +static void ql_init_fw_done(struct ql_adapter *qdev, struct mbox_params *mbcp) +{ + mbcp->out_count = 2; + + if (ql_get_mb_sts(qdev, mbcp)) { + QPRINTK(qdev, DRV, ERR, "Firmware did not initialize!\n"); + goto exit; + } + QPRINTK(qdev, DRV, ERR, "Firmware initialized!\n"); + QPRINTK(qdev, DRV, ERR, "Firmware status = 0x%.08x.\n", + mbcp->mbox_out[0]); + QPRINTK(qdev, DRV, ERR, "Firmware Revision = 0x%.08x.\n", + mbcp->mbox_out[1]); +exit: + /* Clear the MPI firmware status. */ + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); +} + +void ql_mpi_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_work.work); + struct mbox_params mbc; + struct mbox_params *mbcp = &mbc; + mbcp->out_count = 1; + + while (ql_read32(qdev, STS) & STS_PI) { + if (ql_get_mb_sts(qdev, mbcp)) { + QPRINTK(qdev, DRV, ERR, + "Could not read MPI, resetting ASIC!\n"); + ql_queue_asic_error(qdev); + } + + switch (mbcp->mbox_out[0]) { + case AEN_LINK_UP: + ql_link_up(qdev, mbcp); + break; + case AEN_LINK_DOWN: + ql_link_down(qdev, mbcp); + break; + case AEN_FW_INIT_DONE: + ql_init_fw_done(qdev, mbcp); + break; + case MB_CMD_STS_GOOD: + break; + case AEN_FW_INIT_FAIL: + case AEN_SYS_ERR: + case MB_CMD_STS_ERR: + ql_queue_fw_error(qdev); + default: + /* Clear the MPI firmware status. 
*/ + ql_write32(qdev, CSR, CSR_CMD_CLR_R2PCI_INT); + break; + } + } + ql_enable_completion_interrupt(qdev, 0); +} + +void ql_mpi_reset_work(struct work_struct *work) +{ + struct ql_adapter *qdev = + container_of(work, struct ql_adapter, mpi_reset_work.work); + QPRINTK(qdev, DRV, ERR, + "Enter, qdev = %p..\n", qdev); + ql_write32(qdev, CSR, CSR_CMD_SET_RST); + msleep(50); + ql_write32(qdev, CSR, CSR_CMD_CLR_RST); +} -- cgit v1.2.3-70-g09d2 From 953f551756a1275d9bfdbb70697323449305161a Mon Sep 17 00:00:00 2001 From: Remi Denis-Courmont Date: Mon, 22 Sep 2008 20:09:46 -0700 Subject: Phonet: kernel documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- Documentation/networking/phonet.txt | 111 ++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 Documentation/networking/phonet.txt (limited to 'Documentation') diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt new file mode 100644 index 00000000000..f3c72e0ca8d --- /dev/null +++ b/Documentation/networking/phonet.txt @@ -0,0 +1,111 @@ +Linux Phonet protocol family +============================ + +Introduction +------------ + +Phonet is a packet protocol used by Nokia cellular modems for both IPC +and RPC. With the Linux Phonet socket family, Linux host processes can +receive and send messages from/to the modem, or any other external +device attached to the modem. The modem takes care of routing. + +Phonet packets can be exchanged through various hardware connections +depending on the device, such as: + - USB with the CDC Phonet interface, + - infrared, + - Bluetooth, + - an RS232 serial port (with a dedicated "FBUS" line discipline), + - the SSI bus with some TI OMAP processors. + + +Packets format +-------------- + +Phonet packet have a common header as follow: + + struct phonethdr { + uint8_t pn_media; /* Media type (link-layer identifier) */ + uint8_t pn_rdev; /* Receiver device ID */ + uint8_t pn_sdev; /* Sender device ID */ + uint8_t pn_res; /* Resource ID or function */ + uint16_t pn_length; /* Big-endian message byte length (minus 6) */ + uint8_t pn_robj; /* Receiver object ID */ + uint8_t pn_sobj; /* Sender object ID */ + }; + +The device ID is split: the 6 higher order bits consitutes the device +address, while the 2 lower order bits are used for multiplexing, as are +the 8-bits object identifiers. As such, Phonet can be considered as a +network layer with 6 bits of address space and 10 bits for transport +protocol (much like port numbers in IP world). + +The modem always has address number zero. Each other device has a its +own 6-bits address. + + +Link layer +---------- + +Phonet links are always point-to-point links. The link layer header +consists of a single Phonet media type byte. It uniquely identifies the +link through which the packet is transmitted, from the modem's +perspective. + +Linux Phonet network interfaces use a dedicated link layer type +(ETH_P_PHONET) which is out of the Ethernet type range. They can only +send and receive Phonet packets. + +Note that Phonet interfaces are not allowed to re-order packets, so +only the (default) Linux FIFO qdisc should be used with them. 
+ + +Network layer +------------- + +The Phonet socket address family maps the Phonet packet header: + + struct sockaddr_pn { + sa_family_t spn_family; /* AF_PHONET */ + uint8_t spn_obj; /* Object ID */ + uint8_t spn_dev; /* Device ID */ + uint8_t spn_resource; /* Resource or function */ + uint8_t spn_zero[...]; /* Padding */ + }; + +The resource field is only used when sending and receiving; +It is ignored by bind() and getsockname(). + + +Low-level datagram protocol +--------------------------- + +Applications can send Phonet messages using the Phonet datagram socket +protocol from the PF_PHONET family. Each socket is bound to one of the +2^10 object IDs available, and can send and receive packets with any +other peer. + + struct sockaddr_pn addr = { .spn_family = AF_PHONET, }; + ssize_t len; + socklen_t addrlen = sizeof(addr); + int fd; + + fd = socket(PF_PHONET, SOCK_DGRAM, 0); + bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + /* ... */ + + sendto(fd, msg, msglen, 0, (struct sockaddr *)&addr, sizeof(addr)); + len = recvfrom(fd, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &addrlen); + +This protocol follows the SOCK_DGRAM connection-less semantics. +However, connect() and getpeername() are not supported, as they did +not seem useful with Phonet usages (could be added easily). + + +Authors +------- + +Linux Phonet was initially written by Sakari Ailus. +Other contributors include Mikä Liljeberg, Andras Domokos, +Carlos Chinea and Rémi Denis-Courmont. +Copyright (C) 2008 Nokia Corporation. -- cgit v1.2.3-70-g09d2 From e5d2304802a63304a54cff010209c4a717a2509f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 23 Sep 2008 14:53:14 -0700 Subject: can: Add documentation for virtual CAN driver usage This patch adds a usage documentation for the virtual CAN driver (vcan). Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- Documentation/networking/can.txt | 44 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt index 297ba7b1cca..2035bc4932f 100644 --- a/Documentation/networking/can.txt +++ b/Documentation/networking/can.txt @@ -35,8 +35,9 @@ This file contains 6.1 general settings 6.2 local loopback of sent frames 6.3 CAN controller hardware filters - 6.4 currently supported CAN hardware - 6.5 todo + 6.4 The virtual CAN driver (vcan) + 6.5 currently supported CAN hardware + 6.6 todo 7 Credits @@ -584,7 +585,42 @@ solution for a couple of reasons: @133MHz with four SJA1000 CAN controllers from 2002 under heavy bus load without any problems ... - 6.4 currently supported CAN hardware (September 2007) + 6.4 The virtual CAN driver (vcan) + + Similar to the network loopback devices, vcan offers a virtual local + CAN interface. A full qualified address on CAN consists of + + - a unique CAN Identifier (CAN ID) + - the CAN bus this CAN ID is transmitted on (e.g. can0) + + so in common use cases more than one virtual CAN interface is needed. + + The virtual CAN interfaces allow the transmission and reception of CAN + frames without real CAN controller hardware. Virtual CAN network + devices are usually named 'vcanX', like vcan0 vcan1 vcan2 ... + When compiled as a module the virtual CAN driver module is called vcan.ko + + Since Linux Kernel version 2.6.24 the vcan driver supports the Kernel + netlink interface to create vcan network devices. 
The creation and + removal of vcan network devices can be managed with the ip(8) tool: + + - Create a virtual CAN network interface: + ip link add type vcan + + - Create a virtual CAN network interface with a specific name 'vcan42': + ip link add dev vcan42 type vcan + + - Remove a (virtual CAN) network interface 'vcan42': + ip link del vcan42 + + The tool 'vcan' from the SocketCAN SVN repository on BerliOS is obsolete. + + Virtual CAN network device creation in older Kernels: + In Linux Kernel versions < 2.6.24 the vcan driver creates 4 vcan + netdevices at module load time by default. This value can be changed + with the module parameter 'numdev'. E.g. 'modprobe vcan numdev=8' + + 6.5 currently supported CAN hardware On the project website http://developer.berlios.de/projects/socketcan there are different drivers available: @@ -603,7 +639,7 @@ solution for a couple of reasons: Please check the Mailing Lists on the berlios OSS project website. - 6.5 todo (September 2007) + 6.6 todo The configuration interface for CAN network drivers is still an open issue that has not been finalized in the socketcan project. Also the -- cgit v1.2.3-70-g09d2 From ac2dc8ca14fb9028b160d89fdef04ecc66add3a2 Mon Sep 17 00:00:00 2001 From: Rémi Denis-Courmont Date: Tue, 30 Sep 2008 02:52:01 -0700 Subject: Phonet: improve documentation Fix grammar errors spotted by Randy Dunlap, and adds some more details. Signed-off-by: Remi Denis-Courmont Signed-off-by: David S. Miller --- Documentation/networking/phonet.txt | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt index f3c72e0ca8d..57d3e59edb1 100644 --- a/Documentation/networking/phonet.txt +++ b/Documentation/networking/phonet.txt @@ -21,7 +21,7 @@ depending on the device, such as: Packets format -------------- -Phonet packet have a common header as follow: +Phonet packets have a common header as follows: struct phonethdr { uint8_t pn_media; /* Media type (link-layer identifier) */ @@ -33,14 +33,17 @@ Phonet packet have a common header as follow: uint8_t pn_sobj; /* Sender object ID */ }; -The device ID is split: the 6 higher order bits consitutes the device -address, while the 2 lower order bits are used for multiplexing, as are -the 8-bits object identifiers. As such, Phonet can be considered as a +On Linux, the link-layer header includes the pn_media byte (see below). +The next 7 bytes are part of the network-layer header. + +The device ID is split: the 6 higher-order bits consitute the device +address, while the 2 lower-order bits are used for multiplexing, as are +the 8-bit object identifiers. As such, Phonet can be considered as a network layer with 6 bits of address space and 10 bits for transport protocol (much like port numbers in IP world). -The modem always has address number zero. Each other device has a its -own 6-bits address. +The modem always has address number zero. All other device have a their +own 6-bit address. Link layer @@ -49,11 +52,18 @@ Link layer Phonet links are always point-to-point links. The link layer header consists of a single Phonet media type byte. It uniquely identifies the link through which the packet is transmitted, from the modem's -perspective. - -Linux Phonet network interfaces use a dedicated link layer type -(ETH_P_PHONET) which is out of the Ethernet type range. They can only -send and receive Phonet packets. +perspective. 
Each Phonet network device shall prepend and set the media +type byte as appropriate. For convenience, a common phonet_header_ops +link-layer header operations structure is provided. It sets the +media type according to the network device hardware address. + +Linux Phonet network interfaces support a dedicated link layer packets +type (ETH_P_PHONET) which is out of the Ethernet type range. They can +only send and receive Phonet packets. + +The virtual TUN tunnel device driver can also be used for Phonet. This +requires IFF_TUN mode, _without_ the IFF_NO_PI flag. In this case, +there is no link-layer header, so there is no Phonet media type byte. Note that Phonet interfaces are not allowed to re-order packets, so only the (default) Linux FIFO qdisc should be used with them. -- cgit v1.2.3-70-g09d2 From 95430c0b140c31cb9e39f876afe1c0e9947d1aaf Mon Sep 17 00:00:00 2001 From: Rémi Denis-Courmont Date: Sun, 5 Oct 2008 11:16:36 -0700 Subject: Phonet: pipe end-point protocol documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- Documentation/networking/phonet.txt | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'Documentation') diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt index 57d3e59edb1..0e6e592f4f5 100644 --- a/Documentation/networking/phonet.txt +++ b/Documentation/networking/phonet.txt @@ -112,6 +112,60 @@ However, connect() and getpeername() are not supported, as they did not seem useful with Phonet usages (could be added easily). +Phonet Pipe protocol +-------------------- + +The Phonet Pipe protocol is a simple sequenced packets protocol +with end-to-end congestion control. It uses the passive listening +socket paradigm. The listening socket is bound to an unique free object +ID. Each listening socket can handle up to 255 simultaneous +connections, one per accept()'d socket. + + int lfd, cfd; + + lfd = socket(PF_PHONET, SOCK_SEQPACKET, PN_PROTO_PIPE); + listen (lfd, INT_MAX); + + /* ... */ + cfd = accept(lfd, NULL, NULL); + for (;;) + { + char buf[...]; + ssize_t len = read(cfd, buf, sizeof(buf)); + + /* ... */ + + write(cfd, msg, msglen); + } + +Connections are established between two endpoints by a "third party" +application. This means that both endpoints are passive; so connect() +is not possible. + +WARNING: +When polling a connected pipe socket for writability, there is an +intrinsic race condition whereby writability might be lost between the +polling and the writing system calls. In this case, the socket will +block until write because possible again, unless non-blocking mode +becomes enabled. + + +The pipe protocol provides two socket options at the SOL_PNPIPE level: + + PNPIPE_ENCAP accepts one integer value (int) of: + + PNPIPE_ENCAP_NONE: The socket operates normally (default). + + PNPIPE_ENCAP_IP: The socket is used as a backend for a virtual IP + interface. This requires CAP_NET_ADMIN capability. GPRS data + support on Nokia modems can use this. Note that the socket cannot + be reliably poll()'d or read() from while in this mode. + + PNPIPE_IFINDEX is a read-only integer value. It contains the + interface index of the network interface created by PNPIPE_ENCAP, + or zero if encapsulation is off. 
+ + Authors ------- -- cgit v1.2.3-70-g09d2 From 079aa88fe7172b7650c7cf2c0bc01662bafea236 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: xt_recent: IPv6 support This updates xt_recent to support the IPv6 address family. The new /proc/net/xt_recent directory must be used for this. The old proc interface can also be configured out. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- Documentation/feature-removal-schedule.txt | 3 + net/netfilter/Kconfig | 7 + net/netfilter/xt_recent.c | 300 +++++++++++++++++++++++------ 3 files changed, 256 insertions(+), 54 deletions(-) (limited to 'Documentation') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index d0f22fac55d..3d2d0c29f02 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -250,6 +250,9 @@ What (Why): - xt_mark match revision 0 (superseded by xt_mark match revision 1) + - xt_recent: the old ipt_recent proc dir + (superseded by /proc/net/xt_recent) + When: January 2009 or Linux 2.7.0, whichever comes first Why: Superseded by newer revisions or modules Who: Jan Engelhardt diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ccc78b07a1a..4a464857f21 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -743,6 +743,13 @@ config NETFILTER_XT_MATCH_RECENT Short options are available by using 'iptables -m recent -h' Official Website: +config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + bool 'Enable obsolete /proc/net/ipt_recent' + depends on NETFILTER_XT_MATCH_RECENT && PROC_FS + ---help--- + This option enables the old /proc/net/ipt_recent interface, + which has been obsoleted by /proc/net/xt_recent. + config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 422c0e4d66b..adc2e2f1b09 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2006 Patrick McHardy + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,6 +14,8 @@ */ #include #include +#include +#include #include #include #include @@ -30,9 +33,11 @@ #include MODULE_AUTHOR("Patrick McHardy "); +MODULE_AUTHOR("Jan Engelhardt "); MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_recent"); +MODULE_ALIAS("ip6t_recent"); static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; @@ -49,14 +54,15 @@ module_param(ip_list_gid, uint, 0400); MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 
255)"); MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files"); struct recent_entry { struct list_head list; struct list_head lru_list; - __be32 addr; + union nf_inet_addr addr; + u_int16_t family; u_int8_t ttl; u_int8_t index; u_int16_t nstamps; @@ -67,7 +73,7 @@ struct recent_table { struct list_head list; char name[XT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc; + struct proc_dir_entry *proc_old, *proc; #endif unsigned int refcnt; unsigned int entries; @@ -80,31 +86,53 @@ static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; -static const struct file_operations recent_fops; +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT +static struct proc_dir_entry *proc_old_dir; +#endif +static struct proc_dir_entry *recent_proc_dir; +static const struct file_operations recent_old_fops, recent_mt_fops; #endif static u_int32_t hash_rnd; -static int hash_rnd_initted; +static bool hash_rnd_initted; + +static unsigned int recent_entry_hash4(const union nf_inet_addr *addr) +{ + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_initted = true; + } + return jhash_1word((__force u32)addr->ip, hash_rnd) & + (ip_list_hash_size - 1); +} -static unsigned int recent_entry_hash(__be32 addr) +static unsigned int recent_entry_hash6(const union nf_inet_addr *addr) { if (!hash_rnd_initted) { - get_random_bytes(&hash_rnd, 4); - hash_rnd_initted = 1; + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_initted = true; } - return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & + (ip_list_hash_size - 1); } static struct recent_entry * -recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) +recent_entry_lookup(const struct recent_table *table, + const union nf_inet_addr *addrp, u_int16_t family, + u_int8_t ttl) { struct recent_entry *e; unsigned int h; - h = recent_entry_hash(addr); + if (family == AF_INET) + h = recent_entry_hash4(addrp); + else + h = recent_entry_hash6(addrp); + list_for_each_entry(e, &table->iphash[h], list) - if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + if (e->family == family && + memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && + (ttl == e->ttl || ttl == 0 || e->ttl == 0)) return e; return NULL; } @@ -118,7 +146,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) } static struct recent_entry * -recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) +recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, + u_int16_t family, u_int8_t ttl) { struct recent_entry *e; @@ -130,12 +159,16 @@ recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) GFP_ATOMIC); if (e == NULL) return NULL; - e->addr = addr; + memcpy(&e->addr, addr, sizeof(e->addr)); e->ttl = ttl; e->stamps[0] = jiffies; e->nstamps = 1; e->index = 1; - list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); 
+ e->family = family; + if (family == AF_INET) + list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); + else + list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); list_add_tail(&e->lru_list, &t->lru_list); t->entries++; return e; @@ -179,28 +212,42 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, const struct xt_recent_mtinfo *info = matchinfo; struct recent_table *t; struct recent_entry *e; - __be32 addr; + union nf_inet_addr addr = {}; u_int8_t ttl; bool ret = info->invert; - if (info->side == XT_RECENT_DEST) - addr = ip_hdr(skb)->daddr; - else - addr = ip_hdr(skb)->saddr; + if (match->family == AF_INET) { + const struct iphdr *iph = ip_hdr(skb); + + if (info->side == XT_RECENT_DEST) + addr.ip = iph->daddr; + else + addr.ip = iph->saddr; + + ttl = iph->ttl; + } else { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (info->side == XT_RECENT_DEST) + memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); + else + memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); + + ttl = iph->hop_limit; + } - ttl = ip_hdr(skb)->ttl; /* use TTL as seen before forwarding */ if (out && !skb->sk) ttl++; spin_lock_bh(&recent_lock); t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, addr, - info->check_set & XT_RECENT_TTL ? ttl : 0); + e = recent_entry_lookup(t, &addr, match->family, + (info->check_set & XT_RECENT_TTL) ? ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; - e = recent_entry_init(t, addr, ttl); + e = recent_entry_init(t, &addr, match->family, ttl); if (e == NULL) *hotdrop = true; ret = !ret; @@ -277,11 +324,24 @@ recent_mt_check(const char *tablename, const void *ip, for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS - t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); + t->proc = proc_create(t->name, ip_list_perms, recent_proc_dir, + &recent_mt_fops); if (t->proc == NULL) { kfree(t); goto out; } +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + t->proc_old = proc_create(t->name, ip_list_perms, proc_old_dir, + &recent_old_fops); + if (t->proc_old == NULL) { + remove_proc_entry(t->name, proc_old_dir); + kfree(t); + goto out; + } + t->proc_old->uid = ip_list_uid; + t->proc_old->gid = ip_list_gid; + t->proc_old->data = t; +#endif t->proc->uid = ip_list_uid; t->proc->gid = ip_list_gid; t->proc->data = t; @@ -307,7 +367,10 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) list_del(&t->list); spin_unlock_bh(&recent_lock); #ifdef CONFIG_PROC_FS - remove_proc_entry(t->name, proc_dir); +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + remove_proc_entry(t->name, proc_old_dir); +#endif + remove_proc_entry(t->name, recent_proc_dir); #endif recent_table_flush(t); kfree(t); @@ -317,7 +380,7 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) #ifdef CONFIG_PROC_FS struct recent_iter_state { - struct recent_table *table; + const struct recent_table *table; unsigned int bucket; }; @@ -342,8 +405,8 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct recent_iter_state *st = seq->private; const struct recent_table *t = st->table; - struct recent_entry *e = v; - struct list_head *head = e->list.next; + const struct recent_entry *e = v; + const struct list_head *head = e->list.next; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) @@ -366,8 +429,14 @@ static int recent_seq_show(struct seq_file *seq, void *v) unsigned int i; i = (e->index - 1) % 
ip_pkt_list_tot; - seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", - NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + if (e->family == AF_INET) + seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu " + "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl, + e->stamps[i], e->index); + else + seq_printf(seq, "src=" NIP6_FMT " ttl: %u last_seen: %lu " + "oldest_pkt: %u", NIP6(e->addr.in6), e->ttl, + e->stamps[i], e->index); for (i = 0; i < e->nstamps; i++) seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); seq_printf(seq, "\n"); @@ -394,8 +463,22 @@ static int recent_seq_open(struct inode *inode, struct file *file) return 0; } -static ssize_t recent_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *loff) +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT +static int recent_old_seq_open(struct inode *inode, struct file *filp) +{ + static bool warned_of_old; + + if (unlikely(!warned_of_old)) { + printk(KERN_INFO KBUILD_MODNAME ": Use of /proc/net/ipt_recent" + " is deprecated; use /proc/net/xt_recent.\n"); + warned_of_old = true; + } + return recent_seq_open(inode, filp); +} + +static ssize_t recent_old_proc_write(struct file *file, + const char __user *input, + size_t size, loff_t *loff) { const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); struct recent_table *t = pde->data; @@ -408,6 +491,7 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, size = sizeof(buf); if (copy_from_user(buf, input, size)) return -EFAULT; + while (isspace(*c)) c++; @@ -435,10 +519,10 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, addr = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, addr, 0); + e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0); if (e == NULL) { if (add) - recent_entry_init(t, addr, 0); + recent_entry_init(t, (const void *)&addr, PF_INET, 0); } else { if (add) recent_entry_update(t, e); @@ -449,23 +533,118 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, return size; } -static const struct file_operations recent_fops = { - .open = recent_seq_open, +static const struct file_operations recent_old_fops = { + .open = recent_old_seq_open, .read = seq_read, - .write = recent_proc_write, + .write = recent_old_proc_write, .release = seq_release_private, .owner = THIS_MODULE, }; +#endif + +static ssize_t +recent_mt_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + const char *c = buf; + union nf_inet_addr addr; + u_int16_t family; + bool add, succ; + + if (size == 0) + return 0; + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + + /* Strict protocol! 
*/ + if (*loff != 0) + return -ESPIPE; + switch (*c) { + case '/': /* flush table */ + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return size; + case '-': /* remove address */ + add = false; + break; + case '+': /* add address */ + add = true; + break; + default: + printk(KERN_INFO KBUILD_MODNAME ": Need +ip, -ip or /\n"); + return -EINVAL; + } + + ++c; + --size; + if (strnchr(c, size, ':') != NULL) { + family = AF_INET6; + succ = in6_pton(c, size, (void *)&addr, '\n', NULL); + } else { + family = AF_INET; + succ = in4_pton(c, size, (void *)&addr, '\n', NULL); + } + + if (!succ) { + printk(KERN_INFO KBUILD_MODNAME ": illegal address written " + "to procfs\n"); + return -EINVAL; + } + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, &addr, family, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, &addr, family, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + /* Note we removed one above */ + *loff += size + 1; + return size + 1; +} + +static const struct file_operations recent_mt_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_mt_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ -static struct xt_match recent_mt_reg __read_mostly = { - .name = "recent", - .family = AF_INET, - .match = recent_mt, - .matchsize = sizeof(struct xt_recent_mtinfo), - .checkentry = recent_mt_check, - .destroy = recent_mt_destroy, - .me = THIS_MODULE, +static struct xt_match recent_mt_reg[] __read_mostly = { + { + .name = "recent", + .revision = 0, + .family = AF_INET, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 0, + .family = AF_INET6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init recent_mt_init(void) @@ -476,15 +655,25 @@ static int __init recent_mt_init(void) return -EINVAL; ip_list_hash_size = 1 << fls(ip_list_tot); - err = xt_register_match(&recent_mt_reg); + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); #ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { - xt_unregister_match(&recent_mt_reg); + recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net); + if (recent_proc_dir == NULL) { + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + err = -ENOMEM; + } +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + if (err < 0) + return err; + proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net); + if (proc_old_dir == NULL) { + remove_proc_entry("xt_recent", init_net.proc_net); + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); err = -ENOMEM; } +#endif #endif return err; } @@ -492,9 +681,12 @@ static int __init recent_mt_init(void) static void __exit recent_mt_exit(void) { BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_mt_reg); + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); #ifdef CONFIG_PROC_FS +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT remove_proc_entry("ipt_recent", init_net.proc_net); +#endif + remove_proc_entry("xt_recent", init_net.proc_net); #endif } -- cgit v1.2.3-70-g09d2 From d2f26037a38ada4a5d40d1cf0b32bc5289f50312 Mon Sep 
17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 8 Oct 2008 11:35:12 +0200 Subject: netfilter: Add documentation for tproxy Add basic usage instructions to Documentation/networking. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- Documentation/networking/tproxy.txt | 85 +++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 Documentation/networking/tproxy.txt (limited to 'Documentation') diff --git a/Documentation/networking/tproxy.txt b/Documentation/networking/tproxy.txt new file mode 100644 index 00000000000..7b5996d9357 --- /dev/null +++ b/Documentation/networking/tproxy.txt @@ -0,0 +1,85 @@ +Transparent proxy support +========================= + +This feature adds Linux 2.2-like transparent proxy support to current kernels. +To use it, enable NETFILTER_TPROXY, the socket match and the TPROXY target in +your kernel config. You will need policy routing too, so be sure to enable that +as well. + + +1. Making non-local sockets work +================================ + +The idea is that you identify packets with destination address matching a local +socket on your box, set the packet mark to a certain value, and then match on that +value using policy routing to have those packets delivered locally: + +# iptables -t mangle -N DIVERT +# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT +# iptables -t mangle -A DIVERT -j MARK --set-mark 1 +# iptables -t mangle -A DIVERT -j ACCEPT + +# ip rule add fwmark 1 lookup 100 +# ip route add local 0.0.0.0/0 dev lo table 100 + +Because of certain restrictions in the IPv4 routing output code you'll have to +modify your application to allow it to send datagrams _from_ non-local IP +addresses. All you have to do is enable the (SOL_IP, IP_TRANSPARENT) socket +option before calling bind: + +fd = socket(AF_INET, SOCK_STREAM, 0); +/* - 8< -*/ +int value = 1; +setsockopt(fd, SOL_IP, IP_TRANSPARENT, &value, sizeof(value)); +/* - 8< -*/ +name.sin_family = AF_INET; +name.sin_port = htons(0xCAFE); +name.sin_addr.s_addr = htonl(0xDEADBEEF); +bind(fd, &name, sizeof(name)); + +A trivial patch for netcat is available here: +http://people.netfilter.org/hidden/tproxy/netcat-ip_transparent-support.patch + + +2. Redirecting traffic +====================== + +Transparent proxying often involves "intercepting" traffic on a router. This is +usually done with the iptables REDIRECT target; however, there are serious +limitations of that method. One of the major issues is that it actually +modifies the packets to change the destination address -- which might not be +acceptable in certain situations. (Think of proxying UDP for example: you won't +be able to find out the original destination address. Even in case of TCP +getting the original destination address is racy.) + +The 'TPROXY' target provides similar functionality without relying on NAT. Simply +add rules like this to the iptables ruleset above: + +# iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ + --tproxy-mark 0x1/0x1 --on-port 50080 + +Note that for this to work you'll have to modify the proxy to enable (SOL_IP, +IP_TRANSPARENT) for the listening socket. + + +3. Iptables extensions +====================== + +To use tproxy you'll need to have the 'socket' and 'TPROXY' modules +compiled for iptables. A patched version of iptables is available +here: http://git.balabit.hu/?p=bazsi/iptables-tproxy.git + + +4. Application support +====================== + +4.1. Squid +---------- + +Squid 3.HEAD has support built-in. 
To use it, pass +'--enable-linux-netfilter' to configure and set the 'tproxy' option on +the HTTP listener you redirect traffic to with the TPROXY iptables +target. + +For more information please consult the following page on the Squid +wiki: http://wiki.squid-cache.org/Features/Tproxy4 -- cgit v1.2.3-70-g09d2
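
As a complement to the TPROXY documentation in the patch above, here is a minimal user-space sketch of the receiving side: a TCP listener bound to the --on-port value from the example rule (50080), with IP_TRANSPARENT enabled so intercepted connections to non-local addresses can be accepted, and getsockname() on the accepted socket used to recover the client's original destination. This sketch is not part of the patch; the port number, the error handling, and the fallback IP_TRANSPARENT definition are illustrative assumptions.

/*
 * Minimal transparent TCP listener, intended to sit behind the iptables
 * TPROXY rule shown in the documentation above (assumed --on-port 50080).
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19	/* fallback for older libc headers */
#endif

int main(void)
{
	int lfd, cfd, one = 1;
	struct sockaddr_in sa, orig_dst;
	socklen_t len = sizeof(orig_dst);
	char dst[INET_ADDRSTRLEN];

	lfd = socket(AF_INET, SOCK_STREAM, 0);
	if (lfd < 0) {
		perror("socket");
		return 1;
	}
	/* IP_TRANSPARENT requires CAP_NET_ADMIN. */
	if (setsockopt(lfd, SOL_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0) {
		perror("setsockopt(IP_TRANSPARENT)");
		return 1;
	}

	memset(&sa, 0, sizeof(sa));
	sa.sin_family = AF_INET;
	sa.sin_port = htons(50080);		/* must match --on-port */
	sa.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(lfd, (struct sockaddr *)&sa, sizeof(sa)) < 0 ||
	    listen(lfd, 16) < 0) {
		perror("bind/listen");
		return 1;
	}

	cfd = accept(lfd, NULL, NULL);
	if (cfd < 0) {
		perror("accept");
		return 1;
	}
	/*
	 * With TPROXY the accepted socket is bound to the connection's
	 * original (possibly non-local) destination, so getsockname()
	 * reports where the client was actually trying to connect.
	 */
	if (getsockname(cfd, (struct sockaddr *)&orig_dst, &len) == 0)
		printf("original destination: %s:%u\n",
		       inet_ntop(AF_INET, &orig_dst.sin_addr, dst, sizeof(dst)),
		       ntohs(orig_dst.sin_port));

	close(cfd);
	close(lfd);
	return 0;
}

Run as root (or with CAP_NET_ADMIN) behind the DIVERT and TPROXY rules quoted in the patch; connections diverted by the TPROXY rule are then accepted on port 50080 and report their original destination address.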