42 files changed, 1863 insertions, 920 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 8af508536d3..3a8c8fd63c8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -628,7 +628,7 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
 	netdev_features_t features)
 {
 	struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
-	u32 old_features = features;
+	netdev_features_t old_features = features;
 
 	features &= real_dev->vlan_features;
 	features |= NETIF_F_RXCSUM;
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 8e15d966d9b..239992021b1 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -837,6 +837,19 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 
 	dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst);
 	if (dat_entry) {
+		/* If the ARP request is destined for a local client the local
+		 * client will answer itself. DAT would only generate a
+		 * duplicate packet.
+		 *
+		 * Moreover, if the soft-interface is enslaved into a bridge, an
+		 * additional DAT answer may trigger kernel warnings about
+		 * a packet coming from the wrong port.
+		 */
+		if (batadv_is_my_client(bat_priv, dat_entry->mac_addr)) {
+			ret = true;
+			goto out;
+		}
+
 		skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
 				     bat_priv->soft_iface, ip_dst, hw_src,
 				     dat_entry->mac_addr, hw_src);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 3e30a0f1b90..1240f07ad31 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -163,14 +163,22 @@ void batadv_mesh_free(struct net_device *soft_iface)
 	batadv_vis_quit(bat_priv);
 
 	batadv_gw_node_purge(bat_priv);
-	batadv_originator_free(bat_priv);
 	batadv_nc_free(bat_priv);
+	batadv_dat_free(bat_priv);
+	batadv_bla_free(bat_priv);
 
+	/* Free the TT and the originator tables only after having terminated
+	 * all the other depending components which may use these structures for
+	 * their purposes.
+	 */
 	batadv_tt_free(bat_priv);
 
-	batadv_bla_free(bat_priv);
-
-	batadv_dat_free(bat_priv);
+	/* Since the originator table clean up routine is accessing the TT
+	 * tables as well, it has to be invoked after the TT tables have been
+	 * freed and marked as empty. This ensures that no cleanup RCU callbacks
+	 * accessing the TT data are scheduled for later execution.
+	 */
+	batadv_originator_free(bat_priv);
 
 	free_percpu(bat_priv->bat_counters);
 
@@ -475,7 +483,7 @@ static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
 	char *algo_name = (char *)val;
 	size_t name_len = strlen(algo_name);
 
-	if (algo_name[name_len - 1] == '\n')
+	if (name_len > 0 && algo_name[name_len - 1] == '\n')
 		algo_name[name_len - 1] = '\0';
 
 	bat_algo_ops = batadv_algo_get(algo_name);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index f7c54305a91..e84629ece9b 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1514,6 +1514,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
 	struct ethhdr *ethhdr, ethhdr_tmp;
 	uint8_t *orig_dest, ttl, ttvn;
 	unsigned int coding_len;
+	int err;
 
 	/* Save headers temporarily */
 	memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
@@ -1568,8 +1569,11 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
 			 coding_len);
 
 	/* Resize decoded skb if decoded with larger packet */
-	if (nc_packet->skb->len > coding_len + h_size)
-		pskb_trim_rcsum(skb, coding_len + h_size);
+	if (nc_packet->skb->len > coding_len + h_size) {
+		err = pskb_trim_rcsum(skb, coding_len + h_size);
+		if (err)
+			return NULL;
+	}
 
 	/* Create decoded unicast packet */
 	unicast_packet = (struct batadv_unicast_packet *)skb->data;
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index c3530a81a33..950663d4d33 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -107,7 +107,7 @@ static void br_tcn_timer_expired(unsigned long arg)
 
 	br_debug(br, "tcn timer expired\n");
 	spin_lock(&br->lock);
-	if (br->dev->flags & IFF_UP) {
+	if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) {
 		br_transmit_tcn(br);
 
 		mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time);
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index e87ef435e11..958d9856912 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	crypto.o armor.o \
 	auth_x.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o \
-	pagevec.o
+	pagevec.o snapshot.o
 
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index b4bf4ac090f..6b923bcaa2a 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
 	if (!ac)
 		goto out;
 
+	mutex_init(&ac->mutex);
 	ac->negotiating = true;
 	if (name)
 		ac->name = name;
@@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac)
  */
 void ceph_auth_reset(struct ceph_auth_client *ac)
 {
+	mutex_lock(&ac->mutex);
 	dout("auth_reset %p\n", ac);
 	if (ac->ops && !ac->negotiating)
 		ac->ops->reset(ac);
 	ac->negotiating = true;
+	mutex_unlock(&ac->mutex);
 }
 
 int ceph_entity_name_encode(const char *name, void **p, void *end)
@@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
 	int i, num;
 	int ret;
 
+	mutex_lock(&ac->mutex);
 	dout("auth_build_hello\n");
 	monhdr->have_version = 0;
 	monhdr->session_mon = cpu_to_le16(-1);
@@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
 
 	ret = ceph_entity_name_encode(ac->name, &p, end);
 	if (ret < 0)
-		return ret;
+		goto out;
 	ceph_decode_need(&p, end, sizeof(u64), bad);
 	ceph_encode_64(&p, ac->global_id);
 
 	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
-	return p - buf;
+	ret = p - buf;
+out:
+	mutex_unlock(&ac->mutex);
+	return ret;
 
 bad:
-	return -ERANGE;
+	ret = -ERANGE;
+	goto out;
 }
 
 static int ceph_build_auth_request(struct ceph_auth_client *ac,
@@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac,
 	if (ret < 0) {
 		pr_err("error %d building auth method %s request\n", ret,
 		       ac->ops->name);
-		return ret;
+		goto out;
 	}
 	dout(" built request %d bytes\n", ret);
 	ceph_encode_32(&p, ret);
-	return p + ret - msg_buf;
+	ret = p + ret - msg_buf;
+out:
+	return ret;
 }
 
 /*
@@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 	int result_msg_len;
 	int ret = -EINVAL;
 
+	mutex_lock(&ac->mutex);
 	dout("handle_auth_reply %p %p\n", p, end);
 	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
 	protocol = ceph_decode_32(&p);
@@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
 	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
 	if (ret == -EAGAIN) {
-		return ceph_build_auth_request(ac, reply_buf, reply_len);
+		ret = ceph_build_auth_request(ac, reply_buf, reply_len);
 	} else if (ret) {
 		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-		return ret;
 	}
-	return 0;
 
-bad:
-	pr_err("failed to decode auth msg\n");
 out:
+	mutex_unlock(&ac->mutex);
 	return ret;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+	ret = -EINVAL;
+	goto out;
 }
 
 int ceph_build_auth(struct ceph_auth_client *ac,
 		    void *msg_buf, size_t msg_len)
 {
+	int ret = 0;
+
+	mutex_lock(&ac->mutex);
 	if (!ac->protocol)
-		return ceph_auth_build_hello(ac, msg_buf, msg_len);
-	BUG_ON(!ac->ops);
-	if (ac->ops->should_authenticate(ac))
-		return ceph_build_auth_request(ac, msg_buf, msg_len);
-	return 0;
+		ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
+	else if (ac->ops->should_authenticate(ac))
+		ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+	mutex_unlock(&ac->mutex);
+	return ret;
 }
 
 int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
 {
-	if (!ac->ops)
-		return 0;
-	return ac->ops->is_authenticated(ac);
+	int ret = 0;
+
+	mutex_lock(&ac->mutex);
+	if (ac->ops)
+		ret = ac->ops->is_authenticated(ac);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_auth_is_authenticated);
+
+int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
+				int peer_type,
+				struct ceph_auth_handshake *auth)
+{
+	int ret = 0;
+
+	mutex_lock(&ac->mutex);
+	if (ac->ops && ac->ops->create_authorizer)
+		ret = ac->ops->create_authorizer(ac, peer_type, auth);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_auth_create_authorizer);
+
+void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
+				  struct ceph_authorizer *a)
+{
+	mutex_lock(&ac->mutex);
+	if (ac->ops && ac->ops->destroy_authorizer)
+		ac->ops->destroy_authorizer(ac, a);
+	mutex_unlock(&ac->mutex);
+}
+EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
+
+int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
+				int peer_type,
+				struct ceph_auth_handshake *a)
+{
+	int ret = 0;
+
+	mutex_lock(&ac->mutex);
+	if (ac->ops && ac->ops->update_authorizer)
+		ret = ac->ops->update_authorizer(ac, peer_type, a);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_auth_update_authorizer);
+
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a, size_t len)
+{
+	int ret = 0;
+
+	mutex_lock(&ac->mutex);
+	if (ac->ops && ac->ops->verify_authorizer_reply)
+		ret = ac->ops->verify_authorizer_reply(ac, a, len);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
+
+void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
+{
+	mutex_lock(&ac->mutex);
+	if (ac->ops && ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, peer_type);
+	mutex_unlock(&ac->mutex);
 }
+EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a16bf14eb02..96238ba95f2 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 			return -ENOMEM;
 	}
 	au->service = th->service;
+	au->secret_id = th->secret_id;
 
 	msg_a = au->buf->vec.iov_base;
 	msg_a->struct_v = 1;
@@ -555,6 +556,26 @@ static int ceph_x_create_authorizer(
 	return 0;
 }
 
+static int ceph_x_update_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_auth_handshake *auth)
+{
+	struct ceph_x_authorizer *au;
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	au = (struct ceph_x_authorizer *)auth->authorizer;
+	if (au->secret_id < th->secret_id) {
+		dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
+		     au->service, au->secret_id, th->secret_id);
+		return ceph_x_build_authorizer(ac, th, au);
+	}
+	return 0;
+}
+
 static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
 					  struct ceph_authorizer *a, size_t len)
 {
@@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 
 	th = get_ticket_handler(ac, peer_type);
 	if (!IS_ERR(th))
-		remove_ticket_handler(ac, th);
+		memset(&th->validity, 0, sizeof(th->validity));
 }
 
 
@@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
 	.build_request = ceph_x_build_request,
 	.handle_reply = ceph_x_handle_reply,
 	.create_authorizer = ceph_x_create_authorizer,
+	.update_authorizer = ceph_x_update_authorizer,
 	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
 	.destroy_authorizer = ceph_x_destroy_authorizer,
 	.invalidate_authorizer = ceph_x_invalidate_authorizer,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index f459e93b774..c5a058da7ac 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -29,6 +29,7 @@ struct ceph_x_authorizer {
 	struct ceph_buffer *buf;
 	unsigned int service;
 	u64 nonce;
+	u64 secret_id;
 	char reply_buf[128];  /* big enough for encrypted blob */
 };
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index e65e6e4be38..34b11ee8124 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -606,11 +606,17 @@ static int __init init_ceph_lib(void)
 	if (ret < 0)
 		goto out_crypto;
 
+	ret = ceph_osdc_setup();
+	if (ret < 0)
+		goto out_msgr;
+
 	pr_info("loaded (mon/osd proto %d/%d)\n",
 		CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
 
 	return 0;
 
+out_msgr:
+	ceph_msgr_exit();
 out_crypto:
 	ceph_crypto_shutdown();
 out_debugfs:
@@ -622,6 +628,7 @@ out:
 static void __exit exit_ceph_lib(void)
 {
 	dout("exit_ceph_lib\n");
+	ceph_osdc_cleanup();
 	ceph_msgr_exit();
 	ceph_crypto_shutdown();
 	ceph_debugfs_cleanup();
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 00d051f4894..83661cdc076 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp)
 	mutex_lock(&osdc->request_mutex);
 	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
 		struct ceph_osd_request *req;
+		unsigned int i;
 		int opcode;
-		int i;
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
 
@@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 			seq_printf(s, "\t");
 
 		for (i = 0; i < req->r_num_ops; i++) {
-			opcode = le16_to_cpu(req->r_request_ops[i].op);
+			opcode = req->r_ops[i].op;
 			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
 		}
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2c0669fb54e..eb0a46a49bd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -21,6 +21,9 @@
 #include <linux/ceph/pagelist.h>
 #include <linux/export.h>
 
+#define list_entry_next(pos, member)					\
+	list_entry(pos->member.next, typeof(*pos), member)
+
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
  * hosts in the system.  The messenger provides ordered and reliable
@@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
 	return test_and_set_bit(con_flag, &con->flags);
 }
 
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache	*ceph_msg_cache;
+static struct kmem_cache	*ceph_msg_data_cache;
+
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr)
  */
 static struct workqueue_struct *ceph_msgr_wq;
 
+static int ceph_msgr_slab_init(void)
+{
+	BUG_ON(ceph_msg_cache);
+	ceph_msg_cache = kmem_cache_create("ceph_msg",
+					sizeof (struct ceph_msg),
+					__alignof__(struct ceph_msg), 0, NULL);
+
+	if (!ceph_msg_cache)
+		return -ENOMEM;
+
+	BUG_ON(ceph_msg_data_cache);
+	ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
+					sizeof (struct ceph_msg_data),
+					__alignof__(struct ceph_msg_data),
+					0, NULL);
+	if (ceph_msg_data_cache)
+		return 0;
+
+	kmem_cache_destroy(ceph_msg_cache);
+	ceph_msg_cache = NULL;
+
+	return -ENOMEM;
+}
+
+static void ceph_msgr_slab_exit(void)
+{
+	BUG_ON(!ceph_msg_data_cache);
+	kmem_cache_destroy(ceph_msg_data_cache);
+	ceph_msg_data_cache = NULL;
+
+	BUG_ON(!ceph_msg_cache);
+	kmem_cache_destroy(ceph_msg_cache);
+	ceph_msg_cache = NULL;
+}
+
 static void _ceph_msgr_exit(void)
 {
 	if (ceph_msgr_wq) {
@@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void)
 		ceph_msgr_wq = NULL;
 	}
 
+	ceph_msgr_slab_exit();
+
 	BUG_ON(zero_page == NULL);
 	kunmap(zero_page);
 	page_cache_release(zero_page);
@@ -242,6 +287,9 @@ int ceph_msgr_init(void)
 	zero_page = ZERO_PAGE(0);
 	page_cache_get(zero_page);
 
+	if (ceph_msgr_slab_init())
+		return -ENOMEM;
+
 	ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
 	if (ceph_msgr_wq)
 		return 0;
@@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
 	return r;
 }
 
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+		     int page_offset, size_t length)
+{
+	void *kaddr;
+	int ret;
+
+	BUG_ON(page_offset + length > PAGE_SIZE);
+
+	kaddr = kmap(page);
+	BUG_ON(!kaddr);
+	ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
+	kunmap(page);
+
+	return ret;
+}
+
 /*
  * write something.  @more is true if caller will be sending more data
  * shortly.
@@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 }
 
 static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
-		     int offset, size_t size, int more)
+		     int offset, size_t size, bool more)
 {
 	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
 	int ret;
@@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con,
 }
 
 #ifdef CONFIG_BLOCK
-static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+
+/*
+ * For a bio data item, a piece is whatever remains of the next
+ * entry in the current bio iovec, or the first entry in the next
+ * bio in the list.
+ */
+static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
+					size_t length)
 {
-	if (!bio) {
-		*iter = NULL;
-		*seg = 0;
-		return;
+	struct ceph_msg_data *data = cursor->data;
+	struct bio *bio;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+	bio = data->bio;
+	BUG_ON(!bio);
+	BUG_ON(!bio->bi_vcnt);
+
+	cursor->resid = min(length, data->bio_length);
+	cursor->bio = bio;
+	cursor->vector_index = 0;
+	cursor->vector_offset = 0;
+	cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+}
+
+static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
+						size_t *page_offset,
+						size_t *length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct bio *bio;
+	struct bio_vec *bio_vec;
+	unsigned int index;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_BIO);
+
+	bio = cursor->bio;
+	BUG_ON(!bio);
+
+	index = cursor->vector_index;
+	BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+
+	bio_vec = &bio->bi_io_vec[index];
+	BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
+	*page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+	BUG_ON(*page_offset >= PAGE_SIZE);
+	if (cursor->last_piece) /* pagelist offset is always 0 */
+		*length = cursor->resid;
+	else
+		*length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+	BUG_ON(*length > cursor->resid);
+	BUG_ON(*page_offset + *length > PAGE_SIZE);
+
+	return bio_vec->bv_page;
+}
+
+static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
+					size_t bytes)
+{
+	struct bio *bio;
+	struct bio_vec *bio_vec;
+	unsigned int index;
+
+	BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
+
+	bio = cursor->bio;
+	BUG_ON(!bio);
+
+	index = cursor->vector_index;
+	BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+	bio_vec = &bio->bi_io_vec[index];
+
+	/* Advance the cursor offset */
+
+	BUG_ON(cursor->resid < bytes);
+	cursor->resid -= bytes;
+	cursor->vector_offset += bytes;
+	if (cursor->vector_offset < bio_vec->bv_len)
+		return false;	/* more bytes to process in this segment */
+	BUG_ON(cursor->vector_offset != bio_vec->bv_len);
+
+	/* Move on to the next segment, and possibly the next bio */
+
+	if (++index == (unsigned int) bio->bi_vcnt) {
+		bio = bio->bi_next;
+		index = 0;
 	}
-	*iter = bio;
-	*seg = bio->bi_idx;
+	cursor->bio = bio;
+	cursor->vector_index = index;
+	cursor->vector_offset = 0;
+
+	if (!cursor->last_piece) {
+		BUG_ON(!cursor->resid);
+		BUG_ON(!bio);
+		/* A short read is OK, so use <= rather than == */
+		if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+			cursor->last_piece = true;
+	}
+
+	return true;
 }
+#endif /* CONFIG_BLOCK */
 
-static void iter_bio_next(struct bio **bio_iter, int *seg)
+/*
+ * For a page array, a piece comes from the first page in the array
+ * that has not already been fully consumed.
+ */
+static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
+					size_t length)
 {
-	if (*bio_iter == NULL)
-		return;
+	struct ceph_msg_data *data = cursor->data;
+	int page_count;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
 
-	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+	BUG_ON(!data->pages);
+	BUG_ON(!data->length);
 
-	(*seg)++;
-	if (*seg == (*bio_iter)->bi_vcnt)
-		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+	cursor->resid = min(length, data->length);
+	page_count = calc_pages_for(data->alignment, (u64)data->length);
+	cursor->page_offset = data->alignment & ~PAGE_MASK;
+	cursor->page_index = 0;
+	BUG_ON(page_count > (int)USHRT_MAX);
+	cursor->page_count = (unsigned short)page_count;
+	BUG_ON(length > SIZE_MAX - cursor->page_offset);
+	cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
 }
-#endif
 
-static void prepare_write_message_data(struct ceph_connection *con)
+static struct page *
+ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
+					size_t *page_offset, size_t *length)
 {
-	struct ceph_msg *msg = con->out_msg;
+	struct ceph_msg_data *data = cursor->data;
 
-	BUG_ON(!msg);
-	BUG_ON(!msg->hdr.data_len);
+	BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
+
+	BUG_ON(cursor->page_index >= cursor->page_count);
+	BUG_ON(cursor->page_offset >= PAGE_SIZE);
+
+	*page_offset = cursor->page_offset;
+	if (cursor->last_piece)
+		*length = cursor->resid;
+	else
+		*length = PAGE_SIZE - *page_offset;
+
+	return data->pages[cursor->page_index];
+}
+
+static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
+						size_t bytes)
+{
+	BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
+
+	BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
+
+	/* Advance the cursor page offset */
+
+	cursor->resid -= bytes;
+	cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
+	if (!bytes || cursor->page_offset)
+		return false;	/* more bytes to process in the current page */
+
+	/* Move on to the next page; offset is already at 0 */
+
+	BUG_ON(cursor->page_index >= cursor->page_count);
+	cursor->page_index++;
+	cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+	return true;
+}
+
+/*
+ * For a pagelist, a piece is whatever remains to be consumed in the
+ * first page in the list, or the front of the next page.
+ */
+static void
+ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
+					size_t length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct ceph_pagelist *pagelist;
+	struct page *page;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+	pagelist = data->pagelist;
+	BUG_ON(!pagelist);
+
+	if (!length)
+		return;		/* pagelist can be assigned but empty */
+
+	BUG_ON(list_empty(&pagelist->head));
+	page = list_first_entry(&pagelist->head, struct page, lru);
+
+	cursor->resid = min(length, pagelist->length);
+	cursor->page = page;
+	cursor->offset = 0;
+	cursor->last_piece = cursor->resid <= PAGE_SIZE;
+}
+
+static struct page *
+ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
+				size_t *page_offset, size_t *length)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct ceph_pagelist *pagelist;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
 
-	/* initialize page iterator */
-	con->out_msg_pos.page = 0;
-	if (msg->pages)
-		con->out_msg_pos.page_pos = msg->page_alignment;
+	pagelist = data->pagelist;
+	BUG_ON(!pagelist);
+
+	BUG_ON(!cursor->page);
+	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+
+	/* offset of first page in pagelist is always 0 */
+	*page_offset = cursor->offset & ~PAGE_MASK;
+	if (cursor->last_piece)
+		*length = cursor->resid;
 	else
-		con->out_msg_pos.page_pos = 0;
+		*length = PAGE_SIZE - *page_offset;
+
+	return cursor->page;
+}
+
+static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
+						size_t bytes)
+{
+	struct ceph_msg_data *data = cursor->data;
+	struct ceph_pagelist *pagelist;
+
+	BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
+
+	pagelist = data->pagelist;
+	BUG_ON(!pagelist);
+
+	BUG_ON(cursor->offset + cursor->resid != pagelist->length);
+	BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
+
+	/* Advance the cursor offset */
+
+	cursor->resid -= bytes;
+	cursor->offset += bytes;
+	/* offset of first page in pagelist is always 0 */
+	if (!bytes || cursor->offset & ~PAGE_MASK)
+		return false;	/* more bytes to process in the current page */
+
+	/* Move on to the next page */
+
+	BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
+	cursor->page = list_entry_next(cursor->page, lru);
+	cursor->last_piece = cursor->resid <= PAGE_SIZE;
+
+	return true;
+}
+
+/*
+ * Message data is handled (sent or received) in pieces, where each
+ * piece resides on a single page.  The network layer might not
+ * consume an entire piece at once.  A data item's cursor keeps
+ * track of which piece is next to process and how much remains to
+ * be processed in that piece.  It also tracks whether the current
+ * piece is the last one in the data item.
+ */
+static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
+{
+	size_t length = cursor->total_resid;
+
+	switch (cursor->data->type) {
+	case CEPH_MSG_DATA_PAGELIST:
+		ceph_msg_data_pagelist_cursor_init(cursor, length);
+		break;
+	case CEPH_MSG_DATA_PAGES:
+		ceph_msg_data_pages_cursor_init(cursor, length);
+		break;
 #ifdef CONFIG_BLOCK
-	if (msg->bio)
-		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
-#endif
-	con->out_msg_pos.data_pos = 0;
-	con->out_msg_pos.did_page_crc = false;
-	con->out_more = 1;  /* data + footer will follow */
+	case CEPH_MSG_DATA_BIO:
+		ceph_msg_data_bio_cursor_init(cursor, length);
+		break;
+#endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_NONE:
+	default:
+		/* BUG(); */
+		break;
+	}
+	cursor->need_crc = true;
+}
+
+static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+{
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	struct ceph_msg_data *data;
+
+	BUG_ON(!length);
+	BUG_ON(length > msg->data_length);
+	BUG_ON(list_empty(&msg->data));
+
+	cursor->data_head = &msg->data;
+	cursor->total_resid = length;
+	data = list_first_entry(&msg->data, struct ceph_msg_data, links);
+	cursor->data = data;
+
+	__ceph_msg_data_cursor_init(cursor);
+}
+
+/*
+ * Return the page containing the next piece to process for a given
+ * data item, and supply the page offset and length of that piece.
+ * Indicate whether this is the last piece in this data item.
+ */
+static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+					size_t *page_offset, size_t *length,
+					bool *last_piece)
+{
+	struct page *page;
+
+	switch (cursor->data->type) {
+	case CEPH_MSG_DATA_PAGELIST:
+		page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
+		break;
+	case CEPH_MSG_DATA_PAGES:
+		page = ceph_msg_data_pages_next(cursor, page_offset, length);
+		break;
+#ifdef CONFIG_BLOCK
+	case CEPH_MSG_DATA_BIO:
+		page = ceph_msg_data_bio_next(cursor, page_offset, length);
+		break;
+#endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_NONE:
+	default:
+		page = NULL;
+		break;
+	}
+	BUG_ON(!page);
+	BUG_ON(*page_offset + *length > PAGE_SIZE);
+	BUG_ON(!*length);
+	if (last_piece)
+		*last_piece = cursor->last_piece;
+
+	return page;
+}
+
+/*
+ * Returns true if the result moves the cursor on to the next piece
+ * of the data item.
+ */
+static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
+				size_t bytes)
+{
+	bool new_piece;
+
+	BUG_ON(bytes > cursor->resid);
+	switch (cursor->data->type) {
+	case CEPH_MSG_DATA_PAGELIST:
+		new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
+		break;
+	case CEPH_MSG_DATA_PAGES:
+		new_piece = ceph_msg_data_pages_advance(cursor, bytes);
+		break;
+#ifdef CONFIG_BLOCK
+	case CEPH_MSG_DATA_BIO:
+		new_piece = ceph_msg_data_bio_advance(cursor, bytes);
+		break;
+#endif /* CONFIG_BLOCK */
+	case CEPH_MSG_DATA_NONE:
+	default:
+		BUG();
+		break;
+	}
+	cursor->total_resid -= bytes;
+
+	if (!cursor->resid && cursor->total_resid) {
+		WARN_ON(!cursor->last_piece);
+		BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
+		cursor->data = list_entry_next(cursor->data, links);
+		__ceph_msg_data_cursor_init(cursor);
+		new_piece = true;
+	}
+	cursor->need_crc = new_piece;
+
+	return new_piece;
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+	BUG_ON(!msg);
+	BUG_ON(!data_len);
+
+	/* Initialize data cursor */
+
+	ceph_msg_data_cursor_init(msg, (size_t)data_len);
 }
 
 /*
@@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con)
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
 	}
-#ifdef CONFIG_BLOCK
-	else
-		m->bio_iter = NULL;
-#endif
+	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
 
-	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
 	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-	     le32_to_cpu(m->hdr.data_len),
-	     m->nr_pages);
+	     m->data_length);
 	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
 
 	/* tag + hdr + front + middle */
@@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con)
 
 	/* is there a data payload? */
 	con->out_msg->footer.data_crc = 0;
-	if (m->hdr.data_len)
-		prepare_write_message_data(con);
-	else
+	if (m->data_length) {
+		prepare_message_data(con->out_msg, m->data_length);
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
 		/* no, queue up footer too and be done */
 		prepare_write_message_footer(con);
+	}
 
 	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
@@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con)
 }
 
 /*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+	dout("prepare_write_seq %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con_out_kvec_reset(con);
+
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con_out_kvec_add(con, sizeof (con->out_temp_ack),
+			 &con->out_temp_ack);
+
+	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+}
+
+/*
  * Prepare to write keepalive byte.
  */
 static void prepare_write_keepalive(struct ceph_connection *con)
@@ -1022,35 +1449,19 @@ out:
 	return ret;  /* done! */
 }
 
-static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
-			size_t len, size_t sent, bool in_trail)
+static u32 ceph_crc32c_page(u32 crc, struct page *page,
+				unsigned int page_offset,
+				unsigned int length)
 {
-	struct ceph_msg *msg = con->out_msg;
+	char *kaddr;
 
-	BUG_ON(!msg);
-	BUG_ON(!sent);
-
-	con->out_msg_pos.data_pos += sent;
-	con->out_msg_pos.page_pos += sent;
-	if (sent < len)
-		return;
+	kaddr = kmap(page);
+	BUG_ON(kaddr == NULL);
+	crc = crc32c(crc, kaddr + page_offset, length);
+	kunmap(page);
 
-	BUG_ON(sent != len);
-	con->out_msg_pos.page_pos = 0;
-	con->out_msg_pos.page++;
-	con->out_msg_pos.did_page_crc = false;
-	if (in_trail)
-		list_move_tail(&page->lru,
-			       &msg->trail->head);
-	else if (msg->pagelist)
-		list_move_tail(&page->lru,
-			       &msg->pagelist->head);
-#ifdef CONFIG_BLOCK
-	else if (msg->bio)
-		iter_bio_next(&msg->bio_iter, &msg->bio_seg);
-#endif
+	return crc;
 }
-
 /*
  * Write as much message data payload as we can.  If we finish, queue
  * up the footer.
@@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
  *  0 -> socket full, but more to do
  * <0 -> error
  */
-static int write_partial_msg_pages(struct ceph_connection *con)
+static int write_partial_message_data(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->out_msg;
-	unsigned int data_len = le32_to_cpu(msg->hdr.data_len);
-	size_t len;
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
 	bool do_datacrc = !con->msgr->nocrc;
-	int ret;
-	int total_max_write;
-	bool in_trail = false;
-	const size_t trail_len = (msg->trail ? msg->trail->length : 0);
-	const size_t trail_off = data_len - trail_len;
+	u32 crc;
 
-	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-	     con, msg, con->out_msg_pos.page, msg->nr_pages,
-	     con->out_msg_pos.page_pos);
+	dout("%s %p msg %p\n", __func__, con, msg);
+
+	if (list_empty(&msg->data))
+		return -EINVAL;
 
 	/*
 	 * Iterate through each page that contains data to be
@@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 	 * need to map the page.  If we have no pages, they have
 	 * been revoked, so use the zero page.
 	 */
-	while (data_len > con->out_msg_pos.data_pos) {
-		struct page *page = NULL;
-		int max_write = PAGE_SIZE;
-		int bio_offset = 0;
-
-		in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off;
-		if (!in_trail)
-			total_max_write = trail_off - con->out_msg_pos.data_pos;
-
-		if (in_trail) {
-			total_max_write = data_len - con->out_msg_pos.data_pos;
-
-			page = list_first_entry(&msg->trail->head,
-						struct page, lru);
-		} else if (msg->pages) {
-			page = msg->pages[con->out_msg_pos.page];
-		} else if (msg->pagelist) {
-			page = list_first_entry(&msg->pagelist->head,
-						struct page, lru);
-#ifdef CONFIG_BLOCK
-		} else if (msg->bio) {
-			struct bio_vec *bv;
+	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+	while (cursor->resid) {
+		struct page *page;
+		size_t page_offset;
+		size_t length;
+		bool last_piece;
+		bool need_crc;
+		int ret;
 
-			bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
-			page = bv->bv_page;
-			bio_offset = bv->bv_offset;
-			max_write = bv->bv_len;
-#endif
-		} else {
-			page = zero_page;
-		}
-		len = min_t(int, max_write - con->out_msg_pos.page_pos,
-			    total_max_write);
-
-		if (do_datacrc && !con->out_msg_pos.did_page_crc) {
-			void *base;
-			u32 crc = le32_to_cpu(msg->footer.data_crc);
-			char *kaddr;
-
-			kaddr = kmap(page);
-			BUG_ON(kaddr == NULL);
-			base = kaddr + con->out_msg_pos.page_pos + bio_offset;
-			crc = crc32c(crc, base, len);
-			kunmap(page);
-			msg->footer.data_crc = cpu_to_le32(crc);
-			con->out_msg_pos.did_page_crc = true;
-		}
-		ret = ceph_tcp_sendpage(con->sock, page,
-				      con->out_msg_pos.page_pos + bio_offset,
-				      len, 1);
-		if (ret <= 0)
-			goto out;
+		page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
+							&last_piece);
+		ret = ceph_tcp_sendpage(con->sock, page, page_offset,
+				      length, last_piece);
+		if (ret <= 0) {
+			if (do_datacrc)
+				msg->footer.data_crc = cpu_to_le32(crc);
 
-		out_msg_pos_next(con, page, len, (size_t) ret, in_trail);
+			return ret;
+		}
+		if (do_datacrc && cursor->need_crc)
+			crc = ceph_crc32c_page(crc, page, page_offset, length);
+		need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
 	}
 
-	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+	dout("%s %p msg %p done\n", __func__, con, msg);
 
 	/* prepare and queue up footer, too */
-	if (!do_datacrc)
+	if (do_datacrc)
+		msg->footer.data_crc = cpu_to_le32(crc);
+	else
 		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
 	con_out_kvec_reset(con);
 	prepare_write_message_footer(con);
-	ret = 1;
-out:
-	return ret;
+
+	return 1;	/* must return > 0 to indicate success */
 }
 
 /*
@@ -1160,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con)
 	while (con->out_skip > 0) {
 		size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
 
-		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1);
+		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
 		if (ret <= 0)
 			goto out;
 		con->out_skip -= ret;
@@ -1191,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con)
 	con->in_base_pos = 0;
 }
 
+static void prepare_read_seq(struct ceph_connection *con)
+{
+	dout("prepare_read_seq %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
 static void prepare_read_tag(struct ceph_connection *con)
 {
 	dout("prepare_read_tag %p\n", con);
@@ -1597,7 +1980,6 @@ static int process_connect(struct ceph_connection *con)
 			con->error_msg = "connect authorization failure";
 			return -1;
 		}
-		con->auth_retry = 1;
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
@@ -1668,6 +2050,7 @@ static int process_connect(struct ceph_connection *con)
 		prepare_read_connect(con);
 		break;
 
+	case CEPH_MSGR_TAG_SEQ:
 	case CEPH_MSGR_TAG_READY:
 		if (req_feat & ~server_feat) {
 			pr_err("%s%lld %s protocol feature mismatch,"
@@ -1682,7 +2065,7 @@ static int process_connect(struct ceph_connection *con)
 
 		WARN_ON(con->state != CON_STATE_NEGOTIATING);
 		con->state = CON_STATE_OPEN;
-
+		con->auth_retry = 0;    /* we authenticated; clear flag */
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
 		con->peer_features = server_feat;
@@ -1698,7 +2081,12 @@ static int process_connect(struct ceph_connection *con)
 
 		con->delay = 0;      /* reset backoff memory */
 
-		prepare_read_tag(con);
+		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+			prepare_write_seq(con);
+			prepare_read_seq(con);
+		} else {
+			prepare_read_tag(con);
+		}
 		break;
 
 	case CEPH_MSGR_TAG_WAIT:
@@ -1732,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con)
 	return read_partial(con, end, size, &con->in_temp_ack);
 }
 
-
 /*
  * We can finally discard anything that's been acked.
  */
@@ -1757,8 +2144,6 @@ static void process_ack(struct ceph_connection *con)
 }
 
 
-
-
 static int read_partial_message_section(struct ceph_connection *con,
 					struct kvec *section,
 					unsigned int sec_len, u32 *crc)
@@ -1782,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con,
 	return 1;
 }
 
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
-
-static int read_partial_message_pages(struct ceph_connection *con,
-				      struct page **pages,
-				      unsigned int data_len, bool do_datacrc)
+static int read_partial_msg_data(struct ceph_connection *con)
 {
-	void *p;
+	struct ceph_msg *msg = con->in_msg;
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	const bool do_datacrc = !con->msgr->nocrc;
+	struct page *page;
+	size_t page_offset;
+	size_t length;
+	u32 crc = 0;
 	int ret;
-	int left;
 
-	left = min((int)(data_len - con->in_msg_pos.data_pos),
-		   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-	/* (page) data */
-	BUG_ON(pages == NULL);
-	p = kmap(pages[con->in_msg_pos.page]);
-	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-			       left);
-	if (ret > 0 && do_datacrc)
-		con->in_data_crc =
-			crc32c(con->in_data_crc,
-				  p + con->in_msg_pos.page_pos, ret);
-	kunmap(pages[con->in_msg_pos.page]);
-	if (ret <= 0)
-		return ret;
-	con->in_msg_pos.data_pos += ret;
-	con->in_msg_pos.page_pos += ret;
-	if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-		con->in_msg_pos.page_pos = 0;
-		con->in_msg_pos.page++;
-	}
-
-	return ret;
-}
-
-#ifdef CONFIG_BLOCK
-static int read_partial_message_bio(struct ceph_connection *con,
-				    struct bio **bio_iter, int *bio_seg,
-				    unsigned int data_len, bool do_datacrc)
-{
-	struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
-	void *p;
-	int ret, left;
+	BUG_ON(!msg);
+	if (list_empty(&msg->data))
+		return -EIO;
 
-	left = min((int)(data_len - con->in_msg_pos.data_pos),
-		   (int)(bv->bv_len - con->in_msg_pos.page_pos));
+	if (do_datacrc)
+		crc = con->in_data_crc;
+	while (cursor->resid) {
+		page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
+							NULL);
+		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+		if (ret <= 0) {
+			if (do_datacrc)
+				con->in_data_crc = crc;
 
-	p = kmap(bv->bv_page) + bv->bv_offset;
+			return ret;
+		}
 
-	ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-			       left);
-	if (ret > 0 && do_datacrc)
-		con->in_data_crc =
-			crc32c(con->in_data_crc,
-				  p + con->in_msg_pos.page_pos, ret);
-	kunmap(bv->bv_page);
-	if (ret <= 0)
-		return ret;
-	con->in_msg_pos.data_pos += ret;
-	con->in_msg_pos.page_pos += ret;
-	if (con->in_msg_pos.page_pos == bv->bv_len) {
-		con->in_msg_pos.page_pos = 0;
-		iter_bio_next(bio_iter, bio_seg);
+		if (do_datacrc)
+			crc = ceph_crc32c_page(crc, page, page_offset, ret);
+		(void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
 	}
+	if (do_datacrc)
+		con->in_data_crc = crc;
 
-	return ret;
+	return 1;	/* must return > 0 to indicate success */
 }
-#endif
 
 /*
  * read (part of) a message.
  */
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
+
 static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
@@ -1885,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con)
 	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
 		return -EIO;
 	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
 		return -EIO;
 	data_len = le32_to_cpu(con->in_hdr.data_len);
 	if (data_len > CEPH_MSG_MAX_DATA_LEN)
@@ -1914,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con)
 		int skip = 0;
 
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-		     con->in_hdr.front_len, con->in_hdr.data_len);
+		     front_len, data_len);
 		ret = ceph_con_in_msg_alloc(con, &skip);
 		if (ret < 0)
 			return ret;
+
+		BUG_ON(!con->in_msg ^ skip);
+		if (con->in_msg && data_len > con->in_msg->data_length) {
+			pr_warning("%s skipping long message (%u > %zd)\n",
+				__func__, data_len, con->in_msg->data_length);
+			ceph_msg_put(con->in_msg);
+			con->in_msg = NULL;
+			skip = 1;
+		}
 		if (skip) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
-			BUG_ON(con->in_msg);
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1936,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con)
 		if (m->middle)
 			m->middle->vec.iov_len = 0;
 
-		con->in_msg_pos.page = 0;
-		if (m->pages)
-			con->in_msg_pos.page_pos = m->page_alignment;
-		else
-			con->in_msg_pos.page_pos = 0;
-		con->in_msg_pos.data_pos = 0;
+		/* prepare for data payload, if any */
 
-#ifdef CONFIG_BLOCK
-		if (m->bio)
-			init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
-#endif
+		if (data_len)
+			prepare_message_data(con->in_msg, data_len);
 	}
 
 	/* front */
@@ -1965,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con)
 	}
 
 	/* (page) data */
-	while (con->in_msg_pos.data_pos < data_len) {
-		if (m->pages) {
-			ret = read_partial_message_pages(con, m->pages,
-						 data_len, do_datacrc);
-			if (ret <= 0)
-				return ret;
-#ifdef CONFIG_BLOCK
-		} else if (m->bio) {
-			BUG_ON(!m->bio_iter);
-			ret = read_partial_message_bio(con,
-						 &m->bio_iter, &m->bio_seg,
-						 data_len, do_datacrc);
-			if (ret <= 0)
-				return ret;
-#endif
-		} else {
-			BUG_ON(1);
-		}
+	if (data_len) {
+		ret = read_partial_msg_data(con);
+		if (ret <= 0)
+			return ret;
 	}
 
 	/* footer */
@@ -2108,13 +2452,13 @@ more_kvec:
 			goto do_next;
 		}
 
-		ret = write_partial_msg_pages(con);
+		ret = write_partial_message_data(con);
 		if (ret == 1)
 			goto more_kvec;  /* we need to send the footer, too! */
 		if (ret == 0)
 			goto out;
 		if (ret < 0) {
-			dout("try_write write_partial_msg_pages err %d\n",
+			dout("try_write write_partial_message_data err %d\n",
 			     ret);
 			goto out;
 		}
@@ -2266,7 +2610,12 @@ more:
 			prepare_read_tag(con);
 		goto more;
 	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+	if (con->in_tag == CEPH_MSGR_TAG_ACK ||
+	    con->in_tag == CEPH_MSGR_TAG_SEQ) {
+		/*
+		 * the final handshake seq exchange is semantically
+		 * equivalent to an ACK
+		 */
 		ret = read_partial_ack(con);
 		if (ret <= 0)
 			goto out;
@@ -2672,6 +3021,88 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
+{
+	struct ceph_msg_data *data;
+
+	if (WARN_ON(!ceph_msg_data_type_valid(type)))
+		return NULL;
+
+	data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
+	if (data)
+		data->type = type;
+	INIT_LIST_HEAD(&data->links);
+
+	return data;
+}
+
+static void ceph_msg_data_destroy(struct ceph_msg_data *data)
+{
+	if (!data)
+		return;
+
+	WARN_ON(!list_empty(&data->links));
+	if (data->type == CEPH_MSG_DATA_PAGELIST) {
+		ceph_pagelist_release(data->pagelist);
+		kfree(data->pagelist);
+	}
+	kmem_cache_free(ceph_msg_data_cache, data);
+}
+
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+		size_t length, size_t alignment)
+{
+	struct ceph_msg_data *data;
+
+	BUG_ON(!pages);
+	BUG_ON(!length);
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
+	BUG_ON(!data);
+	data->pages = pages;
+	data->length = length;
+	data->alignment = alignment & ~PAGE_MASK;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pages);
+
+void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
+				struct ceph_pagelist *pagelist)
+{
+	struct ceph_msg_data *data;
+
+	BUG_ON(!pagelist);
+	BUG_ON(!pagelist->length);
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
+	BUG_ON(!data);
+	data->pagelist = pagelist;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += pagelist->length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
+
+#ifdef	CONFIG_BLOCK
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
+		size_t length)
+{
+	struct ceph_msg_data *data;
+
+	BUG_ON(!bio);
+
+	data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
+	BUG_ON(!data);
+	data->bio = bio;
+	data->bio_length = length;
+
+	list_add_tail(&data->links, &msg->data);
+	msg->data_length += length;
+}
+EXPORT_SYMBOL(ceph_msg_data_add_bio);
+#endif	/* CONFIG_BLOCK */
 
 /*
  * construct a new message with given type, size
@@ -2682,49 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
 {
 	struct ceph_msg *m;
 
-	m = kmalloc(sizeof(*m), flags);
+	m = kmem_cache_zalloc(ceph_msg_cache, flags);
 	if (m == NULL)
 		goto out;
-	kref_init(&m->kref);
 
-	m->con = NULL;
-	INIT_LIST_HEAD(&m->list_head);
-
-	m->hdr.tid = 0;
 	m->hdr.type = cpu_to_le16(type);
 	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-	m->hdr.version = 0;
 	m->hdr.front_len = cpu_to_le32(front_len);
-	m->hdr.middle_len = 0;
-	m->hdr.data_len = 0;
-	m->hdr.data_off = 0;
-	m->hdr.reserved = 0;
-	m->footer.front_crc = 0;
-	m->footer.middle_crc = 0;
-	m->footer.data_crc = 0;
-	m->footer.flags = 0;
-	m->front_max = front_len;
-	m->front_is_vmalloc = false;
-	m->more_to_follow = false;
-	m->ack_stamp = 0;
-	m->pool = NULL;
-
-	/* middle */
-	m->middle = NULL;
 
-	/* data */
-	m->nr_pages = 0;
-	m->page_alignment = 0;
-	m->pages = NULL;
-	m->pagelist = NULL;
-#ifdef	CONFIG_BLOCK
-	m->bio = NULL;
-	m->bio_iter = NULL;
-	m->bio_seg = 0;
-#endif	/* CONFIG_BLOCK */
-	m->trail = NULL;
+	INIT_LIST_HEAD(&m->list_head);
+	kref_init(&m->kref);
+	INIT_LIST_HEAD(&m->data);
 
 	/* front */
+	m->front_max = front_len;
 	if (front_len) {
 		if (front_len > PAGE_CACHE_SIZE) {
 			m->front.iov_base = __vmalloc(front_len, flags,
@@ -2802,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 {
 	struct ceph_msg_header *hdr = &con->in_hdr;
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
 	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg;
 	int ret = 0;
 
 	BUG_ON(con->in_msg != NULL);
+	BUG_ON(!con->ops->alloc_msg);
 
-	if (con->ops->alloc_msg) {
-		struct ceph_msg *msg;
-
-		mutex_unlock(&con->mutex);
-		msg = con->ops->alloc_msg(con, hdr, skip);
-		mutex_lock(&con->mutex);
-		if (con->state != CON_STATE_OPEN) {
-			if (msg)
-				ceph_msg_put(msg);
-			return -EAGAIN;
-		}
-		con->in_msg = msg;
-		if (con->in_msg) {
-			con->in_msg->con = con->ops->get(con);
-			BUG_ON(con->in_msg->con == NULL);
-		}
-		if (*skip) {
-			con->in_msg = NULL;
-			return 0;
-		}
-		if (!con->in_msg) {
-			con->error_msg =
-				"error allocating memory for incoming message";
-			return -ENOMEM;
-		}
+	mutex_unlock(&con->mutex);
+	msg = con->ops->alloc_msg(con, hdr, skip);
+	mutex_lock(&con->mutex);
+	if (con->state != CON_STATE_OPEN) {
+		if (msg)
+			ceph_msg_put(msg);
+		return -EAGAIN;
 	}
-	if (!con->in_msg) {
-		con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
-		if (!con->in_msg) {
-			pr_err("unable to allocate msg type %d len %d\n",
-			       type, front_len);
-			return -ENOMEM;
-		}
+	if (msg) {
+		BUG_ON(*skip);
+		con->in_msg = msg;
 		con->in_msg->con = con->ops->get(con);
 		BUG_ON(con->in_msg->con == NULL);
-		con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
+	} else {
+		/*
+		 * Null message pointer means either we should skip
+		 * this message or we couldn't allocate memory.  The
+		 * former is not an error.
+		 */
+		if (*skip)
+			return 0;
+		con->error_msg = "error allocating memory for incoming message";
+
+		return -ENOMEM;
 	}
 	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
@@ -2870,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m)
 		vfree(m->front.iov_base);
 	else
 		kfree(m->front.iov_base);
-	kfree(m);
+	kmem_cache_free(ceph_msg_cache, m);
 }
 
 /*
@@ -2879,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m)
 void ceph_msg_last_put(struct kref *kref)
 {
 	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+	LIST_HEAD(data);
+	struct list_head *links;
+	struct list_head *next;
 
 	dout("ceph_msg_put last one on %p\n", m);
 	WARN_ON(!list_empty(&m->list_head));
@@ -2888,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref)
 		ceph_buffer_put(m->middle);
 		m->middle = NULL;
 	}
-	m->nr_pages = 0;
-	m->pages = NULL;
 
-	if (m->pagelist) {
-		ceph_pagelist_release(m->pagelist);
-		kfree(m->pagelist);
-		m->pagelist = NULL;
-	}
+	list_splice_init(&m->data, &data);
+	list_for_each_safe(links, next, &data) {
+		struct ceph_msg_data *data;
 
-	m->trail = NULL;
+		data = list_entry(links, struct ceph_msg_data, links);
+		list_del_init(links);
+		ceph_msg_data_destroy(data);
+	}
+	m->data_length = 0;
 
 	if (m->pool)
 		ceph_msgpool_put(m->pool, m);
@@ -2908,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
 
 void ceph_msg_dump(struct ceph_msg *msg)
 {
-	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-		 msg->front_max, msg->nr_pages);
+	pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
+		 msg->front_max, msg->data_length);
 	print_hex_dump(KERN_DEBUG, "header: ",
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       &msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index aef5b1062be..1fe25cd29d0 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work)
 
 		__validate_auth(monc);
 
-		if (monc->auth->ops->is_authenticated(monc->auth))
+		if (ceph_auth_is_authenticated(monc->auth))
 			__send_subscribe(monc);
 	}
 	__schedule_delayed(monc);
@@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 
 	mutex_lock(&monc->mutex);
 	had_debugfs_info = have_debugfs_info(monc);
-	if (monc->auth->ops)
-		was_auth = monc->auth->ops->is_authenticated(monc->auth);
+	was_auth = ceph_auth_is_authenticated(monc->auth);
 	monc->pending_auth = 0;
 	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
 				     msg->front.iov_len,
@@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		wake_up_all(&monc->client->auth_wq);
 	} else if (ret > 0) {
 		__send_prepared_auth_request(monc, ret);
-	} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
+	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
 		dout("authenticated, starting session\n");
 
 		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d730dd4d8eb..d5953b87918 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,3 +1,4 @@
+
 #include <linux/ceph/ceph_debug.h>
 
 #include <linux/module.h>
@@ -21,6 +22,8 @@
 #define OSD_OP_FRONT_LEN	4096
 #define OSD_OPREPLY_FRONT_LEN	512
 
+static struct kmem_cache	*ceph_osd_request_cache;
+
 static const struct ceph_connection_operations osd_con_ops;
 
 static void __send_queued(struct ceph_osd_client *osdc);
@@ -32,12 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 static void __send_request(struct ceph_osd_client *osdc,
 			   struct ceph_osd_request *req);
 
-static int op_has_extent(int op)
-{
-	return (op == CEPH_OSD_OP_READ ||
-		op == CEPH_OSD_OP_WRITE);
-}
-
 /*
  * Implement client access to distributed object storage cluster.
  *
@@ -63,53 +60,238 @@ static int op_has_extent(int op)
  *
  * fill osd op in request message.
  */
-static int calc_layout(struct ceph_vino vino,
-		       struct ceph_file_layout *layout,
-		       u64 off, u64 *plen,
-		       struct ceph_osd_request *req,
-		       struct ceph_osd_req_op *op)
+static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
+			u64 *objnum, u64 *objoff, u64 *objlen)
 {
 	u64 orig_len = *plen;
-	u64 bno = 0;
-	u64 objoff = 0;
-	u64 objlen = 0;
 	int r;
 
 	/* object extent? */
-	r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno,
-					  &objoff, &objlen);
+	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
+					  objoff, objlen);
 	if (r < 0)
 		return r;
-	if (objlen < orig_len) {
-		*plen = objlen;
+	if (*objlen < orig_len) {
+		*plen = *objlen;
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
 	}
 
-	if (op_has_extent(op->op)) {
-		u32 osize = le32_to_cpu(layout->fl_object_size);
-		op->extent.offset = objoff;
-		op->extent.length = objlen;
-		if (op->extent.truncate_size <= off - objoff) {
-			op->extent.truncate_size = 0;
-		} else {
-			op->extent.truncate_size -= off - objoff;
-			if (op->extent.truncate_size > osize)
-				op->extent.truncate_size = osize;
-		}
+	dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
+
+	return 0;
+}
+
+static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
+{
+	memset(osd_data, 0, sizeof (*osd_data));
+	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
+}
+
+static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
+			struct page **pages, u64 length, u32 alignment,
+			bool pages_from_pool, bool own_pages)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
+	osd_data->pages = pages;
+	osd_data->length = length;
+	osd_data->alignment = alignment;
+	osd_data->pages_from_pool = pages_from_pool;
+	osd_data->own_pages = own_pages;
+}
+
+static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
+			struct ceph_pagelist *pagelist)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
+	osd_data->pagelist = pagelist;
+}
+
+#ifdef CONFIG_BLOCK
+static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
+			struct bio *bio, size_t bio_length)
+{
+	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
+	osd_data->bio = bio;
+	osd_data->bio_length = bio_length;
+}
+#endif /* CONFIG_BLOCK */
+
+#define osd_req_op_data(oreq, whch, typ, fld)	\
+	({						\
+		BUG_ON(whch >= (oreq)->r_num_ops);	\
+		&(oreq)->r_ops[whch].typ.fld;		\
+	})
+
+static struct ceph_osd_data *
+osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
+{
+	BUG_ON(which >= osd_req->r_num_ops);
+
+	return &osd_req->r_ops[which].raw_data_in;
+}
+
+struct ceph_osd_data *
+osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
+			unsigned int which)
+{
+	return osd_req_op_data(osd_req, which, extent, osd_data);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data);
+
+struct ceph_osd_data *
+osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
+			unsigned int which)
+{
+	return osd_req_op_data(osd_req, which, cls, response_data);
+}
+EXPORT_SYMBOL(osd_req_op_cls_response_data);	/* ??? */
+
+void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
+			unsigned int which, struct page **pages,
+			u64 length, u32 alignment,
+			bool pages_from_pool, bool own_pages)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_raw_data_in(osd_req, which);
+	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+				pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
+
+void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
+			unsigned int which, struct page **pages,
+			u64 length, u32 alignment,
+			bool pages_from_pool, bool own_pages)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+				pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
+
+void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
+			unsigned int which, struct ceph_pagelist *pagelist)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
+
+#ifdef CONFIG_BLOCK
+void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
+			unsigned int which, struct bio *bio, size_t bio_length)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+	ceph_osd_data_bio_init(osd_data, bio, bio_length);
+}
+EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
+#endif /* CONFIG_BLOCK */
+
+static void osd_req_op_cls_request_info_pagelist(
+			struct ceph_osd_request *osd_req,
+			unsigned int which, struct ceph_pagelist *pagelist)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
+	ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+
+void osd_req_op_cls_request_data_pagelist(
+			struct ceph_osd_request *osd_req,
+			unsigned int which, struct ceph_pagelist *pagelist)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+	ceph_osd_data_pagelist_init(osd_data, pagelist);
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
+
+void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
+			unsigned int which, struct page **pages, u64 length,
+			u32 alignment, bool pages_from_pool, bool own_pages)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+				pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
+
+void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
+			unsigned int which, struct page **pages, u64 length,
+			u32 alignment, bool pages_from_pool, bool own_pages)
+{
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
+	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
+				pages_from_pool, own_pages);
+}
+EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
+
+static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
+{
+	switch (osd_data->type) {
+	case CEPH_OSD_DATA_TYPE_NONE:
+		return 0;
+	case CEPH_OSD_DATA_TYPE_PAGES:
+		return osd_data->length;
+	case CEPH_OSD_DATA_TYPE_PAGELIST:
+		return (u64)osd_data->pagelist->length;
+#ifdef CONFIG_BLOCK
+	case CEPH_OSD_DATA_TYPE_BIO:
+		return (u64)osd_data->bio_length;
+#endif /* CONFIG_BLOCK */
+	default:
+		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
+		return 0;
 	}
-	req->r_num_pages = calc_pages_for(off, *plen);
-	req->r_page_alignment = off & ~PAGE_MASK;
-	if (op->op == CEPH_OSD_OP_WRITE)
-		op->payload_len = *plen;
+}
 
-	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-	     bno, objoff, objlen, req->r_num_pages);
+static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
+{
+	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
+		int num_pages;
 
-	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
-	req->r_oid_len = strlen(req->r_oid);
+		num_pages = calc_pages_for((u64)osd_data->alignment,
+						(u64)osd_data->length);
+		ceph_release_page_vector(osd_data->pages, num_pages);
+	}
+	ceph_osd_data_init(osd_data);
+}
+
+static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
+			unsigned int which)
+{
+	struct ceph_osd_req_op *op;
+
+	BUG_ON(which >= osd_req->r_num_ops);
+	op = &osd_req->r_ops[which];
 
-	return r;
+	switch (op->op) {
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		ceph_osd_data_release(&op->extent.osd_data);
+		break;
+	case CEPH_OSD_OP_CALL:
+		ceph_osd_data_release(&op->cls.request_info);
+		ceph_osd_data_release(&op->cls.request_data);
+		ceph_osd_data_release(&op->cls.response_data);
+		break;
+	default:
+		break;
+	}
 }
 
 /*
@@ -117,30 +299,26 @@ static int calc_layout(struct ceph_vino vino,
  */
 void ceph_osdc_release_request(struct kref *kref)
 {
-	struct ceph_osd_request *req = container_of(kref,
-						    struct ceph_osd_request,
-						    r_kref);
+	struct ceph_osd_request *req;
+	unsigned int which;
 
+	req = container_of(kref, struct ceph_osd_request, r_kref);
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
-	if (req->r_con_filling_msg) {
-		dout("%s revoking msg %p from con %p\n", __func__,
-		     req->r_reply, req->r_con_filling_msg);
+	if (req->r_reply) {
 		ceph_msg_revoke_incoming(req->r_reply);
-		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
-		req->r_con_filling_msg = NULL;
-	}
-	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	if (req->r_own_pages)
-		ceph_release_page_vector(req->r_pages,
-					 req->r_num_pages);
+	}
+
+	for (which = 0; which < req->r_num_ops; which++)
+		osd_req_op_data_release(req, which);
+
 	ceph_put_snap_context(req->r_snapc);
-	ceph_pagelist_release(&req->r_trail);
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
-		kfree(req);
+		kmem_cache_free(ceph_osd_request_cache, req);
+
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);
 
@@ -154,6 +332,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	struct ceph_msg *msg;
 	size_t msg_size;
 
+	BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
+	BUG_ON(num_ops > CEPH_OSD_MAX_OP);
+
 	msg_size = 4 + 4 + 8 + 8 + 4+8;
 	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
 	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
@@ -168,13 +349,14 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		req = mempool_alloc(osdc->req_mempool, gfp_flags);
 		memset(req, 0, sizeof(*req));
 	} else {
-		req = kzalloc(sizeof(*req), gfp_flags);
+		req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
 	}
 	if (req == NULL)
 		return NULL;
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
+	req->r_num_ops = num_ops;
 
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
@@ -198,8 +380,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	}
 	req->r_reply = msg;
 
-	ceph_pagelist_init(&req->r_trail);
-
 	/* create request message; allow space for oid */
 	if (use_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -218,60 +398,24 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-static void osd_req_encode_op(struct ceph_osd_request *req,
-			      struct ceph_osd_op *dst,
-			      struct ceph_osd_req_op *src)
+static bool osd_req_opcode_valid(u16 opcode)
 {
-	dst->op = cpu_to_le16(src->op);
-
-	switch (src->op) {
-	case CEPH_OSD_OP_STAT:
-		break;
+	switch (opcode) {
 	case CEPH_OSD_OP_READ:
-	case CEPH_OSD_OP_WRITE:
-		dst->extent.offset =
-			cpu_to_le64(src->extent.offset);
-		dst->extent.length =
-			cpu_to_le64(src->extent.length);
-		dst->extent.truncate_size =
-			cpu_to_le64(src->extent.truncate_size);
-		dst->extent.truncate_seq =
-			cpu_to_le32(src->extent.truncate_seq);
-		break;
-	case CEPH_OSD_OP_CALL:
-		dst->cls.class_len = src->cls.class_len;
-		dst->cls.method_len = src->cls.method_len;
-		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
-
-		ceph_pagelist_append(&req->r_trail, src->cls.class_name,
-				     src->cls.class_len);
-		ceph_pagelist_append(&req->r_trail, src->cls.method_name,
-				     src->cls.method_len);
-		ceph_pagelist_append(&req->r_trail, src->cls.indata,
-				     src->cls.indata_len);
-		break;
-	case CEPH_OSD_OP_STARTSYNC:
-		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
-		dst->watch.ver = cpu_to_le64(src->watch.ver);
-		dst->watch.flag = src->watch.flag;
-		break;
-	default:
-		pr_err("unrecognized osd opcode %d\n", dst->op);
-		WARN_ON(1);
-		break;
+	case CEPH_OSD_OP_STAT:
 	case CEPH_OSD_OP_MAPEXT:
 	case CEPH_OSD_OP_MASKTRUNC:
 	case CEPH_OSD_OP_SPARSE_READ:
 	case CEPH_OSD_OP_NOTIFY:
+	case CEPH_OSD_OP_NOTIFY_ACK:
 	case CEPH_OSD_OP_ASSERT_VER:
+	case CEPH_OSD_OP_WRITE:
 	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_TRUNCATE:
 	case CEPH_OSD_OP_ZERO:
 	case CEPH_OSD_OP_DELETE:
 	case CEPH_OSD_OP_APPEND:
+	case CEPH_OSD_OP_STARTSYNC:
 	case CEPH_OSD_OP_SETTRUNC:
 	case CEPH_OSD_OP_TRIMTRUNC:
 	case CEPH_OSD_OP_TMAPUP:
@@ -279,11 +423,11 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_TMAPGET:
 	case CEPH_OSD_OP_CREATE:
 	case CEPH_OSD_OP_ROLLBACK:
+	case CEPH_OSD_OP_WATCH:
 	case CEPH_OSD_OP_OMAPGETKEYS:
 	case CEPH_OSD_OP_OMAPGETVALS:
 	case CEPH_OSD_OP_OMAPGETHEADER:
 	case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
-	case CEPH_OSD_OP_MODE_RD:
 	case CEPH_OSD_OP_OMAPSETVALS:
 	case CEPH_OSD_OP_OMAPSETHEADER:
 	case CEPH_OSD_OP_OMAPCLEAR:
@@ -314,113 +458,233 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
 	case CEPH_OSD_OP_RDUNLOCK:
 	case CEPH_OSD_OP_UPLOCK:
 	case CEPH_OSD_OP_DNLOCK:
+	case CEPH_OSD_OP_CALL:
 	case CEPH_OSD_OP_PGLS:
 	case CEPH_OSD_OP_PGLS_FILTER:
-		pr_err("unsupported osd opcode %s\n",
-			ceph_osd_op_name(dst->op));
-		WARN_ON(1);
-		break;
+		return true;
+	default:
+		return false;
 	}
-	dst->payload_len = cpu_to_le32(src->payload_len);
 }
 
 /*
- * build new request AND message
- *
+ * This is an osd op init function for opcodes that have no data or
+ * other information associated with them.  It also serves as a
+ * common init routine for all the other init functions, below.
  */
-void ceph_osdc_build_request(struct ceph_osd_request *req,
-			     u64 off, u64 len, unsigned int num_ops,
-			     struct ceph_osd_req_op *src_ops,
-			     struct ceph_snap_context *snapc, u64 snap_id,
-			     struct timespec *mtime)
+static struct ceph_osd_req_op *
+_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
+				u16 opcode)
 {
-	struct ceph_msg *msg = req->r_request;
-	struct ceph_osd_req_op *src_op;
-	void *p;
-	size_t msg_size;
-	int flags = req->r_flags;
-	u64 data_len;
-	int i;
+	struct ceph_osd_req_op *op;
 
-	req->r_num_ops = num_ops;
-	req->r_snapid = snap_id;
-	req->r_snapc = ceph_get_snap_context(snapc);
+	BUG_ON(which >= osd_req->r_num_ops);
+	BUG_ON(!osd_req_opcode_valid(opcode));
 
-	/* encode request */
-	msg->hdr.version = cpu_to_le16(4);
+	op = &osd_req->r_ops[which];
+	memset(op, 0, sizeof (*op));
+	op->op = opcode;
 
-	p = msg->front.iov_base;
-	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
-	req->r_request_osdmap_epoch = p;
-	p += 4;
-	req->r_request_flags = p;
-	p += 4;
-	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
-		ceph_encode_timespec(p, mtime);
-	p += sizeof(struct ceph_timespec);
-	req->r_request_reassert_version = p;
-	p += sizeof(struct ceph_eversion); /* will get filled in */
+	return op;
+}
 
-	/* oloc */
-	ceph_encode_8(&p, 4);
-	ceph_encode_8(&p, 4);
-	ceph_encode_32(&p, 8 + 4 + 4);
-	req->r_request_pool = p;
-	p += 8;
-	ceph_encode_32(&p, -1);  /* preferred */
-	ceph_encode_32(&p, 0);   /* key len */
+void osd_req_op_init(struct ceph_osd_request *osd_req,
+				unsigned int which, u16 opcode)
+{
+	(void)_osd_req_op_init(osd_req, which, opcode);
+}
+EXPORT_SYMBOL(osd_req_op_init);
 
-	ceph_encode_8(&p, 1);
-	req->r_request_pgid = p;
-	p += 8 + 4;
-	ceph_encode_32(&p, -1);  /* preferred */
+void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
+				unsigned int which, u16 opcode,
+				u64 offset, u64 length,
+				u64 truncate_size, u32 truncate_seq)
+{
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	size_t payload_len = 0;
 
-	/* oid */
-	ceph_encode_32(&p, req->r_oid_len);
-	memcpy(p, req->r_oid, req->r_oid_len);
-	dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
-	p += req->r_oid_len;
+	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
 
-	/* ops */
-	ceph_encode_16(&p, num_ops);
-	src_op = src_ops;
-	req->r_request_ops = p;
-	for (i = 0; i < num_ops; i++, src_op++) {
-		osd_req_encode_op(req, p, src_op);
-		p += sizeof(struct ceph_osd_op);
-	}
+	op->extent.offset = offset;
+	op->extent.length = length;
+	op->extent.truncate_size = truncate_size;
+	op->extent.truncate_seq = truncate_seq;
+	if (opcode == CEPH_OSD_OP_WRITE)
+		payload_len += length;
 
-	/* snaps */
-	ceph_encode_64(&p, req->r_snapid);
-	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
-	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
-	if (req->r_snapc) {
-		for (i = 0; i < snapc->num_snaps; i++) {
-			ceph_encode_64(&p, req->r_snapc->snaps[i]);
-		}
+	op->payload_len = payload_len;
+}
+EXPORT_SYMBOL(osd_req_op_extent_init);
+
+void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
+				unsigned int which, u64 length)
+{
+	struct ceph_osd_req_op *op;
+	u64 previous;
+
+	BUG_ON(which >= osd_req->r_num_ops);
+	op = &osd_req->r_ops[which];
+	previous = op->extent.length;
+
+	if (length == previous)
+		return;		/* Nothing to do */
+	BUG_ON(length > previous);
+
+	op->extent.length = length;
+	op->payload_len -= previous - length;
+}
+EXPORT_SYMBOL(osd_req_op_extent_update);
+
+void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
+			u16 opcode, const char *class, const char *method)
+{
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_pagelist *pagelist;
+	size_t payload_len = 0;
+	size_t size;
+
+	BUG_ON(opcode != CEPH_OSD_OP_CALL);
+
+	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+	BUG_ON(!pagelist);
+	ceph_pagelist_init(pagelist);
+
+	op->cls.class_name = class;
+	size = strlen(class);
+	BUG_ON(size > (size_t) U8_MAX);
+	op->cls.class_len = size;
+	ceph_pagelist_append(pagelist, class, size);
+	payload_len += size;
+
+	op->cls.method_name = method;
+	size = strlen(method);
+	BUG_ON(size > (size_t) U8_MAX);
+	op->cls.method_len = size;
+	ceph_pagelist_append(pagelist, method, size);
+	payload_len += size;
+
+	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
+
+	op->cls.argc = 0;	/* currently unused */
+
+	op->payload_len = payload_len;
+}
+EXPORT_SYMBOL(osd_req_op_cls_init);
+
+void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
+				unsigned int which, u16 opcode,
+				u64 cookie, u64 version, int flag)
+{
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+
+	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
+
+	op->watch.cookie = cookie;
+	op->watch.ver = version;
+	if (opcode == CEPH_OSD_OP_WATCH && flag)
+		op->watch.flag = (u8)1;
+}
+EXPORT_SYMBOL(osd_req_op_watch_init);
+
+static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
+				struct ceph_osd_data *osd_data)
+{
+	u64 length = ceph_osd_data_length(osd_data);
+
+	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+		BUG_ON(length > (u64) SIZE_MAX);
+		if (length)
+			ceph_msg_data_add_pages(msg, osd_data->pages,
+					length, osd_data->alignment);
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+		BUG_ON(!length);
+		ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
+#ifdef CONFIG_BLOCK
+	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
+		ceph_msg_data_add_bio(msg, osd_data->bio, length);
+#endif
+	} else {
+		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
 	}
+}
 
-	req->r_request_attempts = p;
-	p += 4;
+static u64 osd_req_encode_op(struct ceph_osd_request *req,
+			      struct ceph_osd_op *dst, unsigned int which)
+{
+	struct ceph_osd_req_op *src;
+	struct ceph_osd_data *osd_data;
+	u64 request_data_len = 0;
+	u64 data_length;
 
-	data_len = req->r_trail.length;
-	if (flags & CEPH_OSD_FLAG_WRITE) {
-		req->r_request->hdr.data_off = cpu_to_le16(off);
-		data_len += len;
+	BUG_ON(which >= req->r_num_ops);
+	src = &req->r_ops[which];
+	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
+		pr_err("unrecognized osd opcode %d\n", src->op);
+
+		return 0;
 	}
-	req->r_request->hdr.data_len = cpu_to_le32(data_len);
-	req->r_request->page_alignment = req->r_page_alignment;
 
-	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-	msg_size = p - msg->front.iov_base;
-	msg->front.iov_len = msg_size;
-	msg->hdr.front_len = cpu_to_le32(msg_size);
+	switch (src->op) {
+	case CEPH_OSD_OP_STAT:
+		osd_data = &src->raw_data_in;
+		ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		break;
+	case CEPH_OSD_OP_READ:
+	case CEPH_OSD_OP_WRITE:
+		if (src->op == CEPH_OSD_OP_WRITE)
+			request_data_len = src->extent.length;
+		dst->extent.offset = cpu_to_le64(src->extent.offset);
+		dst->extent.length = cpu_to_le64(src->extent.length);
+		dst->extent.truncate_size =
+			cpu_to_le64(src->extent.truncate_size);
+		dst->extent.truncate_seq =
+			cpu_to_le32(src->extent.truncate_seq);
+		osd_data = &src->extent.osd_data;
+		if (src->op == CEPH_OSD_OP_WRITE)
+			ceph_osdc_msg_data_add(req->r_request, osd_data);
+		else
+			ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		break;
+	case CEPH_OSD_OP_CALL:
+		dst->cls.class_len = src->cls.class_len;
+		dst->cls.method_len = src->cls.method_len;
+		osd_data = &src->cls.request_info;
+		ceph_osdc_msg_data_add(req->r_request, osd_data);
+		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
+		request_data_len = osd_data->pagelist->length;
+
+		osd_data = &src->cls.request_data;
+		data_length = ceph_osd_data_length(osd_data);
+		if (data_length) {
+			BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
+			dst->cls.indata_len = cpu_to_le32(data_length);
+			ceph_osdc_msg_data_add(req->r_request, osd_data);
+			src->payload_len += data_length;
+			request_data_len += data_length;
+		}
+		osd_data = &src->cls.response_data;
+		ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		break;
+	case CEPH_OSD_OP_STARTSYNC:
+		break;
+	case CEPH_OSD_OP_NOTIFY_ACK:
+	case CEPH_OSD_OP_WATCH:
+		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
+		dst->watch.ver = cpu_to_le64(src->watch.ver);
+		dst->watch.flag = src->watch.flag;
+		break;
+	default:
+		pr_err("unsupported osd opcode %s\n",
+			ceph_osd_op_name(src->op));
+		WARN_ON(1);
 
-	dout("build_request msg_size was %d num_ops %d\n", (int)msg_size,
-	     num_ops);
-	return;
+		return 0;
+	}
+	dst->op = cpu_to_le16(src->op);
+	dst->payload_len = cpu_to_le32(src->payload_len);
+
+	return request_data_len;
 }
-EXPORT_SYMBOL(ceph_osdc_build_request);
 
 /*
  * build new request AND message, calculate layout, and adjust file
@@ -436,51 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request);
 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       struct ceph_file_layout *layout,
 					       struct ceph_vino vino,
-					       u64 off, u64 *plen,
+					       u64 off, u64 *plen, int num_ops,
 					       int opcode, int flags,
 					       struct ceph_snap_context *snapc,
-					       int do_sync,
 					       u32 truncate_seq,
 					       u64 truncate_size,
-					       struct timespec *mtime,
-					       bool use_mempool,
-					       int page_align)
+					       bool use_mempool)
 {
-	struct ceph_osd_req_op ops[2];
 	struct ceph_osd_request *req;
-	unsigned int num_op = 1;
+	u64 objnum = 0;
+	u64 objoff = 0;
+	u64 objlen = 0;
+	u32 object_size;
+	u64 object_base;
 	int r;
 
-	memset(&ops, 0, sizeof ops);
-
-	ops[0].op = opcode;
-	ops[0].extent.truncate_seq = truncate_seq;
-	ops[0].extent.truncate_size = truncate_size;
-
-	if (do_sync) {
-		ops[1].op = CEPH_OSD_OP_STARTSYNC;
-		num_op++;
-	}
+	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
 
-	req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
+	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
 					GFP_NOFS);
 	if (!req)
 		return ERR_PTR(-ENOMEM);
+
 	req->r_flags = flags;
 
 	/* calculate max write size */
-	r = calc_layout(vino, layout, off, plen, req, ops);
-	if (r < 0)
+	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
+	if (r < 0) {
+		ceph_osdc_put_request(req);
 		return ERR_PTR(r);
-	req->r_file_layout = *layout;  /* keep a copy */
+	}
 
-	/* in case it differs from natural (file) alignment that
-	   calc_layout filled in for us */
-	req->r_num_pages = calc_pages_for(page_align, *plen);
-	req->r_page_alignment = page_align;
+	object_size = le32_to_cpu(layout->fl_object_size);
+	object_base = off - objoff;
+	if (truncate_size <= object_base) {
+		truncate_size = 0;
+	} else {
+		truncate_size -= object_base;
+		if (truncate_size > object_size)
+			truncate_size = object_size;
+	}
+
+	osd_req_op_extent_init(req, 0, opcode, objoff, objlen,
+				truncate_size, truncate_seq);
+
+	/*
+	 * A second op in the ops array means the caller wants to
+	 * also issue a include a 'startsync' command so that the
+	 * osd will flush data quickly.
+	 */
+	if (num_ops > 1)
+		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+
+	req->r_file_layout = *layout;  /* keep a copy */
 
-	ceph_osdc_build_request(req, off, *plen, num_op, ops,
-				snapc, vino.snap, mtime);
+	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
+		vino.ino, objnum);
+	req->r_oid_len = strlen(req->r_oid);
 
 	return req;
 }
@@ -558,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 				struct ceph_osd *osd)
 {
 	struct ceph_osd_request *req, *nreq;
+	LIST_HEAD(resend);
 	int err;
 
 	dout("__kick_osd_requests osd%d\n", osd->o_osd);
 	err = __reset_osd(osdc, osd);
 	if (err)
 		return;
-
+	/*
+	 * Build up a list of requests to resend by traversing the
+	 * osd's list of requests.  Requests for a given object are
+	 * sent in tid order, and that is also the order they're
+	 * kept on this list.  Therefore all requests that are in
+	 * flight will be found first, followed by all requests that
+	 * have not yet been sent.  And to resend requests while
+	 * preserving this order we will want to put any sent
+	 * requests back on the front of the osd client's unsent
+	 * list.
+	 *
+	 * So we build a separate ordered list of already-sent
+	 * requests for the affected osd and splice it onto the
+	 * front of the osd client's unsent list.  Once we've seen a
+	 * request that has not yet been sent we're done.  Those
+	 * requests are already sitting right where they belong.
+	 */
 	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
-		list_move(&req->r_req_lru_item, &osdc->req_unsent);
-		dout("requeued %p tid %llu osd%d\n", req, req->r_tid,
+		if (!req->r_sent)
+			break;
+		list_move_tail(&req->r_req_lru_item, &resend);
+		dout("requeueing %p tid %llu osd%d\n", req, req->r_tid,
 		     osd->o_osd);
 		if (!req->r_linger)
 			req->r_flags |= CEPH_OSD_FLAG_RETRY;
 	}
+	list_splice(&resend, &osdc->req_unsent);
 
+	/*
+	 * Linger requests are re-registered before sending, which
+	 * sets up a new tid for each.  We add them to the unsent
+	 * list at the end to keep things in tid order.
+	 */
 	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
 				 r_linger_osd) {
 		/*
@@ -581,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
 		 */
 		BUG_ON(!list_empty(&req->r_req_lru_item));
 		__register_request(osdc, req);
-		list_add(&req->r_req_lru_item, &osdc->req_unsent);
-		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+		list_add_tail(&req->r_req_lru_item, &osdc->req_unsent);
+		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
 		__unregister_linger_request(osdc, req);
 		dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
 		     osd->o_osd);
@@ -654,8 +955,7 @@ static void put_osd(struct ceph_osd *osd)
 	if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
 		struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
 
-		if (ac->ops && ac->ops->destroy_authorizer)
-			ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
+		ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
 		kfree(osd);
 	}
 }
@@ -820,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc,
 	}
 }
 
-static void register_request(struct ceph_osd_client *osdc,
-			     struct ceph_osd_request *req)
-{
-	mutex_lock(&osdc->request_mutex);
-	__register_request(osdc, req);
-	mutex_unlock(&osdc->request_mutex);
-}
-
 /*
  * called under osdc->request_mutex
  */
@@ -912,6 +1204,7 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
 	mutex_lock(&osdc->request_mutex);
 	if (req->r_linger) {
 		__unregister_linger_request(osdc, req);
+		req->r_linger = 0;
 		ceph_osdc_put_request(req);
 	}
 	mutex_unlock(&osdc->request_mutex);
@@ -952,8 +1245,8 @@ static int __map_request(struct ceph_osd_client *osdc,
 	int err;
 
 	dout("map_request %p tid %lld\n", req, req->r_tid);
-	err = ceph_calc_object_layout(&pgid, req->r_oid,
-				      &req->r_file_layout, osdc->osdmap);
+	err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
+				ceph_file_layout_pg_pool(req->r_file_layout));
 	if (err) {
 		list_move(&req->r_req_lru_item, &osdc->req_notarget);
 		return err;
@@ -1007,10 +1300,10 @@ static int __map_request(struct ceph_osd_client *osdc,
 
 	if (req->r_osd) {
 		__remove_osd_from_lru(req->r_osd);
-		list_add(&req->r_osd_item, &req->r_osd->o_requests);
-		list_move(&req->r_req_lru_item, &osdc->req_unsent);
+		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
+		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
 	} else {
-		list_move(&req->r_req_lru_item, &osdc->req_notarget);
+		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
 	}
 	err = 1;   /* osd or pg changed */
 
@@ -1045,8 +1338,14 @@ static void __send_request(struct ceph_osd_client *osdc,
 	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
-	ceph_con_send(&req->r_osd->o_con, req->r_request);
+
+	/* Mark the request unsafe if this is the first timet's being sent. */
+
+	if (!req->r_sent && req->r_unsafe_callback)
+		req->r_unsafe_callback(req, true);
 	req->r_sent = req->r_osd->o_incarnation;
+
+	ceph_con_send(&req->r_osd->o_con, req->r_request);
 }
 
 /*
@@ -1134,31 +1433,11 @@ static void handle_osds_timeout(struct work_struct *work)
 
 static void complete_request(struct ceph_osd_request *req)
 {
-	if (req->r_safe_callback)
-		req->r_safe_callback(req, NULL);
+	if (req->r_unsafe_callback)
+		req->r_unsafe_callback(req, false);
 	complete_all(&req->r_safe_completion);  /* fsync waiter */
 }
 
-static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
-{
-	__u8 v;
-
-	ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
-	v = ceph_decode_8(p);
-	if (v > 1) {
-		pr_warning("do not understand pg encoding %d > 1", v);
-		return -EINVAL;
-	}
-	pgid->pool = ceph_decode_64(p);
-	pgid->seed = ceph_decode_32(p);
-	*p += 4;
-	return 0;
-
-bad:
-	pr_warning("incomplete pg encoding");
-	return -EINVAL;
-}
-
 /*
  * handle osd op reply.  either call the callback if it is specified,
  * or do the completion to wake up the waiting thread.
@@ -1170,7 +1449,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	struct ceph_osd_request *req;
 	u64 tid;
 	int object_len;
-	int numops, payload_len, flags;
+	unsigned int numops;
+	int payload_len, flags;
 	s32 result;
 	s32 retry_attempt;
 	struct ceph_pg pg;
@@ -1178,7 +1458,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	u32 reassert_epoch;
 	u64 reassert_version;
 	u32 osdmap_epoch;
-	int i;
+	int already_completed;
+	u32 bytes;
+	unsigned int i;
 
 	tid = le64_to_cpu(msg->hdr.tid);
 	dout("handle_reply %p tid %llu\n", msg, tid);
@@ -1191,7 +1473,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	ceph_decode_need(&p, end, object_len, bad);
 	p += object_len;
 
-	err = __decode_pgid(&p, end, &pg);
+	err = ceph_decode_pgid(&p, end, &pg);
 	if (err)
 		goto bad;
 
@@ -1207,8 +1489,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	req = __lookup_request(osdc, tid);
 	if (req == NULL) {
 		dout("handle_reply tid %llu dne\n", tid);
-		mutex_unlock(&osdc->request_mutex);
-		return;
+		goto bad_mutex;
 	}
 	ceph_osdc_get_request(req);
 
@@ -1233,9 +1514,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 		payload_len += len;
 		p += sizeof(*op);
 	}
-	if (payload_len != le32_to_cpu(msg->hdr.data_len)) {
+	bytes = le32_to_cpu(msg->hdr.data_len);
+	if (payload_len != bytes) {
 		pr_warning("sum of op payload lens %d != data_len %d",
-			   payload_len, le32_to_cpu(msg->hdr.data_len));
+			   payload_len, bytes);
 		goto bad_put;
 	}
 
@@ -1244,21 +1526,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	for (i = 0; i < numops; i++)
 		req->r_reply_op_result[i] = ceph_decode_32(&p);
 
-	/*
-	 * if this connection filled our message, drop our reference now, to
-	 * avoid a (safe but slower) revoke later.
-	 */
-	if (req->r_con_filling_msg == con && req->r_reply == msg) {
-		dout(" dropping con_filling_msg ref %p\n", con);
-		req->r_con_filling_msg = NULL;
-		con->ops->put(con);
-	}
-
 	if (!req->r_got_reply) {
-		unsigned int bytes;
 
 		req->r_result = result;
-		bytes = le32_to_cpu(msg->hdr.data_len);
 		dout("handle_reply result %d bytes %d\n", req->r_result,
 		     bytes);
 		if (req->r_result == 0)
@@ -1286,7 +1556,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
 		__unregister_request(osdc, req);
 
+	already_completed = req->r_completed;
+	req->r_completed = 1;
 	mutex_unlock(&osdc->request_mutex);
+	if (already_completed)
+		goto done;
 
 	if (req->r_callback)
 		req->r_callback(req, msg);
@@ -1303,6 +1577,8 @@ done:
 
 bad_put:
 	ceph_osdc_put_request(req);
+bad_mutex:
+	mutex_unlock(&osdc->request_mutex);
 bad:
 	pr_err("corrupt osd_op_reply got %d %d\n",
 	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
@@ -1736,6 +2012,104 @@ bad:
 }
 
 /*
+ * build new request AND message
+ *
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
+				struct ceph_snap_context *snapc, u64 snap_id,
+				struct timespec *mtime)
+{
+	struct ceph_msg *msg = req->r_request;
+	void *p;
+	size_t msg_size;
+	int flags = req->r_flags;
+	u64 data_len;
+	unsigned int i;
+
+	req->r_snapid = snap_id;
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	/* encode request */
+	msg->hdr.version = cpu_to_le16(4);
+
+	p = msg->front.iov_base;
+	ceph_encode_32(&p, 1);   /* client_inc  is always 1 */
+	req->r_request_osdmap_epoch = p;
+	p += 4;
+	req->r_request_flags = p;
+	p += 4;
+	if (req->r_flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(p, mtime);
+	p += sizeof(struct ceph_timespec);
+	req->r_request_reassert_version = p;
+	p += sizeof(struct ceph_eversion); /* will get filled in */
+
+	/* oloc */
+	ceph_encode_8(&p, 4);
+	ceph_encode_8(&p, 4);
+	ceph_encode_32(&p, 8 + 4 + 4);
+	req->r_request_pool = p;
+	p += 8;
+	ceph_encode_32(&p, -1);  /* preferred */
+	ceph_encode_32(&p, 0);   /* key len */
+
+	ceph_encode_8(&p, 1);
+	req->r_request_pgid = p;
+	p += 8 + 4;
+	ceph_encode_32(&p, -1);  /* preferred */
+
+	/* oid */
+	ceph_encode_32(&p, req->r_oid_len);
+	memcpy(p, req->r_oid, req->r_oid_len);
+	dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
+	p += req->r_oid_len;
+
+	/* ops--can imply data */
+	ceph_encode_16(&p, (u16)req->r_num_ops);
+	data_len = 0;
+	for (i = 0; i < req->r_num_ops; i++) {
+		data_len += osd_req_encode_op(req, p, i);
+		p += sizeof(struct ceph_osd_op);
+	}
+
+	/* snaps */
+	ceph_encode_64(&p, req->r_snapid);
+	ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
+	ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
+	if (req->r_snapc) {
+		for (i = 0; i < snapc->num_snaps; i++) {
+			ceph_encode_64(&p, req->r_snapc->snaps[i]);
+		}
+	}
+
+	req->r_request_attempts = p;
+	p += 4;
+
+	/* data */
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		u16 data_off;
+
+		/*
+		 * The header "data_off" is a hint to the receiver
+		 * allowing it to align received data into its
+		 * buffers such that there's no need to re-copy
+		 * it before writing it to disk (direct I/O).
+		 */
+		data_off = (u16) (off & 0xffff);
+		req->r_request->hdr.data_off = cpu_to_le16(data_off);
+	}
+	req->r_request->hdr.data_len = cpu_to_le32(data_len);
+
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
+
+	dout("build_request msg_size was %d\n", (int)msg_size);
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
  * Register request, send initial attempt.
  */
 int ceph_osdc_start_request(struct ceph_osd_client *osdc,
@@ -1744,41 +2118,28 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 {
 	int rc = 0;
 
-	req->r_request->pages = req->r_pages;
-	req->r_request->nr_pages = req->r_num_pages;
-#ifdef CONFIG_BLOCK
-	req->r_request->bio = req->r_bio;
-#endif
-	req->r_request->trail = &req->r_trail;
-
-	register_request(osdc, req);
-
 	down_read(&osdc->map_sem);
 	mutex_lock(&osdc->request_mutex);
-	/*
-	 * a racing kick_requests() may have sent the message for us
-	 * while we dropped request_mutex above, so only send now if
-	 * the request still han't been touched yet.
-	 */
-	if (req->r_sent == 0) {
-		rc = __map_request(osdc, req, 0);
-		if (rc < 0) {
-			if (nofail) {
-				dout("osdc_start_request failed map, "
-				     " will retry %lld\n", req->r_tid);
-				rc = 0;
-			}
-			goto out_unlock;
-		}
-		if (req->r_osd == NULL) {
-			dout("send_request %p no up osds in pg\n", req);
-			ceph_monc_request_next_osdmap(&osdc->client->monc);
-		} else {
-			__send_request(osdc, req);
+	__register_request(osdc, req);
+	req->r_sent = 0;
+	req->r_got_reply = 0;
+	req->r_completed = 0;
+	rc = __map_request(osdc, req, 0);
+	if (rc < 0) {
+		if (nofail) {
+			dout("osdc_start_request failed map, "
+				" will retry %lld\n", req->r_tid);
+			rc = 0;
 		}
-		rc = 0;
+		goto out_unlock;
 	}
-
+	if (req->r_osd == NULL) {
+		dout("send_request %p no up osds in pg\n", req);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	} else {
+		__send_queued(osdc);
+	}
+	rc = 0;
 out_unlock:
 	mutex_unlock(&osdc->request_mutex);
 	up_read(&osdc->map_sem);
@@ -1940,18 +2301,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 
 	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
 	     vino.snap, off, *plen);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-				    NULL, 0, truncate_seq, truncate_size, NULL,
-				    false, page_align);
+				    NULL, truncate_seq, truncate_size,
+				    false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
 	/* it may be a short read due to an object boundary */
-	req->r_pages = pages;
 
-	dout("readpages  final extent is %llu~%llu (%d pages align %d)\n",
-	     off, *plen, req->r_num_pages, page_align);
+	osd_req_op_extent_osd_data_pages(req, 0,
+				pages, *plen, page_align, false, false);
+
+	dout("readpages  final extent is %llu~%llu (%llu bytes align %d)\n",
+	     off, *plen, *plen, page_align);
+
+	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
 
 	rc = ceph_osdc_start_request(osdc, req, false);
 	if (!rc)
@@ -1978,20 +2343,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 	int rc = 0;
 	int page_align = off & ~PAGE_MASK;
 
-	BUG_ON(vino.snap != CEPH_NOSNAP);
-	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+	BUG_ON(vino.snap != CEPH_NOSNAP);	/* snapshots aren't writeable */
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1,
 				    CEPH_OSD_OP_WRITE,
 				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
-				    snapc, 0,
-				    truncate_seq, truncate_size, mtime,
-				    true, page_align);
+				    snapc, truncate_seq, truncate_size,
+				    true);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
 	/* it may be a short write due to an object boundary */
-	req->r_pages = pages;
-	dout("writepages %llu~%llu (%d pages)\n", off, len,
-	     req->r_num_pages);
+	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
+				false, false);
+	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
+
+	ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
 
 	rc = ceph_osdc_start_request(osdc, req, true);
 	if (!rc)
@@ -2005,6 +2371,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 }
 EXPORT_SYMBOL(ceph_osdc_writepages);
 
+int ceph_osdc_setup(void)
+{
+	BUG_ON(ceph_osd_request_cache);
+	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
+					sizeof (struct ceph_osd_request),
+					__alignof__(struct ceph_osd_request),
+					0, NULL);
+
+	return ceph_osd_request_cache ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(ceph_osdc_setup);
+
+void ceph_osdc_cleanup(void)
+{
+	BUG_ON(!ceph_osd_request_cache);
+	kmem_cache_destroy(ceph_osd_request_cache);
+	ceph_osd_request_cache = NULL;
+}
+EXPORT_SYMBOL(ceph_osdc_cleanup);
+
 /*
  * handle incoming message
  */
@@ -2064,13 +2450,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		goto out;
 	}
 
-	if (req->r_con_filling_msg) {
+	if (req->r_reply->con)
 		dout("%s revoking msg %p from old con %p\n", __func__,
-		     req->r_reply, req->r_con_filling_msg);
-		ceph_msg_revoke_incoming(req->r_reply);
-		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
-		req->r_con_filling_msg = NULL;
-	}
+		     req->r_reply, req->r_reply->con);
+	ceph_msg_revoke_incoming(req->r_reply);
 
 	if (front > req->r_reply->front.iov_len) {
 		pr_warning("get_reply front %d > preallocated %d\n",
@@ -2084,26 +2467,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	m = ceph_msg_get(req->r_reply);
 
 	if (data_len > 0) {
-		int want = calc_pages_for(req->r_page_alignment, data_len);
-
-		if (req->r_pages && unlikely(req->r_num_pages < want)) {
-			pr_warning("tid %lld reply has %d bytes %d pages, we"
-				   " had only %d pages ready\n", tid, data_len,
-				   want, req->r_num_pages);
-			*skip = 1;
-			ceph_msg_put(m);
-			m = NULL;
-			goto out;
+		struct ceph_osd_data *osd_data;
+
+		/*
+		 * XXX This is assuming there is only one op containing
+		 * XXX page data.  Probably OK for reads, but this
+		 * XXX ought to be done more generally.
+		 */
+		osd_data = osd_req_op_extent_osd_data(req, 0);
+		if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+			if (osd_data->pages &&
+				unlikely(osd_data->length < data_len)) {
+
+				pr_warning("tid %lld reply has %d bytes "
+					"we had only %llu bytes ready\n",
+					tid, data_len, osd_data->length);
+				*skip = 1;
+				ceph_msg_put(m);
+				m = NULL;
+				goto out;
+			}
 		}
-		m->pages = req->r_pages;
-		m->nr_pages = req->r_num_pages;
-		m->page_alignment = req->r_page_alignment;
-#ifdef CONFIG_BLOCK
-		m->bio = req->r_bio;
-#endif
 	}
 	*skip = 0;
-	req->r_con_filling_msg = con->ops->get(con);
 	dout("get_reply tid %lld %p\n", tid, m);
 
 out:
@@ -2168,13 +2554,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_auth_handshake *auth = &o->o_auth;
 
 	if (force_new && auth->authorizer) {
-		if (ac->ops && ac->ops->destroy_authorizer)
-			ac->ops->destroy_authorizer(ac, auth->authorizer);
+		ceph_auth_destroy_authorizer(ac, auth->authorizer);
 		auth->authorizer = NULL;
 	}
-	if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
-		int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
-							auth);
+	if (!auth->authorizer) {
+		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+						      auth);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
+						     auth);
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -2190,11 +2580,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 
-	/*
-	 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
-	 * XXX which do we do:  succeed or fail?
-	 */
-	return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
+	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)
@@ -2203,9 +2589,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 
-	if (ac->ops && ac->ops->invalidate_authorizer)
-		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-
+	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
 	return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4543b9aba40..603ddd92db1 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 	return 0;
 }
 
-static int __decode_pgid(void **p, void *end, struct ceph_pg *pg)
-{
-	u8 v;
-
-	ceph_decode_need(p, end, 1+8+4+4, bad);
-	v = ceph_decode_8(p);
-	if (v != 1)
-		goto bad;
-	pg->pool = ceph_decode_64(p);
-	pg->seed = ceph_decode_32(p);
-	*p += 4; /* skip preferred */
-	return 0;
-
-bad:
-	dout("error decoding pgid\n");
-	return -EINVAL;
-}
-
 /*
  * decode a full map.
  */
@@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		struct ceph_pg pgid;
 		struct ceph_pg_mapping *pg;
 
-		err = __decode_pgid(p, end, &pgid);
+		err = ceph_decode_pgid(p, end, &pgid);
 		if (err)
 			goto bad;
 		ceph_decode_need(p, end, sizeof(u32), bad);
@@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		struct ceph_pg pgid;
 		u32 pglen;
 
-		err = __decode_pgid(p, end, &pgid);
+		err = ceph_decode_pgid(p, end, &pgid);
 		if (err)
 			goto bad;
 		ceph_decode_need(p, end, sizeof(u32), bad);
@@ -1111,27 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  * calculate an object layout (i.e. pgid) from an oid,
  * file_layout, and osdmap
  */
-int ceph_calc_object_layout(struct ceph_pg *pg,
-			    const char *oid,
-			    struct ceph_file_layout *fl,
-			    struct ceph_osdmap *osdmap)
+int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
+			struct ceph_osdmap *osdmap, uint64_t pool)
 {
-	unsigned int num, num_mask;
-	struct ceph_pg_pool_info *pool;
+	struct ceph_pg_pool_info *pool_info;
 
 	BUG_ON(!osdmap);
-	pg->pool = le32_to_cpu(fl->fl_pg_pool);
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
-	if (!pool)
+	pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
+	if (!pool_info)
 		return -EIO;
-	pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
-	num = pool->pg_num;
-	num_mask = pool->pg_num_mask;
+	pg->pool = pool;
+	pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
 
-	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
+	dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
 	return 0;
 }
-EXPORT_SYMBOL(ceph_calc_object_layout);
+EXPORT_SYMBOL(ceph_calc_ceph_pg);
 
 /*
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
new file mode 100644
index 00000000000..154683f5f14
--- /dev/null
+++ b/net/ceph/snapshot.c
@@ -0,0 +1,78 @@
+/*
+ * snapshot.c    Ceph snapshot context utility routines (part of libceph)
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+#include <stddef.h>
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/ceph/libceph.h>
+
+/*
+ * Ceph snapshot contexts are reference counted objects, and the
+ * returned structure holds a single reference.  Acquire additional
+ * references with ceph_get_snap_context(), and release them with
+ * ceph_put_snap_context().  When the reference count reaches zero
+ * the entire structure is freed.
+ */
+
+/*
+ * Create a new ceph snapshot context large enough to hold the
+ * indicated number of snapshot ids (which can be 0).  Caller has
+ * to fill in snapc->seq and snapc->snaps[0..snap_count-1].
+ *
+ * Returns a null pointer if an error occurs.
+ */
+struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
+						gfp_t gfp_flags)
+{
+	struct ceph_snap_context *snapc;
+	size_t size;
+
+	size = sizeof (struct ceph_snap_context);
+	size += snap_count * sizeof (snapc->snaps[0]);
+	snapc = kzalloc(size, gfp_flags);
+	if (!snapc)
+		return NULL;
+
+	atomic_set(&snapc->nref, 1);
+	snapc->num_snaps = snap_count;
+
+	return snapc;
+}
+EXPORT_SYMBOL(ceph_create_snap_context);
+
+struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+	if (sc)
+		atomic_inc(&sc->nref);
+	return sc;
+}
+EXPORT_SYMBOL(ceph_get_snap_context);
+
+void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+	if (!sc)
+		return;
+	if (atomic_dec_and_test(&sc->nref)) {
+		/*printk(" deleting snap_context %p\n", sc);*/
+		kfree(sc);
+	}
+}
+EXPORT_SYMBOL(ceph_put_snap_context);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4040673f806..fc1e289397f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2213,6 +2213,17 @@ __be16 skb_network_protocol(struct sk_buff *skb)
 	__be16 type = skb->protocol;
 	int vlan_depth = ETH_HLEN;
 
+	/* Tunnel gso handlers can set protocol to ethernet. */
+	if (type == htons(ETH_P_TEB)) {
+		struct ethhdr *eth;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+			return 0;
+
+		eth = (struct ethhdr *)skb_mac_header(skb);
+		type = eth->h_proto;
+	}
+
 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
 		struct vlan_hdr *vh;
 
@@ -2456,7 +2467,7 @@ EXPORT_SYMBOL(netif_skb_features);
  *	2. skb is fragmented and the device does not support SG.
  */
 static inline int skb_needs_linearize(struct sk_buff *skb,
-				      int features)
+				      netdev_features_t features)
 {
 	return skb_is_nonlinear(skb) &&
 			((skb_has_frag_list(skb) &&
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 5a934ef90f8..22efdaa76eb 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1421,7 +1421,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd;
 	int rc;
-	u32 old_features;
+	netdev_features_t old_features;
 
 	if (!dev || !netif_device_present(dev))
 		return -ENODEV;
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 7e7aeb01de4..de178e46268 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -75,31 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
 
 /*
  *	Copy kernel to iovec. Returns -EFAULT on error.
- *
- *	Note: this modifies the original iovec.
- */
-
-int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
-{
-	while (len > 0) {
-		if (iov->iov_len) {
-			int copy = min_t(unsigned int, iov->iov_len, len);
-			if (copy_to_user(iov->iov_base, kdata, copy))
-				return -EFAULT;
-			kdata += copy;
-			len -= copy;
-			iov->iov_len -= copy;
-			iov->iov_base += copy;
-		}
-		iov++;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_toiovec);
-
-/*
- *	Copy kernel to iovec. Returns -EFAULT on error.
  */
 
 int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
@@ -125,31 +100,6 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
 EXPORT_SYMBOL(memcpy_toiovecend);
 
 /*
- *	Copy iovec to kernel. Returns -EFAULT on error.
- *
- *	Note: this modifies the original iovec.
- */
-
-int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
-{
-	while (len > 0) {
-		if (iov->iov_len) {
-			int copy = min_t(unsigned int, len, iov->iov_len);
-			if (copy_from_user(kdata, iov->iov_base, copy))
-				return -EFAULT;
-			len -= copy;
-			kdata += copy;
-			iov->iov_base += copy;
-			iov->iov_len -= copy;
-		}
-		iov++;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(memcpy_fromiovec);
-
-/*
  *	Copy iovec from kernel. Returns -EFAULT on error.
  */
 
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7427ab5e27d..981fed397d1 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -606,21 +606,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 	return sprintf(buf, "%lu\n", val);
 }
 
-static void rps_dev_flow_table_release_work(struct work_struct *work)
-{
-	struct rps_dev_flow_table *table = container_of(work,
-	    struct rps_dev_flow_table, free_work);
-
-	vfree(table);
-}
-
 static void rps_dev_flow_table_release(struct rcu_head *rcu)
 {
 	struct rps_dev_flow_table *table = container_of(rcu,
 	    struct rps_dev_flow_table, rcu);
-
-	INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
-	schedule_work(&table->free_work);
+	vfree(table);
 }
 
 static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a5802a8b652..cec074be8c4 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -206,7 +206,7 @@ static void netpoll_poll_dev(struct net_device *dev)
 	 * the dev_open/close paths use this to block netpoll activity
 	 * while changing device state
 	 */
-	if (!down_trylock(&ni->dev_lock))
+	if (down_trylock(&ni->dev_lock))
 		return;
 
 	if (!netif_running(dev)) {
diff --git a/net/core/sock.c b/net/core/sock.c
index d4f4cea726e..6ba327da79e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1217,18 +1217,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
 #endif
 }
 
-/*
- * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
- * un-modified. Special care is taken when initializing object to zero.
- */
-static inline void sk_prot_clear_nulls(struct sock *sk, int size)
-{
-	if (offsetof(struct sock, sk_node.next) != 0)
-		memset(sk, 0, offsetof(struct sock, sk_node.next));
-	memset(&sk->sk_node.pprev, 0,
-	       size - offsetof(struct sock, sk_node.pprev));
-}
-
 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
 {
 	unsigned long nulls1, nulls2;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c61b3bb87a1..d01be2a3ae5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1293,6 +1293,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
+		       SKB_GSO_TCPV6 |
 		       SKB_GSO_UDP_TUNNEL |
 		       0)))
 		goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ff06b7543d9..49616fed934 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -125,7 +125,6 @@ struct tnode {
 	unsigned int empty_children;	/* KEYLENGTH bits needed */
 	union {
 		struct rcu_head rcu;
-		struct work_struct work;
 		struct tnode *tnode_free;
 	};
 	struct rt_trie_node __rcu *child[0];
@@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size)
 		return vzalloc(size);
 }
 
-static void __tnode_vfree(struct work_struct *arg)
-{
-	struct tnode *tn = container_of(arg, struct tnode, work);
-	vfree(tn);
-}
-
 static void __tnode_free_rcu(struct rcu_head *head)
 {
 	struct tnode *tn = container_of(head, struct tnode, rcu);
@@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head)
 
 	if (size <= PAGE_SIZE)
 		kfree(tn);
-	else {
-		INIT_WORK(&tn->work, __tnode_vfree);
-		schedule_work(&tn->work);
-	}
+	else
+		vfree(tn);
 }
 
 static inline void tnode_free(struct tnode *tn)
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index d2d5a99fba0..b2e805af9b8 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -121,6 +121,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	int ghl = GRE_HEADER_SECTION;
 	struct gre_base_hdr *greh;
 	int mac_len = skb->mac_len;
+	__be16 protocol = skb->protocol;
 	int tnl_hlen;
 	bool csum;
 
@@ -149,13 +150,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		csum = false;
 
 	/* setup inner skb. */
-	if (greh->protocol == htons(ETH_P_TEB)) {
-		struct ethhdr *eth = eth_hdr(skb);
-		skb->protocol = eth->h_proto;
-	} else {
-		skb->protocol = greh->protocol;
-	}
-
+	skb->protocol = greh->protocol;
 	skb->encapsulation = 0;
 
 	if (unlikely(!pskb_may_pull(skb, ghl)))
@@ -199,6 +194,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		skb_reset_mac_header(skb);
 		skb_set_network_header(skb, mac_len);
 		skb->mac_len = mac_len;
+		skb->protocol = protocol;
 	} while ((skb = skb->next));
 out:
 	return segs;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e97d66a1fdd..7e06641e36a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -305,6 +305,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
 	atomic_set(&q->refcnt, 1);
+	INIT_LIST_HEAD(&q->lru_list);
 
 	return q;
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 147abf5275a..4bcabf3ab4c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
 EXPORT_SYMBOL(sysctl_ip_default_ttl);
 
 /* Generate a checksum for an outgoing IP datagram. */
-__inline__ void ip_send_check(struct iphdr *iph)
+void ip_send_check(struct iphdr *iph)
 {
 	iph->check = 0;
 	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f696d7c2e9f..f6a005c485a 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -96,7 +96,8 @@ struct tcpm_hash_bucket {
 
 static DEFINE_SPINLOCK(tcp_metrics_lock);
 
-static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
+static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
+			  bool fastopen_clear)
 {
 	u32 val;
 
@@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
 	tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
 	tm->tcpm_ts = 0;
 	tm->tcpm_ts_stamp = 0;
-	tm->tcpm_fastopen.mss = 0;
-	tm->tcpm_fastopen.syn_loss = 0;
-	tm->tcpm_fastopen.cookie.len = 0;
+	if (fastopen_clear) {
+		tm->tcpm_fastopen.mss = 0;
+		tm->tcpm_fastopen.syn_loss = 0;
+		tm->tcpm_fastopen.cookie.len = 0;
+	}
 }
 
 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
 	}
 	tm->tcpm_addr = *addr;
 
-	tcpm_suck_dst(tm, dst);
+	tcpm_suck_dst(tm, dst, true);
 
 	if (likely(!reclaim)) {
 		tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
@@ -171,7 +174,7 @@ out_unlock:
 static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
 {
 	if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
-		tcpm_suck_dst(tm, dst);
+		tcpm_suck_dst(tm, dst, false);
 }
 
 #define TCP_METRICS_RECLAIM_DEPTH	5
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6abbe645512..0bf5d399a03 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2311,8 +2311,9 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	int mac_len = skb->mac_len;
 	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	int outer_hlen;
+	__be16 protocol = skb->protocol;
 	netdev_features_t enc_features;
+	int outer_hlen;
 
 	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
 		goto out;
@@ -2322,6 +2323,7 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb_inner_network_offset(skb));
 	skb->mac_len = skb_inner_network_offset(skb);
+	skb->protocol = htons(ETH_P_TEB);
 
 	/* segment inner packet. */
 	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
@@ -2358,6 +2360,7 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 
 		}
 		skb->ip_summed = CHECKSUM_NONE;
+		skb->protocol = protocol;
 	} while ((skb = skb->next));
 out:
 	return segs;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d3ddd840035..ecd60733e5e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1081,6 +1081,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 		}
 		if (t == NULL)
 			t = netdev_priv(dev);
+		memset(&p, 0, sizeof(p));
 		ip6gre_tnl_parm_to_user(&p, &t->parms);
 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 			err = -EFAULT;
@@ -1128,6 +1129,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
 		if (t) {
 			err = 0;
 
+			memset(&p, 0, sizeof(p));
 			ip6gre_tnl_parm_to_user(&p, &t->parms);
 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 				err = -EFAULT;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 71167069b39..0a17ed9eaf3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1890,6 +1890,17 @@ void tcp6_proc_exit(struct net *net)
 }
 #endif
 
+static void tcp_v6_clear_sk(struct sock *sk, int size)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* we do not want to clear pinet6 field, because of RCU lookups */
+	sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
+
+	size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
+	memset(&inet->pinet6 + 1, 0, size);
+}
+
 struct proto tcpv6_prot = {
 	.name			= "TCPv6",
 	.owner			= THIS_MODULE,
@@ -1933,6 +1944,7 @@ struct proto tcpv6_prot = {
 #ifdef CONFIG_MEMCG_KMEM
 	.proto_cgroup		= tcp_proto_cgroup,
 #endif
+	.clear_sk		= tcp_v6_clear_sk,
 };
 
 static const struct inet6_protocol tcpv6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d4defdd4493..42923b14dfa 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1432,6 +1432,17 @@ void udp6_proc_exit(struct net *net) {
 }
 #endif /* CONFIG_PROC_FS */
 
+void udp_v6_clear_sk(struct sock *sk, int size)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* we do not want to clear pinet6 field, because of RCU lookups */
+	sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6));
+
+	size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
+	memset(&inet->pinet6 + 1, 0, size);
+}
+
 /* ------------------------------------------------------------------------ */
 
 struct proto udpv6_prot = {
@@ -1462,7 +1473,7 @@ struct proto udpv6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
-	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+	.clear_sk	   = udp_v6_clear_sk,
 };
 
 static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index d7571046bfc..4691ed50a92 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -31,6 +31,8 @@ extern int	udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 extern int	udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
 extern void	udpv6_destroy_sock(struct sock *sk);
 
+extern void udp_v6_clear_sk(struct sock *sk, int size);
+
 #ifdef CONFIG_PROC_FS
 extern int	udp6_seq_show(struct seq_file *seq, void *v);
 #endif
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 1d08e21d9f6..dfcc4be4689 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -56,7 +56,7 @@ struct proto udplitev6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
-	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+	.clear_sk	   = udp_v6_clear_sk,
 };
 
 static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4ef7bdb6544..23ed03d786c 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -103,8 +103,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	dev_hold(dev);
 
 	xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
-	if (!xdst->u.rt6.rt6i_idev)
+	if (!xdst->u.rt6.rt6i_idev) {
+		dev_put(dev);
 		return -ENODEV;
+	}
 
 	rt6_transfer_peer(&xdst->u.rt6, rt);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index dd5cd49b0e0..8ec1bca7f85 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -742,36 +742,33 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 
 	smp_rmb();
 
-	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
+	/* We could have just memset this but we will lose the
+	 * flexibility of making the priv area sticky
+	 */
 
-		/* We could have just memset this but we will lose the
-		 * flexibility of making the priv area sticky
-		 */
-		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
-		BLOCK_NUM_PKTS(pbd1) = 0;
-		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
-		getnstimeofday(&ts);
-		h1->ts_first_pkt.ts_sec = ts.tv_sec;
-		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
-		pkc1->pkblk_start = (char *)pbd1;
-		pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
-		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
-		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
-		pbd1->version = pkc1->version;
-		pkc1->prev = pkc1->nxt_offset;
-		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
-		prb_thaw_queue(pkc1);
-		_prb_refresh_rx_retire_blk_timer(pkc1);
+	BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+	BLOCK_NUM_PKTS(pbd1) = 0;
+	BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
 
-		smp_wmb();
+	getnstimeofday(&ts);
 
-		return;
-	}
+	h1->ts_first_pkt.ts_sec = ts.tv_sec;
+	h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
 
-	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
-		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
-	dump_stack();
-	BUG();
+	pkc1->pkblk_start = (char *)pbd1;
+	pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+
+	BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+	BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+
+	pbd1->version = pkc1->version;
+	pkc1->prev = pkc1->nxt_offset;
+	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+
+	prb_thaw_queue(pkc1);
+	_prb_refresh_rx_retire_blk_timer(pkc1);
+
+	smp_wmb();
 }
 
 /*
@@ -862,10 +859,6 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
 		prb_close_block(pkc, pbd, po, status);
 		return;
 	}
-
-	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
-	dump_stack();
-	BUG();
 }
 
 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
diff --git a/net/socket.c b/net/socket.c
index b416093997d..6b94633ca61 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2412,7 +2412,7 @@ static const unsigned char nargs[21] = {
 
 SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 {
-	unsigned long a[6];
+	unsigned long a[AUDITSC_ARGS];
 	unsigned long a0, a1;
 	int err;
 	unsigned int len;
@@ -2428,7 +2428,9 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
 	if (copy_from_user(a, args, len))
 		return -EFAULT;
 
-	audit_socketcall(nargs[call] / sizeof(unsigned long), a);
+	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
+	if (err)
+		return err;
 
 	a0 = a[0];
 	a1 = a[1];
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a764e227fdd..7da6b457f66 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -867,8 +867,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	err = -EINVAL;
 	gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
 	if (!gss_auth->mech) {
-		printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n",
-				__func__, flavor);
+		dprintk("RPC:       Pseudoflavor %d not found!\n", flavor);
 		goto err_free;
 	}
 	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 5c4c61d527e..357f613df7f 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -21,16 +21,6 @@
 #include <linux/sunrpc/svcauth.h>
 #include "gss_rpc_xdr.h"
 
-static bool gssx_check_pointer(struct xdr_stream *xdr)
-{
-	__be32 *p;
-
-	p = xdr_reserve_space(xdr, 4);
-	if (unlikely(p == NULL))
-		return -ENOSPC;
-	return *p?true:false;
-}
-
 static int gssx_enc_bool(struct xdr_stream *xdr, int v)
 {
 	__be32 *p;
@@ -264,25 +254,27 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
 	if (unlikely(p == NULL))
 		return -ENOSPC;
 	count = be32_to_cpup(p++);
-	if (count != 0) {
-		/* we recognize only 1 currently: CREDS_VALUE */
-		oa->count = 1;
+	if (!count)
+		return 0;
 
-		oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
-		if (!oa->data)
-			return -ENOMEM;
+	/* we recognize only 1 currently: CREDS_VALUE */
+	oa->count = 1;
 
-		creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
-		if (!creds) {
-			kfree(oa->data);
-			return -ENOMEM;
-		}
+	oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
+	if (!oa->data)
+		return -ENOMEM;
 
-		oa->data[0].option.data = CREDS_VALUE;
-		oa->data[0].option.len = sizeof(CREDS_VALUE);
-		oa->data[0].value.data = (void *)creds;
-		oa->data[0].value.len = 0;
+	creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
+	if (!creds) {
+		kfree(oa->data);
+		return -ENOMEM;
 	}
+
+	oa->data[0].option.data = CREDS_VALUE;
+	oa->data[0].option.len = sizeof(CREDS_VALUE);
+	oa->data[0].value.data = (void *)creds;
+	oa->data[0].value.len = 0;
+
 	for (i = 0; i < count; i++) {
 		gssx_buffer dummy = { 0, NULL };
 		u32 length;
@@ -800,6 +792,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
 				struct xdr_stream *xdr,
 				struct gssx_res_accept_sec_context *res)
 {
+	u32 value_follows;
 	int err;
 
 	/* res->status */
@@ -808,7 +801,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
 		return err;
 
 	/* res->context_handle */
-	if (gssx_check_pointer(xdr)) {
+	err = gssx_dec_bool(xdr, &value_follows);
+	if (err)
+		return err;
+	if (value_follows) {
 		err = gssx_dec_ctx(xdr, res->context_handle);
 		if (err)
 			return err;
@@ -817,7 +813,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
 	}
 
 	/* res->output_token */
-	if (gssx_check_pointer(xdr)) {
+	err = gssx_dec_bool(xdr, &value_follows);
+	if (err)
+		return err;
+	if (value_follows) {
 		err = gssx_dec_buffer(xdr, res->output_token);
 		if (err)
 			return err;
@@ -826,7 +825,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
 	}
 
 	/* res->delegated_cred_handle */
-	if (gssx_check_pointer(xdr)) {
+	err = gssx_dec_bool(xdr, &value_follows);
+	if (err)
+		return err;
+	if (value_follows) {
 		/* we do not support upcall servers sending this data. */
 		return -EINVAL;
 	}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3f7930f938c..5a750b9c364 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -360,7 +360,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
 
 	auth = rpcauth_create(args->authflavor, clnt);
 	if (IS_ERR(auth)) {
-		printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
+		dprintk("RPC:       Couldn't create auth handle (flavor %u)\n",
 				args->authflavor);
 		err = PTR_ERR(auth);
 		goto out_no_auth;
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 25e159c2feb..e5f3da50782 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -584,8 +584,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
 {
 	int bp_index;
 
-	/*
-	 * Prepare broadcast link message for reliable transmission,
+	/* Prepare broadcast link message for reliable transmission,
 	 * if first time trying to send it;
 	 * preparation is skipped for broadcast link protocol messages
 	 * since they are sent in an unreliable manner and don't need it
@@ -611,30 +610,43 @@ static int tipc_bcbearer_send(struct sk_buff *buf,
 	for (bp_index = 0; bp_index < MAX_BEARERS; bp_index++) {
 		struct tipc_bearer *p = bcbearer->bpairs[bp_index].primary;
 		struct tipc_bearer *s = bcbearer->bpairs[bp_index].secondary;
+		struct tipc_bearer *b = p;
+		struct sk_buff *tbuf;
 
 		if (!p)
-			break;	/* no more bearers to try */
+			break; /* No more bearers to try */
+
+		if (tipc_bearer_blocked(p)) {
+			if (!s || tipc_bearer_blocked(s))
+				continue; /* Can't use either bearer */
+			b = s;
+		}
 
-		tipc_nmap_diff(&bcbearer->remains, &p->nodes, &bcbearer->remains_new);
+		tipc_nmap_diff(&bcbearer->remains, &b->nodes,
+			       &bcbearer->remains_new);
 		if (bcbearer->remains_new.count == bcbearer->remains.count)
-			continue;	/* bearer pair doesn't add anything */
+			continue; /* Nothing added by bearer pair */
 
-		if (!tipc_bearer_blocked(p))
-			tipc_bearer_send(p, buf, &p->bcast_addr);
-		else if (s && !tipc_bearer_blocked(s))
-			/* unable to send on primary bearer */
-			tipc_bearer_send(s, buf, &s->bcast_addr);
-		else
-			/* unable to send on either bearer */
-			continue;
+		if (bp_index == 0) {
+			/* Use original buffer for first bearer */
+			tipc_bearer_send(b, buf, &b->bcast_addr);
+		} else {
+			/* Avoid concurrent buffer access */
+			tbuf = pskb_copy(buf, GFP_ATOMIC);
+			if (!tbuf)
+				break;
+			tipc_bearer_send(b, tbuf, &b->bcast_addr);
+			kfree_skb(tbuf); /* Bearer keeps a clone */
+		}
 
+		/* Swap bearers for next packet */
 		if (s) {
 			bcbearer->bpairs[bp_index].primary = s;
 			bcbearer->bpairs[bp_index].secondary = p;
 		}
 
 		if (bcbearer->remains_new.count == 0)
-			break;	/* all targets reached */
+			break; /* All targets reached */
 
 		bcbearer->remains = bcbearer->remains_new;
 	}
diff --git a/net/tipc/link.c b/net/tipc/link.c
index daa6080a2a0..a80feee5197 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2306,8 +2306,11 @@ static int link_recv_changeover_msg(struct tipc_link **l_ptr,
 	struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf);
 	u32 msg_typ = msg_type(tunnel_msg);
 	u32 msg_count = msg_msgcnt(tunnel_msg);
+	u32 bearer_id = msg_bearer_id(tunnel_msg);
 
-	dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)];
+	if (bearer_id >= MAX_BEARERS)
+		goto exit;
+	dest_link = (*l_ptr)->owner->links[bearer_id];
 	if (!dest_link)
 		goto exit;
 	if (dest_link == *l_ptr) {
@@ -2521,14 +2524,16 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
 		struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm);
 		u32 msg_sz = msg_size(imsg);
 		u32 fragm_sz = msg_data_sz(fragm);
-		u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz);
+		u32 exp_fragm_cnt;
 		u32 max =  TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE;
+
 		if (msg_type(imsg) == TIPC_MCAST_MSG)
 			max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE;
-		if (msg_size(imsg) > max) {
+		if (fragm_sz == 0 || msg_size(imsg) > max) {
 			kfree_skb(fbuf);
 			return 0;
 		}
+		exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz);
 		pbuf = tipc_buf_acquire(msg_size(imsg));
 		if (pbuf != NULL) {
 			pbuf->next = *pending;