diff options
Diffstat (limited to 'net/rds/ib_cm.c')
-rw-r--r-- | net/rds/ib_cm.c | 184 |
1 files changed, 126 insertions, 58 deletions
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f68832798db..ee369d201a6 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -38,6 +38,36 @@ #include "rds.h" #include "ib.h" +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + +static char *rds_ib_event_str(enum ib_event_type type) +{ + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); +}; + /* * Set the selected protocol version */ @@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even { const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_device *rds_ibdev; struct ib_qp_attr qp_attr; int err; @@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } /* * Init rings and fill recv. this needs to wait until protocol negotiation @@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + rds_ib_recv_refill(conn, 1); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even if (err) printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); - /* update ib_device with this local ipaddr & conn */ - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); if (err) - printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); - rds_ib_add_conn(rds_ibdev, conn); + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, struct rdma_conn_param *conn_param, struct rds_ib_connect_private *dp, - u32 protocol_version) + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) { + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? */ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { - struct rds_ib_connection *ic = conn->c_transport_data; - memset(dp, 0, sizeof(*dp)); dp->dp_saddr = conn->c_laddr; dp->dp_daddr = conn->c_faddr; @@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { - rdsdebug("event %u data %p\n", event->event, data); + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_connection *conn = data; struct rds_ib_connection *ic = conn->c_transport_data; - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - rdsdebug("Fatal QP Event %u " + rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, &conn->c_faddr); + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; } @@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; int ret; - /* rds_ib_add_one creates a rds_ib_device object per IB device, - * and allocates a protection domain, memory range and FMR pool - * for each. If that fails for any reason, it will not register - * the rds_ibdev at all. + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. */ - rds_ibdev = ib_get_client_data(dev, &rds_ib_client); - if (rds_ibdev == NULL) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", - dev->name); + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) return -EOPNOTSUPP; - } + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); @@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - if (ic->i_sends == NULL) { + ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; } memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - if (ic->i_recvs == NULL) { + ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_cq, ic->i_recv_cq); out: + rds_ib_dev_put(rds_ibdev); return ret; } @@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; u32 version; - int err, destroy = 1; + int err = 1, destroy = 1; /* Check whether the remote protocol version matches ours. */ version = rds_ib_protocol_compatible(event); @@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* Wait and see - our connect may still be succeeding */ rds_ib_stats_inc(s_ib_connect_raced); } - mutex_unlock(&conn->c_cm_lock); goto out; } @@ -475,24 +521,23 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_ib_setup_qp(conn); if (err) { rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { + if (err) rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; out: - rdma_reject(cm_id, NULL, 0); + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); return destroy; } @@ -516,8 +561,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); @@ -601,9 +646,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id, err); } + /* + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. + */ wait_event(rds_ib_ring_empty_wait, - rds_ib_ring_empty(&ic->i_send_ring) && - rds_ib_ring_empty(&ic->i_recv_ring)); + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); if (ic->i_send_hdrs) ib_dma_free_coherent(dev, @@ -654,9 +709,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) BUG_ON(ic->rds_ibdev); /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; } /* Clear the ACK state */ @@ -690,12 +748,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_ib_connection *ic; unsigned long flags; + int ret; /* XXX too lazy? */ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); - if (ic == NULL) + if (!ic) return -ENOMEM; + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + INIT_LIST_HEAD(&ic->ib_node); tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, (unsigned long) ic); @@ -703,6 +768,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); #endif + atomic_set(&ic->i_signaled_sends, 0); /* * rds_ib_conn_shutdown() waits for these to be emptied so they @@ -744,6 +810,8 @@ void rds_ib_conn_free(void *arg) list_del(&ic->ib_node); spin_unlock_irq(lock_ptr); + rds_ib_recv_free_caches(ic); + kfree(ic); } |