From d414371795d54fa916938f948105d08928abfbb9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 4 Mar 2010 13:16:52 +0000 Subject: IPoIB: Allow disabling/enabling TSO on the fly through ethtool Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index d10b4ec68d2..40e858492f9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -49,6 +49,25 @@ static u32 ipoib_get_rx_csum(struct net_device *dev) !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } +static int ipoib_set_tso(struct net_device *dev, u32 data) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (data) { + if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + (dev->features & NETIF_F_SG) && + (priv->hca_caps & IB_DEVICE_UD_TSO)) { + dev->features |= NETIF_F_TSO; + } else { + ipoib_warn(priv, "can't set TSO on\n"); + return -EOPNOTSUPP; + } + } else + dev->features &= ~NETIF_F_TSO; + + return 0; +} + static int ipoib_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { @@ -131,6 +150,7 @@ static void ipoib_get_ethtool_stats(struct net_device *dev, static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_rx_csum = ipoib_get_rx_csum, + .set_tso = ipoib_set_tso, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, .get_flags = ethtool_op_get_flags, -- cgit v1.2.3-70-g09d2 From 2110f9bf37511df06220bb7e977f417baecf2950 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 5 May 2010 17:30:10 +0300 Subject: IB/iser: Add asynchronous event handler Add handler to handle events such as port up and down. This is useful when testing high-availability schemes such as multi-pathing. 
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.h | 1 + drivers/infiniband/ulp/iser/iser_verbs.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 036934cdcb9..53da74b45c7 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -232,6 +232,7 @@ struct iser_device { struct ib_cq *tx_cq; struct ib_mr *mr; struct tasklet_struct cq_tasklet; + struct ib_event_handler event_handler; struct list_head ig_list; /* entry in ig devices list */ int refcount; }; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index b89d76b39a1..b9d6aa102aa 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -54,6 +54,13 @@ static void iser_qp_event_callback(struct ib_event *cause, void *context) iser_err("got qp event %d\n",cause->event); } +static void iser_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + iser_err("async event %d on device %s port %d\n", event->event, + event->device->name, event->element.port_num); +} + /** * iser_create_device_ib_res - creates Protection Domain (PD), Completion * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with @@ -96,8 +103,15 @@ static int iser_create_device_ib_res(struct iser_device *device) if (IS_ERR(device->mr)) goto dma_mr_err; + INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, + iser_event_handler); + if (ib_register_event_handler(&device->event_handler)) + goto handler_err; + return 0; +handler_err: + ib_dereg_mr(device->mr); dma_mr_err: tasklet_kill(&device->cq_tasklet); cq_arm_err: @@ -120,7 +134,7 @@ static void iser_free_device_ib_res(struct iser_device *device) BUG_ON(device->mr == NULL); tasklet_kill(&device->cq_tasklet); 
- + (void)ib_unregister_event_handler(&device->event_handler); (void)ib_dereg_mr(device->mr); (void)ib_destroy_cq(device->tx_cq); (void)ib_destroy_cq(device->rx_cq); -- cgit v1.2.3-70-g09d2 From d265b9808272c9f25e1c36d3fb5ddb466efd90e9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 5 May 2010 17:30:34 +0300 Subject: IB/iser: Remove buggy back-pointer setting The iscsi connection object life cycle includes binding and unbinding (conn_stop) to/from the iscsi transport connection object. Since iscsi connection objects are recycled, at the time the transport connection (e.g iser's IB connection) is released, it is not valid to touch the iscsi connection tied to the transport back-pointer since it may already point to a different transport connection. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index b9d6aa102aa..ed7c9013541 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -346,8 +346,6 @@ static void iser_conn_release(struct iser_conn *ib_conn) /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) iser_device_try_release(device); - if (ib_conn->iser_conn) - ib_conn->iser_conn->ib_conn = NULL; iscsi_destroy_endpoint(ib_conn->ep); } -- cgit v1.2.3-70-g09d2 From 39ff05dbbbdb082bbabf06206c56b3cd4ef73904 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 5 May 2010 17:31:44 +0300 Subject: IB/iser: Enhance disconnection logic for multi-pathing The iser connection teardown flow isn't over until the underlying Connection Manager (e.g the IB CM) delivers a disconnected or timeout event through the RDMA-CM. 
When the remote (target) side isn't reachable, e.g. when some HW (a port/HCA/switch) isn't functioning or has been taken down administratively, the CM timeout flow is used and the event may be generated only after a relatively long time -- on the order of tens of seconds. The current iser code exposes this possibly long delay to higher layers, specifically to the iscsid daemon and the iscsi kernel stack. As a result, the iscsi stack doesn't respond well: this low-level CM delay is added to the fail-over time under HA schemes such as the one provided by DM multipath through the multipathd(8) service. This patch enhances the reference counting scheme on iser's IB connections so that the disconnect flow initiated by iscsid from user space (ep_disconnect) doesn't wait for the CM to deliver the disconnect/timeout event. (The connection teardown isn't done from iser's view point until the event is delivered.) The iser ib (rdma) connection object is destroyed when its reference count reaches zero. When this happens on the RDMA-CM callback context, extra care is taken so that the RDMA-CM does the actual destroying of the associated ID, since doing it in the callback is prohibited. The reference count of an iser ib connection normally reaches three, where the ref/deref relations are 1. ib conn allocation (ref in iser_conn_init, deref in iser_conn_terminate) 2. iscsi/ib conn binding (ref in conn_bind, deref in conn_stop/conn_destroy) 3. cma id (ref in iser_connect, deref in the CM error/disconnect callbacks) With this patch, multipath fail-over time is about 30 seconds, while without this patch, multipath fail-over time is about 130 seconds. 
Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 9 ++-- drivers/infiniband/ulp/iser/iscsi_iser.h | 3 +- drivers/infiniband/ulp/iser/iser_verbs.c | 72 ++++++++++++++++++-------------- 3 files changed, 46 insertions(+), 38 deletions(-) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 93399dff0c6..7b2fc98e2f2 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -325,7 +325,7 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) */ if (ib_conn) { ib_conn->iser_conn = NULL; - iser_conn_put(ib_conn); + iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ } } @@ -357,11 +357,12 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges * connection pointers */ - iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn); + iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n", + conn, conn->dd_data, ib_conn); iser_conn = conn->dd_data; ib_conn->iser_conn = iser_conn; iser_conn->ib_conn = ib_conn; - iser_conn_get(ib_conn); + iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ return 0; } @@ -382,7 +383,7 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) * There is no unbind event so the stop callback * must release the ref from the bind. 
*/ - iser_conn_put(ib_conn); + iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ } iser_conn->ib_conn = NULL; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 53da74b45c7..f1df01567bb 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -247,7 +247,6 @@ struct iser_conn { struct rdma_cm_id *cma_id; /* CMA ID */ struct ib_qp *qp; /* QP */ struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ - int disc_evt_flag; /* disconn event delivered */ wait_queue_head_t wait; /* waitq for conn/disconn */ int post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ @@ -321,7 +320,7 @@ void iser_conn_init(struct iser_conn *ib_conn); void iser_conn_get(struct iser_conn *ib_conn); -void iser_conn_put(struct iser_conn *ib_conn); +int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); void iser_conn_terminate(struct iser_conn *ib_conn); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ed7c9013541..78fdecacea3 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -238,7 +238,7 @@ alloc_err: * releases the FMR pool, QP and CMA ID objects, returns 0 on success, * -1 on failure */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn) +static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) { BUG_ON(ib_conn == NULL); @@ -253,7 +253,8 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn) if (ib_conn->qp != NULL) rdma_destroy_qp(ib_conn->cma_id); - if (ib_conn->cma_id != NULL) + /* if cma handler context, the caller acts s.t the cma destroy the id */ + if (ib_conn->cma_id != NULL && can_destroy_id) rdma_destroy_id(ib_conn->cma_id); ib_conn->fmr_pool = NULL; @@ -331,7 +332,7 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, /** * Frees all conn objects and 
deallocs conn descriptor */ -static void iser_conn_release(struct iser_conn *ib_conn) +static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) { struct iser_device *device = ib_conn->device; @@ -341,7 +342,7 @@ static void iser_conn_release(struct iser_conn *ib_conn) list_del(&ib_conn->conn_list); mutex_unlock(&ig.connlist_mutex); iser_free_rx_descriptors(ib_conn); - iser_free_ib_conn_res(ib_conn); + iser_free_ib_conn_res(ib_conn, can_destroy_id); ib_conn->device = NULL; /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) @@ -354,10 +355,13 @@ void iser_conn_get(struct iser_conn *ib_conn) atomic_inc(&ib_conn->refcount); } -void iser_conn_put(struct iser_conn *ib_conn) +int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) { - if (atomic_dec_and_test(&ib_conn->refcount)) - iser_conn_release(ib_conn); + if (atomic_dec_and_test(&ib_conn->refcount)) { + iser_conn_release(ib_conn, can_destroy_id); + return 1; + } + return 0; } /** @@ -381,19 +385,20 @@ void iser_conn_terminate(struct iser_conn *ib_conn) wait_event_interruptible(ib_conn->wait, ib_conn->state == ISER_CONN_DOWN); - iser_conn_put(ib_conn); + iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ } -static void iser_connect_error(struct rdma_cm_id *cma_id) +static int iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; ib_conn = (struct iser_conn *)cma_id->context; ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); + return iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ } -static void iser_addr_handler(struct rdma_cm_id *cma_id) +static int iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; @@ -402,8 +407,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); - iser_connect_error(cma_id); - return; + return 
iser_connect_error(cma_id); } ib_conn = (struct iser_conn *)cma_id->context; @@ -412,11 +416,13 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); - iser_connect_error(cma_id); + return iser_connect_error(cma_id); } + + return 0; } -static void iser_route_handler(struct rdma_cm_id *cma_id) +static int iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; @@ -437,9 +443,9 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) goto failure; } - return; + return 0; failure: - iser_connect_error(cma_id); + return iser_connect_error(cma_id); } static void iser_connected_handler(struct rdma_cm_id *cma_id) @@ -451,12 +457,12 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) wake_up_interruptible(&ib_conn->wait); } -static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +static int iser_disconnected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; + int ret; ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->disc_evt_flag = 1; /* getting here when the state is UP means that the conn is being * * terminated asynchronously from the iSCSI layer's perspective. 
*/ @@ -471,20 +477,24 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); } + + ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ + return ret; } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret = 0; - iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id); + iser_err("event %d status %d conn %p id %p\n", + event->event, event->status, cma_id->context, cma_id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - iser_addr_handler(cma_id); + ret = iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - iser_route_handler(cma_id); + ret = iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); @@ -494,13 +504,12 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - iser_err("event: %d, error: %d\n", event->event, event->status); - iser_connect_error(cma_id); + ret = iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - iser_disconnected_handler(cma_id); + ret = iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); @@ -515,7 +524,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); - atomic_set(&ib_conn->refcount, 1); + atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); } @@ -543,6 +552,7 @@ int iser_connect(struct iser_conn *ib_conn, ib_conn->state = ISER_CONN_PENDING; + iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, 
RDMA_PS_TCP); @@ -580,7 +590,7 @@ id_failure: addr_failure: ib_conn->state = ISER_CONN_DOWN; connect_failure: - iser_conn_release(ib_conn); + iser_conn_release(ib_conn, 1); return err; } @@ -749,12 +759,10 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); - /* complete the termination process if disconnect event was delivered * - * note there are no more non completed posts to the QP */ - if (ib_conn->disc_evt_flag) { - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); - } + /* no more non completed posts to the QP, complete the + * termination process w.o worrying on disconnect event */ + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); } } -- cgit v1.2.3-70-g09d2 From 9fda1ac5fa09c49e9148f85be14f55e2bb856c0f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 May 2010 16:22:21 +0300 Subject: IB/iser: Fix error flow in iser_create_ib_conn_res() We shouldn't free things here because we free them later. The call tree looks like this: iser_connect() ==> initiating the connection establishment and later iser_cma_handler() => iser_route_handler() => iser_create_ib_conn_res() if we fail here, eventually iser_conn_release() is called, resulting in a double free. 
Signed-off-by: Dan Carpenter Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/ulp') diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 78fdecacea3..9876865732f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -163,10 +163,8 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) device = ib_conn->device; ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!ib_conn->login_buf) { - goto alloc_err; - ret = -ENOMEM; - } + if (!ib_conn->login_buf) + goto out_err; ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device, (void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE, @@ -175,10 +173,9 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), GFP_KERNEL); - if (!ib_conn->page_vec) { - ret = -ENOMEM; - goto alloc_err; - } + if (!ib_conn->page_vec) + goto out_err; + ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); params.page_shift = SHIFT_4K; @@ -198,7 +195,8 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, &params); if (IS_ERR(ib_conn->fmr_pool)) { ret = PTR_ERR(ib_conn->fmr_pool); - goto fmr_pool_err; + ib_conn->fmr_pool = NULL; + goto out_err; } memset(&init_attr, 0, sizeof init_attr); @@ -216,7 +214,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); if (ret) - goto qp_err; + goto out_err; ib_conn->qp = ib_conn->cma_id->qp; iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n", @@ -224,12 +222,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ib_conn->fmr_pool, ib_conn->cma_id->qp); return 
ret; -qp_err: - (void)ib_destroy_fmr_pool(ib_conn->fmr_pool); -fmr_pool_err: - kfree(ib_conn->page_vec); - kfree(ib_conn->login_buf); -alloc_err: +out_err: iser_err("unable to alloc mem or create resource, err %d\n", ret); return ret; } -- cgit v1.2.3-70-g09d2