From dd5bdff83b19d9174126e0398b47117c3a80e22d Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 22 Jul 2008 14:14:22 -0700 Subject: RDMA/cma: Add RDMA_CM_EVENT_ADDR_CHANGE event Add an RDMA_CM_EVENT_ADDR_CHANGE event that can be used by rdma-cm consumers that wish to have their RDMA sessions always use the same links (e.g. HCA/port) as the IP stack does. In the current code, this does not happen when bonding is used and a fail-over has happened but the IB link used by an already existing session is operating fine. Use the netevent notification for sensing that a change has happened in the IP stack, then scan the rdma-cm ID list to see if there is an ID that is "misaligned" with respect to the IP stack, and deliver RDMA_CM_EVENT_ADDR_CHANGE for this ID. The consumer can act on the event or just ignore it. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 92 +++++++++++++++++++++++++++++++++++++++++++ include/rdma/rdma_cm.h | 3 +- 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ae11d5cc74d..79792c92e6f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -168,6 +168,12 @@ struct cma_work { struct rdma_cm_event event; }; +struct cma_ndev_work { + struct work_struct work; + struct rdma_id_private *id; + struct rdma_cm_event event; +}; + union cma_ip_addr { struct in6_addr ip6; struct { @@ -1598,6 +1604,30 @@ out: kfree(work); } +static void cma_ndev_work_handler(struct work_struct *_work) +{ + struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct rdma_id_private *id_priv = work->id; + int destroy = 0; + + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state == CMA_DESTROYING || + id_priv->state == CMA_DEVICE_REMOVAL) + goto out; + + if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + cma_exch(id_priv, CMA_DESTROYING); + destroy = 1; + } + +out: + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + if (destroy) + rdma_destroy_id(&id_priv->id); + kfree(work); +} + static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; @@ -2723,6 +2753,65 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_leave_multicast); +static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr; + struct cma_ndev_work *work; + + dev_addr = &id_priv->id.route.addr.dev_addr; + + if ((dev_addr->src_dev == ndev) && + memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { + printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", + ndev->name, &id_priv->id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, cma_ndev_work_handler); + work->id = id_priv; + work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; + atomic_inc(&id_priv->refcount); + queue_work(cma_wq, &work->work); + } + + return 0; +} + +static int cma_netdev_callback(struct notifier_block *self, unsigned long event, + void *ctx) +{ + struct net_device *ndev = (struct net_device *)ctx; + struct cma_device *cma_dev; + struct rdma_id_private *id_priv; + int ret = NOTIFY_DONE; + + if (dev_net(ndev) != &init_net) + return NOTIFY_DONE; + + if (event != NETDEV_BONDING_FAILOVER) + return NOTIFY_DONE; + + if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + return NOTIFY_DONE; + + mutex_lock(&lock); +
list_for_each_entry(cma_dev, &dev_list, list) + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + ret = cma_netdev_change(ndev, id_priv); + if (ret) + goto out; + } + +out: + mutex_unlock(&lock); + return ret; +} + +static struct notifier_block cma_nb = { + .notifier_call = cma_netdev_callback +}; + static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; @@ -2831,6 +2920,7 @@ static int cma_init(void) ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); + register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); if (ret) @@ -2838,6 +2928,7 @@ static int cma_init(void) return 0; err: + unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); @@ -2847,6 +2938,7 @@ err: static void cma_cleanup(void) { ib_unregister_client(&cma_client); + unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); destroy_workqueue(cma_wq); diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 22bb2e7bab1..001d606517f 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -57,7 +57,8 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_DISCONNECTED, RDMA_CM_EVENT_DEVICE_REMOVAL, RDMA_CM_EVENT_MULTICAST_JOIN, - RDMA_CM_EVENT_MULTICAST_ERROR + RDMA_CM_EVENT_MULTICAST_ERROR, + RDMA_CM_EVENT_ADDR_CHANGE }; enum rdma_port_space { -- cgit v1.2.3-70-g09d2 From 38ca83a588662f0af684ba2567dd910a564268ab Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 22 Jul 2008 14:14:23 -0700 Subject: RDMA/cma: Add RDMA_CM_EVENT_TIMEWAIT_EXIT event Consumers that want to re-use their QPs in new connections need to know when the QP has exited the timewait state. Report the timewait event through the rdma_cm. 
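A minimal consumer-side sketch (the my_conn structure and its completion are hypothetical, not part of this patch): an rdma_cm event handler can defer QP reuse until the new event arrives:

#include <linux/completion.h>
#include <rdma/rdma_cm.h>

struct my_conn {
	struct completion timewait_done;	/* hypothetical consumer state */
};

static int my_cm_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct my_conn *conn = id->context;

	switch (event->event) {
	case RDMA_CM_EVENT_DISCONNECTED:
		/* the QP may still be in timewait; do not recycle it yet */
		break;
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		/* the QP has left timewait and may be reused safely */
		complete(&conn->timewait_done);
		break;
	default:
		break;
	}
	return 0;
}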
Signed-off-by: Amir Vadai Acked-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 7 ++++++- include/rdma/rdma_cm.h | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 79792c92e6f..e980ff3335d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -920,7 +920,10 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; - if (cma_disable_callback(id_priv, CMA_CONNECT)) + if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, CMA_CONNECT)) || + (ib_event->event == IB_CM_TIMEWAIT_EXIT && + cma_disable_callback(id_priv, CMA_DISCONNECT))) return 0; memset(&event, 0, sizeof event); @@ -956,6 +959,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: + event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; + break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 001d606517f..df7faf09d66 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -58,7 +58,8 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_DEVICE_REMOVAL, RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, - RDMA_CM_EVENT_ADDR_CHANGE + RDMA_CM_EVENT_ADDR_CHANGE, + RDMA_CM_EVENT_TIMEWAIT_EXIT }; enum rdma_port_space { -- cgit v1.2.3-70-g09d2 From 2f5de1512884da8c74bec2c76e8f114b972ab4be Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 22 Jul 2008 14:16:21 -0700 Subject: IB/iser: Add support for RDMA_CM_EVENT_ADDR_CHANGE event Enhance iser to act upon notifications of network stack changes that leave its RDMA connection unaligned with the link used by the stack for the IPs that established the connection. When RDMA_CM_EVENT_ADDR_CHANGE arrives, just disconnect the connection, assuming that the user space iscsid daemon will reconnect, and the new connection will be aligned with the IP stack. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 3a917c1f796..63462ecca14 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -483,6 +483,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_ADDR_CHANGE: iser_disconnected_handler(cma_id); break; default: -- cgit v1.2.3-70-g09d2 From 5b673b71c8ca0fbdb99dc1b1434cfb554212d6ff Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 22 Jul 2008 14:18:07 -0700 Subject: IB/ehca: Filter PATH_MIG events if QP was never armed Certain firmware versions sometimes cause spurious PATH_MIG events to occur during QP creation. Filter these events by making sure PATH_MIG events are only handed down when they actually make sense (i.e. when the QP has been armed at least once).
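For reference, a QP only becomes armed after the consumer explicitly rearms migration; a sketch (not part of this patch; qp is assumed to be a connected QP with an alternate path loaded):

struct ib_qp_attr attr = {
	.path_mig_state = IB_MIG_REARM,
};
int ret;

/* move the migration state machine to REARM; only after such a
 * transition should a PATH_MIG event be delivered for this QP */
ret = ib_modify_qp(qp, &attr, IB_QP_PATH_MIG_STATE);
if (ret)
	printk(KERN_ERR "arming path migration failed: %d\n", ret);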
Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_classes.h | 1 + drivers/infiniband/hw/ehca/ehca_irq.c | 4 ++++ drivers/infiniband/hw/ehca/ehca_qp.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 1e9e99a1393..0b0618edd64 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -194,6 +194,7 @@ struct ehca_qp { u32 packet_count; atomic_t nr_events; /* events seen */ wait_queue_head_t wait_completion; + int mig_armed; }; #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index 0792d930c48..99642a6e17c 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -178,6 +178,10 @@ static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp, { struct ib_event event; + /* PATH_MIG without the QP ever having been armed is false alarm */ + if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed) + return; + event.device = &shca->ib_device; event.event = event_type; diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index 3f59587338e..ea13efddf17 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -1460,6 +1460,8 @@ static int internal_modify_qp(struct ib_qp *ibqp, goto modify_qp_exit2; } mqpcb->path_migration_state = attr->path_mig_state + 1; + if (attr->path_mig_state == IB_MIG_REARM) + my_qp->mig_armed = 1; update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1); } -- cgit v1.2.3-70-g09d2 From 593e4d4a05c8263a6dbd5452c21d47c5bdadd40c Mon Sep 17 00:00:00 2001 From: Joachim Fenkes Date: Tue, 22 Jul 2008 14:18:08 -0700 Subject: IB/ehca: Use default value for Local CA ACK Delay if FW returns 0 Some firmware versions report a Local CA ACK Delay of 0. In that case, return a more sensible default value of 12 (-> 16 msec) instead. Signed-off-by: Joachim Fenkes Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_hca.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index bc3b37d2070..46288220cfb 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -114,7 +114,9 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) } props->max_pkeys = 16; - props->local_ca_ack_delay = min_t(u8, rblock->local_ca_ack_delay, 255); + /* Some FW versions say 0 here; insert sensible value in that case */ + props->local_ca_ack_delay = rblock->local_ca_ack_delay ? + min_t(u8, rblock->local_ca_ack_delay, 255) : 12; props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); -- cgit v1.2.3-70-g09d2 From 1a867c33bb65f2921351a9bdd98548bb96f0ff8c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 22 Jul 2008 14:18:10 -0700 Subject: IB/ehca: Release mutex in error path of alloc_small_queue_page() The pd->lock mutex is released on a successful return, so it should be released on an error return as well. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @@ expression l; @@ mutex_lock(l); ... when != mutex_unlock(l) when any when strict ( if (...) { ... 
when != mutex_unlock(l) + mutex_unlock(l); return ...; } | mutex_unlock(l); ) // Signed-off-by: Julia Lawall Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ipz_pt_fn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c index 661f8db6270..c3a32846543 100644 --- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c +++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c @@ -163,6 +163,7 @@ static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) out: ehca_err(pd->ib_pd.device, "failed to allocate small queue page"); + mutex_unlock(&pd->lock); return 0; } -- cgit v1.2.3-70-g09d2 From 64b784b583061ebfe1d484dd1fdc5a26c6d4293f Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Tue, 22 Jul 2008 14:18:33 -0700 Subject: IB/sa_query: Check if sm_ah is NULL in ib_sa_remove_one() If update_sm_ah() fails, it leaves the port's sm_ah as NULL. Then if the device or module is removed, ib_sa_remove_one() will dereference a NULL pointer when it calls kref_put(). Fix this by testing if sm_ah is NULL before dropping the reference. Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/core/sa_query.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 1341de793e5..7863a50d56f 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1064,7 +1064,8 @@ static void ib_sa_remove_one(struct ib_device *device) for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { ib_unregister_mad_agent(sa_dev->port[i].agent); - kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); + if (sa_dev->port[i].sm_ah) + kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); } kfree(sa_dev); -- cgit v1.2.3-70-g09d2 From 01b3fc8b15432f7931e40fe099839e1559fb0e09 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 22 Jul 2008 14:18:34 -0700 Subject: IPoIB: Include err code in trace message for ib_sa_path_rec_get() failures Print the return code of ib_sa_path_rec_get() if it fails, to help debug errors. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8be9ea0436e..f51201b17bf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -548,7 +548,7 @@ static int path_rec_start(struct net_device *dev, path_rec_completion, path, &path->query); if (path->query_id < 0) { - ipoib_warn(priv, "ib_sa_path_rec_get failed\n"); + ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; return path->query_id; } -- cgit v1.2.3-70-g09d2 From 1ca8d15619f725e223c19137350b0336b9196193 Mon Sep 17 00:00:00 2001 From: Dotan Barak Date: Tue, 22 Jul 2008 14:18:34 -0700 Subject: RDMA/iwcm: Remove IB_ACCESS_LOCAL_WRITE from remote QP attributes Remove IB_ACCESS_LOCAL_WRITE from qp.qp_access_flags because this attribute is only used to set remote permissions.
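To make the distinction concrete, a sketch (generic verbs usage, not from this patch): local write permission belongs with memory registration, while qp_access_flags only grants rights to the remote peer:

static int setup_perms(struct ib_pd *pd, struct ib_qp *qp, struct ib_mr **mr)
{
	struct ib_qp_attr attr = {
		.qp_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
	};

	/* local permission: a property of the memory region */
	*mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(*mr))
		return PTR_ERR(*mr);

	/* remote permissions: what the peer may do through this QP */
	return ib_modify_qp(qp, &attr, IB_QP_ACCESS_FLAGS);
}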
Signed-off-by: Dotan Barak Signed-off-by: Roland Dreier --- drivers/infiniband/core/iwcm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 81c9195b512..8f9509e1ebf 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -942,8 +942,7 @@ static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; - qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE| + qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; -- cgit v1.2.3-70-g09d2 From 51f5f0ee22b98980f7816d42647467cd5f4b3b45 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 22 Jul 2008 14:19:37 -0700 Subject: mlx4_core: Add module parameter to enable QoS support Add a module parameter "enable_qos" to mlx4_core. If this parameter is set, enable support for QoS in the INIT_HCA command. By default, the parameter is set to 0 (disabled). Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/net/mlx4/fw.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 2b5006b9be6..0851ebdddfd 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -46,6 +46,10 @@ enum { extern void __buggy_use_of_MLX4_GET(void); extern void __buggy_use_of_MLX4_PUT(void); +static int enable_qos; +module_param(enable_qos, bool, 0444); +MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); + #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ @@ -737,6 +741,10 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 3); + /* Enable QoS support if module parameter set */ + if (enable_qos) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); -- cgit v1.2.3-70-g09d2 From 47b374752aed1c029f995473c7c463ee3ae5fbaa Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 22 Jul 2008 14:19:39 -0700 Subject: IB/mlx4: Rename struct mlx4_lso_seg to mlx4_wqe_lso_seg Make the struct name consistent with other WQE segment struct types defined in <linux/mlx4/qp.h>.
Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/qp.c | 2 +- include/linux/mlx4/qp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 89eb6cbe592..bda0859a5ac 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1395,7 +1395,7 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) dseg->addr = cpu_to_be64(sg->addr); } -static int build_lso_seg(struct mlx4_lso_seg *wqe, struct ib_send_wr *wr, +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 7f128b266fa..f02e9ed36cf 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -219,7 +219,7 @@ struct mlx4_wqe_datagram_seg { __be32 reservd[2]; }; -struct mlx4_lso_seg { +struct mlx4_wqe_lso_seg { __be32 mss_hdr_size; __be32 header[0]; }; -- cgit v1.2.3-70-g09d2 From 899698dad72340b562478b8b770317f2f0fe0c09 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 22 Jul 2008 14:19:39 -0700 Subject: mlx4_core: Add missing FW status return code Add ICM_ERROR firmware status code. In mapping to errnos, -ENFILE seems closest. This is in preparation for providing more detailed log info using mlx4_err() in the low-level driver when a non-zero status is returned. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/net/mlx4/cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 70dff94a8bc..04d5bc69a6f 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -67,6 +67,8 @@ enum { CMD_STAT_BAD_INDEX = 0x0a, /* FW image corrupted: */ CMD_STAT_BAD_NVMEM = 0x0b, + /* Error in ICM mapping (e.g. not enough auxiliary ICM pages to execute command): */ + CMD_STAT_ICM_ERROR = 0x0c, /* Attempt to modify a QP/EE which is not in the presumed state: */ CMD_STAT_BAD_QP_STATE = 0x10, /* Bad segment parameters (Address/Size): */ @@ -119,6 +121,7 @@ static int mlx4_status_to_errno(u8 status) [CMD_STAT_BAD_RES_STATE] = -EBADF, [CMD_STAT_BAD_INDEX] = -EBADF, [CMD_STAT_BAD_NVMEM] = -EFAULT, + [CMD_STAT_ICM_ERROR] = -ENFILE, [CMD_STAT_BAD_QP_STATE] = -EINVAL, [CMD_STAT_BAD_SEG_PARAM] = -EFAULT, [CMD_STAT_REG_BOUND] = -EBUSY, -- cgit v1.2.3-70-g09d2 From e4044cfc493338cd09870bd45dc646336bb66e9f Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 22 Jul 2008 14:19:40 -0700 Subject: mlx4_core: Keep free count for MTT buddy allocator MTT entries are allocated with a buddy allocator, which just keeps bitmaps for each level of the buddy table. However, all free space starts out at the highest order, and small allocations start scanning from the lowest order. When the lowest order tables have no free space, this can lead to scanning potentially millions of bits before finding a free entry at a higher order. We can avoid this by just keeping a count of how many free entries each order has, and skipping the bitmap scan when an order is completely empty. This provides a nice performance boost for a negligible increase in memory usage.
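As a toy model (standalone sketch, not the driver code): with max_order = 20 and all free space at the top order, an order-0 request previously scanned on the order of 2^20 bits across the lower bitmaps before reaching the top; a per-order free count lets empty levels be skipped outright:

/* return the lowest order >= 'order' whose bitmap is worth scanning,
 * or -1 if nothing is free at any order */
static int first_nonempty_order(const unsigned int *num_free,
				int order, int max_order)
{
	int o;

	for (o = order; o <= max_order; ++o)
		if (num_free[o])	/* bitmap scan at this order can succeed */
			return o;
	return -1;
}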
Signed-off-by: Roland Dreier --- drivers/net/mlx4/mlx4.h | 1 + drivers/net/mlx4/mr.c | 26 ++++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index a4023c2dd05..78038499cff 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -118,6 +118,7 @@ struct mlx4_bitmap { struct mlx4_buddy { unsigned long **bits; + unsigned int *num_free; int max_order; spinlock_t lock; }; diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 03a9abcce52..b3ea93b9868 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -79,23 +79,26 @@ static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order) spin_lock(&buddy->lock); - for (o = order; o <= buddy->max_order; ++o) { - m = 1 << (buddy->max_order - o); - seg = find_first_bit(buddy->bits[o], m); - if (seg < m) - goto found; - } + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } spin_unlock(&buddy->lock); return -1; found: clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; while (o > order) { --o; seg <<= 1; set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; } spin_unlock(&buddy->lock); @@ -113,11 +116,13 @@ static void mlx4_buddy_free(struct mlx4_buddy *buddy, u32 seg, int order) while (test_bit(seg ^ 1, buddy->bits[order])) { clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; seg >>= 1; ++order; } set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; spin_unlock(&buddy->lock); } @@ -131,7 +136,9 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), GFP_KERNEL); - if (!buddy->bits) + buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *), + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) goto err_out; for (i = 0; i <= buddy->max_order; ++i) { @@ -143,6 +150,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) } set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; return 0; @@ -150,9 +158,10 @@ err_out_free: for (i = 0; i <= buddy->max_order; ++i) kfree(buddy->bits[i]); +err_out: kfree(buddy->bits); + kfree(buddy->num_free); -err_out: return -ENOMEM; } @@ -164,6 +173,7 @@ static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) kfree(buddy->bits[i]); kfree(buddy->bits); + kfree(buddy->num_free); } static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) -- cgit v1.2.3-70-g09d2 From e8bb4beb2b1f90d499134f2849727ed04c3bedc4 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 22 Jul 2008 14:20:05 -0700 Subject: IB/mthca: Keep free count for MTT buddy allocator MTT entries are allocated with a buddy allocator, which just keeps bitmaps for each level of the buddy table. However, all free space starts out at the highest order, and small allocations start scanning from the lowest order. When the lowest order tables have no free space, this can lead to scanning potentially millions of bits before finding a free entry at a higher order. We can avoid this by just keeping a count of how many free entries each order has, and skipping the bitmap scan when an order is completely empty. This provides a nice performance boost for a negligible increase in memory usage. 
Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_dev.h | 1 + drivers/infiniband/hw/mthca/mthca_mr.c | 26 ++++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index ee4d073c889..252590116df 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -202,6 +202,7 @@ struct mthca_pd_table { struct mthca_buddy { unsigned long **bits; + int *num_free; int max_order; spinlock_t lock; }; diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 8489b1e81c0..882e6b73591 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -89,23 +89,26 @@ static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order) spin_lock(&buddy->lock); - for (o = order; o <= buddy->max_order; ++o) { - m = 1 << (buddy->max_order - o); - seg = find_first_bit(buddy->bits[o], m); - if (seg < m) - goto found; - } + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = find_first_bit(buddy->bits[o], m); + if (seg < m) + goto found; + } spin_unlock(&buddy->lock); return -1; found: clear_bit(seg, buddy->bits[o]); + --buddy->num_free[o]; while (o > order) { --o; seg <<= 1; set_bit(seg ^ 1, buddy->bits[o]); + ++buddy->num_free[o]; } spin_unlock(&buddy->lock); @@ -123,11 +126,13 @@ static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order) while (test_bit(seg ^ 1, buddy->bits[order])) { clear_bit(seg ^ 1, buddy->bits[order]); + --buddy->num_free[order]; seg >>= 1; ++order; } set_bit(seg, buddy->bits[order]); + ++buddy->num_free[order]; spin_unlock(&buddy->lock); } @@ -141,7 +146,9 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *), GFP_KERNEL); - if (!buddy->bits) + buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *), + GFP_KERNEL); + if (!buddy->bits || !buddy->num_free) goto err_out; for (i = 0; i <= buddy->max_order; ++i) { @@ -154,6 +161,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order) } set_bit(0, buddy->bits[buddy->max_order]); + buddy->num_free[buddy->max_order] = 1; return 0; @@ -161,9 +169,10 @@ err_out_free: for (i = 0; i <= buddy->max_order; ++i) kfree(buddy->bits[i]); +err_out: kfree(buddy->bits); + kfree(buddy->num_free); -err_out: return -ENOMEM; } @@ -175,6 +184,7 @@ static void mthca_buddy_cleanup(struct mthca_buddy *buddy) kfree(buddy->bits[i]); kfree(buddy->bits); + kfree(buddy->num_free); } static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order, -- cgit v1.2.3-70-g09d2 From 95d04f0735b4fc837bff9aedcc3f3efb20ddc3d1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 23 Jul 2008 08:12:26 -0700 Subject: IB/mlx4: Add support for memory management extensions and local DMA L_Key Add support for the following operations to mlx4 when device firmware supports them: - Send with invalidate and local invalidate send queue work requests; - Allocate/free fast register MRs; - Allocate/free fast register MR page lists; - Fast register MR send queue work requests; - Local DMA L_Key. 
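For context, the consumer-side flow these verbs enable looks roughly like the sketch below (pd, qp and the dma_addrs[] array are assumed to exist; error-path cleanup is trimmed; a real consumer must also wait for the IB_WC_FAST_REG_MR completion before advertising the rkey):

static int fast_reg_buffer(struct ib_pd *pd, struct ib_qp *qp,
			   u64 *dma_addrs, int npages)
{
	struct ib_mr *mr;
	struct ib_fast_reg_page_list *frpl;
	struct ib_send_wr fr_wr, *bad_wr;
	int i;

	mr = ib_alloc_fast_reg_mr(pd, npages);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	frpl = ib_alloc_fast_reg_page_list(pd->device, npages);
	if (IS_ERR(frpl))
		return PTR_ERR(frpl);

	for (i = 0; i < npages; ++i)
		frpl->page_list[i] = dma_addrs[i];	/* DMA addresses to map */

	memset(&fr_wr, 0, sizeof fr_wr);
	fr_wr.opcode = IB_WR_FAST_REG_MR;
	fr_wr.send_flags = IB_SEND_SIGNALED;
	fr_wr.wr.fast_reg.iova_start = dma_addrs[0];
	fr_wr.wr.fast_reg.page_list = frpl;
	fr_wr.wr.fast_reg.page_list_len = npages;
	fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	fr_wr.wr.fast_reg.length = npages * PAGE_SIZE;
	fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
					 IB_ACCESS_REMOTE_READ |
					 IB_ACCESS_REMOTE_WRITE;
	fr_wr.wr.fast_reg.rkey = mr->rkey;

	return ib_post_send(qp, &fr_wr, &bad_wr);
}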
Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 12 ++++++ drivers/infiniband/hw/mlx4/main.c | 11 ++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 15 ++++++++ drivers/infiniband/hw/mlx4/mr.c | 70 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/qp.c | 72 +++++++++++++++++++++++++++++++++--- drivers/net/mlx4/fw.c | 10 ++--- drivers/net/mlx4/fw.h | 2 +- drivers/net/mlx4/main.c | 2 + drivers/net/mlx4/mr.c | 23 +++++++++--- include/linux/mlx4/device.h | 10 +++++ include/linux/mlx4/qp.h | 16 ++++++-- 11 files changed, 221 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 299f20832ab..0b191a4842c 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -637,6 +637,7 @@ repoll: case MLX4_OPCODE_SEND_IMM: wc->wc_flags |= IB_WC_WITH_IMM; case MLX4_OPCODE_SEND: + case MLX4_OPCODE_SEND_INVAL: wc->opcode = IB_WC_SEND; break; case MLX4_OPCODE_RDMA_READ: @@ -657,6 +658,12 @@ repoll: case MLX4_OPCODE_LSO: wc->opcode = IB_WC_LSO; break; + case MLX4_OPCODE_FMR: + wc->opcode = IB_WC_FAST_REG_MR; + break; + case MLX4_OPCODE_LOCAL_INVAL: + wc->opcode = IB_WC_LOCAL_INV; + break; } } else { wc->byte_len = be32_to_cpu(cqe->byte_cnt); @@ -667,6 +674,11 @@ repoll: wc->wc_flags = IB_WC_WITH_IMM; wc->ex.imm_data = cqe->immed_rss_invalid; break; + case MLX4_RECV_OPCODE_SEND_INVAL: + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid); + break; case MLX4_RECV_OPCODE_SEND: wc->opcode = IB_WC_RECV; wc->wc_flags = 0; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bcf50648fa1..38d6907ab52 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -104,6 +104,12 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz) props->device_cap_flags |= IB_DEVICE_UD_TSO; + if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) + props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && + (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; @@ -127,6 +133,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; + props->max_fast_reg_page_list_len = PAGE_SIZE / sizeof (u64); props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? 
IB_ATOMIC_HCA : IB_ATOMIC_NONE; @@ -565,6 +572,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; + ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->ib_dev.phys_port_cnt = dev->caps.num_ports; ibdev->ib_dev.num_comp_vectors = 1; ibdev->ib_dev.dma_device = &dev->pdev->dev; @@ -627,6 +635,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; + ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; + ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; + ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index c4cf5b69eef..d26a91317d4 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -83,6 +83,11 @@ struct mlx4_ib_mr { struct ib_umem *umem; }; +struct mlx4_ib_fast_reg_page_list { + struct ib_fast_reg_page_list ibfrpl; + dma_addr_t map; +}; + struct mlx4_ib_fmr { struct ib_fmr ibfmr; struct mlx4_fmr mfmr; @@ -199,6 +204,11 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx4_ib_mr, ibmr); } +static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +{ + return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl); +} + static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) { return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); @@ -239,6 +249,11 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len); +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len); +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 68e92485fc7..db2086faa4e 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -183,6 +183,76 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr) return 0; } +struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, + int max_page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_mr *mr; + int err; + + mr = kmalloc(sizeof *mr, GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0, + max_page_list_len, 0, &mr->mmr); + if (err) + goto err_free; + + err = mlx4_mr_enable(dev->dev, &mr->mmr); + if (err) + goto err_mr; + + return &mr->ibmr; + +err_mr: + mlx4_mr_free(dev->dev, &mr->mmr); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, + int page_list_len) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct mlx4_ib_fast_reg_page_list *mfrpl; + int size = page_list_len 
* sizeof (u64); + + if (size > PAGE_SIZE) + return ERR_PTR(-EINVAL); + + mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); + if (!mfrpl) + return ERR_PTR(-ENOMEM); + + mfrpl->ibfrpl.page_list = dma_alloc_coherent(&dev->dev->pdev->dev, + size, &mfrpl->map, + GFP_KERNEL); + if (!mfrpl->ibfrpl.page_list) + goto err_free; + + WARN_ON(mfrpl->map & 0x3f); + + return &mfrpl->ibfrpl; + +err_free: + kfree(mfrpl); + return ERR_PTR(-ENOMEM); +} + +void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + struct mlx4_ib_dev *dev = to_mdev(page_list->device); + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); + int size = page_list->max_page_list_len * sizeof (u64); + + dma_free_coherent(&dev->dev->pdev->dev, size, page_list->page_list, + mfrpl->map); + kfree(mfrpl); +} + struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, struct ib_fmr_attr *fmr_attr) { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index bda0859a5ac..02a99bc4442 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -78,6 +78,9 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_RDMA_READ] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ), [IB_WR_ATOMIC_CMP_AND_SWP] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), [IB_WR_ATOMIC_FETCH_AND_ADD] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = __constant_cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = __constant_cpu_to_be32(MLX4_OPCODE_FMR), }; static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) @@ -976,6 +979,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); + /* Set "fast registration enabled" for all kernel QPs */ + if (!qp->ibqp.uobject) + context->params1 |= cpu_to_be32(1 << 11); + if (attr_mask & IB_QP_RNR_RETRY) { context->params1 |= cpu_to_be32(attr->rnr_retry << 13); optpar |= MLX4_QP_OPTPAR_RNR_RETRY; @@ -1322,6 +1329,38 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq return cur + nreq >= wq->max_post; } +static __be32 convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC) : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) | + (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ) : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) | + cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ); +} + +static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) +{ + struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); + + fseg->flags = convert_access(wr->wr.fast_reg.access_flags); + fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey); + fseg->buf_list = cpu_to_be64(mfrpl->map); + fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); + fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length); + fseg->offset = 0; /* XXX -- is this just for ZBVA? 
*/ + fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift); + fseg->reserved[0] = 0; + fseg->reserved[1] = 0; +} + +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + iseg->flags = 0; + iseg->mem_key = cpu_to_be32(rkey); + iseg->guest_id = 0; + iseg->pa = 0; +} + static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, u64 remote_addr, u32 rkey) { @@ -1423,6 +1462,21 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, return 0; } +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { @@ -1469,11 +1523,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) | qp->sq_signal_bits; - if (wr->opcode == IB_WR_SEND_WITH_IMM || - wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) - ctrl->imm = wr->ex.imm_data; - else - ctrl->imm = 0; + ctrl->imm = send_ieth(wr); wqe += sizeof *ctrl; size = sizeof *ctrl / 16; @@ -1505,6 +1555,18 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof (struct mlx4_wqe_raddr_seg) / 16; break; + case IB_WR_LOCAL_INV: + set_local_inv_seg(wqe, wr->ex.invalidate_rkey); + wqe += sizeof (struct mlx4_wqe_local_inval_seg); + size += sizeof (struct mlx4_wqe_local_inval_seg) / 16; + break; + + case IB_WR_FAST_REG_MR: + set_fmr_seg(wqe, wr); + wqe += sizeof (struct mlx4_wqe_fmr_seg); + size += sizeof (struct mlx4_wqe_fmr_seg) / 16; + break; + default: /* No extra segments required for sends */ break; diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 0851ebdddfd..57278224ba1 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -202,7 +202,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET 0x8e #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET 0x90 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 -#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x97 +#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 @@ -377,12 +377,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) } } - if (dev_cap->bmme_flags & 1) - mlx4_dbg(dev, "Base MM extensions: yes " - "(flags %d, rsvd L_Key %08x)\n", - dev_cap->bmme_flags, dev_cap->reserved_lkey); - else - mlx4_dbg(dev, "Base MM extensions: no\n"); + mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n", + dev_cap->bmme_flags, dev_cap->reserved_lkey); /* * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index a0e046c149b..fbf0e22be12 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -98,7 +98,7 @@ struct mlx4_dev_cap { int cmpt_entry_sz; int mtt_entry_sz; int resize_srq; - u8 bmme_flags; + u32 bmme_flags; u32 reserved_lkey; u64 max_icm_sz; int max_gso_sz; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index d3736013fe9..8e1d24cda1b 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -158,6 +158,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); 
dev->caps.flags = dev_cap->flags; + dev->caps.bmme_flags = dev_cap->bmme_flags; + dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index b3ea93b9868..a3c04c5f12c 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -47,7 +47,7 @@ struct mlx4_mpt_entry { __be32 flags; __be32 qpn; __be32 key; - __be32 pd; + __be32 pd_flags; __be64 start; __be64 length; __be32 lkey; @@ -61,11 +61,15 @@ struct mlx4_mpt_entry { } __attribute__((packed)); #define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28) +#define MLX4_MPT_FLAG_FREE (0x3UL << 28) #define MLX4_MPT_FLAG_MIO (1 << 17) #define MLX4_MPT_FLAG_BIND_ENABLE (1 << 15) #define MLX4_MPT_FLAG_PHYSICAL (1 << 9) #define MLX4_MPT_FLAG_REGION (1 << 8) +#define MLX4_MPT_PD_FLAG_FAST_REG (1 << 26) +#define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) + #define MLX4_MTT_FLAG_PRESENT 1 #define MLX4_MPT_STATUS_SW 0xF0 @@ -324,21 +328,30 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) memset(mpt_entry, 0, sizeof *mpt_entry); - mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS | - MLX4_MPT_FLAG_MIO | + mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO | MLX4_MPT_FLAG_REGION | mr->access); mpt_entry->key = cpu_to_be32(key_to_hw_index(mr->key)); - mpt_entry->pd = cpu_to_be32(mr->pd); + mpt_entry->pd_flags = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV); mpt_entry->start = cpu_to_be64(mr->iova); mpt_entry->length = cpu_to_be64(mr->size); mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + if (mr->mtt.order < 0) { mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); mpt_entry->mtt_seg = 0; - } else + } else { mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt)); + } + + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } err = mlx4_SW2HW_MPT(dev, mailbox, key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 81b3dd5206e..655ea0d1ee1 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -68,6 +68,14 @@ enum { MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21 }; +enum { + MLX4_BMME_FLAG_LOCAL_INV = 1 << 6, + MLX4_BMME_FLAG_REMOTE_INV = 1 << 7, + MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, + MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, + MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, +}; + enum mlx4_event { MLX4_EVENT_TYPE_COMP = 0x00, MLX4_EVENT_TYPE_PATH_MIG = 0x01, @@ -184,6 +192,8 @@ struct mlx4_caps { u32 max_msg_sz; u32 page_size_cap; u32 flags; + u32 bmme_flags; + u32 reserved_lkey; u16 stat_rate_support; u8 port_width_cap[MLX4_MAX_PORTS + 1]; int max_gso_sz; diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index f02e9ed36cf..e27082cd650 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -233,6 +233,14 @@ struct mlx4_wqe_bind_seg { __be64 length; }; +enum { + MLX4_WQE_FMR_PERM_LOCAL_READ = 1 << 27, + MLX4_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, + MLX4_WQE_FMR_PERM_REMOTE_READ = 1 << 29, + MLX4_WQE_FMR_PERM_REMOTE_WRITE = 1 << 30, + MLX4_WQE_FMR_PERM_ATOMIC = 1 << 31 +}; + struct mlx4_wqe_fmr_seg { __be32 flags; __be32 mem_key; @@ -255,11 +263,11 @@ struct mlx4_wqe_fmr_ext_seg { }; struct mlx4_wqe_local_inval_seg { - u8 flags; - u8 reserved1[3]; + __be32 
flags; + u32 reserved1; __be32 mem_key; - u8 reserved2[3]; - u8 guest_id; + u32 reserved2[2]; + __be32 guest_id; __be64 pa; }; -- cgit v1.2.3-70-g09d2 From 76442640829163d0cdb67c2bf0cb4b81a0fe537b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 23 Jul 2008 08:12:47 -0700 Subject: mlx4_core: Improve error message when not enough UAR pages are available If an mlx4 device with default FW (which gives a UAR BAR size of 8 MB) is used in a system with 64 KB pages, then there are only 8192/64==128 UAR pages available. However, the first 128 UAR pages are reserved for use with event queue doorbells, so no UAR pages are available to do anything else with, which means that the driver cannot work. The current driver fails with a fairly cryptic "Failed to allocate driver access region, aborting" message in this situation. Fix the driver to detect the problem earlier and print out a clearer description of the problem and a suggestion of how to fix it (use a new firmware image). Signed-off-by: Roland Dreier --- drivers/net/mlx4/pd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 3a93c5f0f7a..aa616892d09 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -91,6 +91,13 @@ EXPORT_SYMBOL_GPL(mlx4_uar_free); int mlx4_init_uar_table(struct mlx4_dev *dev) { + if (dev->caps.num_uars <= 128) { + mlx4_err(dev, "Only %d UAR pages (need more than 128)\n", + dev->caps.num_uars); + mlx4_err(dev, "Increase firmware log2_uar_bar_megabytes?\n"); + return -ENODEV; + } + return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap, dev->caps.num_uars, dev->caps.num_uars - 1, max(128, dev->caps.reserved_uars)); -- cgit v1.2.3-70-g09d2 From 1fa6d8181b7bb0361512170c30e436dcc95591ee Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 23 Jul 2008 14:20:12 -0700 Subject: MAINTAINERS: Remove Glenn Streiff from NetEffect entry Glenn is no longer at NetEffect. Signed-off-by: Roland Dreier --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 11944b44c2f..2a73da0cd07 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2915,8 +2915,6 @@ P: Faisal Latif M: flatif@neteffect.com P: Chien Tung M: ctung@neteffect.com -P: Glenn Streiff -M: gstreiff@neteffect.com L: general@lists.openfabrics.org W: http://www.neteffect.com S: Supported -- cgit v1.2.3-70-g09d2