summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/ulp/ipoib/ipoib_ib.c
diff options
context:
space:
mode:
authorMichael S. Tsirkin <mst@mellanox.co.il>2007-02-05 22:12:23 +0200
committerRoland Dreier <rolandd@cisco.com>2007-02-10 08:00:48 -0800
commit839fcaba355abaffb7b44f0f4504093acb0b11cf (patch)
tree9e23f61ab0569ff144e6d9d4cb6a0887783f923c /drivers/infiniband/ulp/ipoib/ipoib_ib.c
parent9a6b090c0d1cd5c90f21db772dbe2fbcf14366de (diff)
IPoIB: Connected mode experimental support
The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system. Some notes on code: 1. SRQ is used for scalability to large cluster sizes 2. Only RC connections are used (UC does not support SRQ now) 3. Retry count is set to 0 since spec draft warns against retries 4. Each connection is used for data transfers in only 1 direction, so each connection is either active(TX) or passive (RX). 2 sides that want to communicate create 2 connections. 5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks 6. To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active. Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/ulp/ipoib/ipoib_ib.c')
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c29
1 files changed, 20 insertions, 9 deletions
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 59d9594ed6d..f2aa923ddbe 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
"Enable data path debug tracing if > 0");
#endif
-#define IPOIB_OP_RECV (1ul << 31)
-
static DEFINE_MUTEX(pkey_mutex);
struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->tx_lock, flags);
++priv->tx_tail;
- if (netif_queue_stopped(dev) &&
- test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
- priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+ if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
+ priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
+ clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
netif_wake_queue(dev);
+ }
spin_unlock_irqrestore(&priv->tx_lock, flags);
if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
{
- if (wc->wr_id & IPOIB_OP_RECV)
+ if (wc->wr_id & IPOIB_CM_OP_SRQ)
+ ipoib_cm_handle_rx_wc(dev, wc);
+ else if (wc->wr_id & IPOIB_OP_RECV)
ipoib_ib_handle_rx_wc(dev, wc);
else
ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
struct ipoib_tx_buf *tx_req;
u64 addr;
- if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) {
+ if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
- skb->len, dev->mtu + INFINIBAND_ALEN);
+ skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
++priv->stats.tx_dropped;
++priv->stats.tx_errors;
- dev_kfree_skb_any(skb);
+ ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
return;
}
@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev);
+ set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
}
}
}
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
return -1;
}
+ ret = ipoib_cm_dev_open(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+ ipoib_ib_dev_stop(dev);
+ return -1;
+ }
+
clear_bit(IPOIB_STOP_REAPER, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
+ ipoib_cm_dev_stop(dev);
+
/*
* Move our QP to the error state and then reinitialize in
* when all work requests have completed or have been flushed.