From 99d4f22e91d26e0f8b113bf7fde65a335d36ad6b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sat, 10 Feb 2007 08:00:47 -0800 Subject: IB/mthca: Use correct structure size in call to memset() When clearing the ib_ah_attr parameter in to_ib_ah_attr(), use sizeof *ib_ah_attr instead of sizeof *path. Pointed out by Jack Morgenstein . Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 5f5214c0337..224c93dd29e 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -399,7 +399,7 @@ static int to_ib_qp_access_flags(int mthca_flags) static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr, struct mthca_qp_path *path) { - memset(ib_ah_attr, 0, sizeof *path); + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3; if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports) -- cgit v1.2.3-70-g09d2 From 9a6b090c0d1cd5c90f21db772dbe2fbcf14366de Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 6 Feb 2007 18:07:25 +0200 Subject: IB/core: Use ARRAY_SIZE macro for mandatory_table Use ARRAY_SIZE() macro already defined in kernel.h instead of open coding equivalent code. Signed-off-by: Ahmed S. Darwish Signed-off-by: Roland Dreier --- drivers/infiniband/core/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 63d2a39fb82..7fabb425b03 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -93,7 +94,7 @@ static int ib_device_check_mandatory(struct ib_device *device) }; int i; - for (i = 0; i < sizeof mandatory_table / sizeof mandatory_table[0]; ++i) { + for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); -- cgit v1.2.3-70-g09d2 From 839fcaba355abaffb7b44f0f4504093acb0b11cf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 5 Feb 2007 22:12:23 +0200 Subject: IPoIB: Connected mode experimental support The following patch adds experimental support for IPoIB connected mode, as defined by the draft from the IETF ipoib working group. The idea is to increase performance by increasing the MTU from the maximum of 2K (theoretically 4K) supported by IPoIB on top of UD. With this code, I'm able to get 800MByte/sec or more with netperf without options on a Mellanox 4x back-to-back DDR system. Some notes on code: 1. SRQ is used for scalability to large cluster sizes 2. Only RC connections are used (UC does not support SRQ now) 3. Retry count is set to 0 since spec draft warns against retries 4. Each connection is used for data transfers in only 1 direction, so each connection is either active(TX) or passive (RX). 2 sides that want to communicate create 2 connections. 5. Each active (TX) connection has a separate CQ for send completions - this keeps the code simple without CQ resize and other tricks 6. 
To detect stale passive side connections (where the remote side is down), we keep an LRU list of passive connections (updated once per second per connection) and destroy a connection after it has been unused for several seconds. The LRU rule makes it possible to avoid scanning connections that have recently been active. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/Kconfig | 16 +- drivers/infiniband/ulp/ipoib/Makefile | 1 + drivers/infiniband/ulp/ipoib/ipoib.h | 215 ++++ drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1237 ++++++++++++++++++++++++ drivers/infiniband/ulp/ipoib/ipoib_ib.c | 29 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 63 +- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 4 +- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 40 +- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2 + 9 files changed, 1575 insertions(+), 32 deletions(-) create mode 100644 drivers/infiniband/ulp/ipoib/ipoib_cm.c (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig index c75322d820d..af78ccc4ce7 100644 --- a/drivers/infiniband/ulp/ipoib/Kconfig +++ b/drivers/infiniband/ulp/ipoib/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_IPOIB tristate "IP-over-InfiniBand" - depends on INFINIBAND && NETDEVICES && INET + depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n) ---help--- Support for the IP-over-InfiniBand protocol (IPoIB). This transports IP packets over InfiniBand so you can use your IB @@ -8,6 +8,20 @@ config INFINIBAND_IPOIB See Documentation/infiniband/ipoib.txt for more information +config INFINIBAND_IPOIB_CM + bool "IP-over-InfiniBand Connected Mode support" + depends on INFINIBAND_IPOIB && EXPERIMENTAL + default n + ---help--- + This option enables experimental support for IPoIB connected mode. + After enabling this option, you need to switch to connected mode through + /sys/class/net/ibXXX/mode to actually create connections, and then increase + the interface MTU with e.g. ifconfig ib0 mtu 65520. + + WARNING: Enabling connected mode will trigger some + packet drops for multicast and UD mode traffic from this interface, + unless you limit mtu for these destinations to 2044. 
+ config INFINIBAND_IPOIB_DEBUG bool "IP-over-InfiniBand debugging" if EMBEDDED depends on INFINIBAND_IPOIB diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile index 8935e74ae3f..98ee38e8c2c 100644 --- a/drivers/infiniband/ulp/ipoib/Makefile +++ b/drivers/infiniband/ulp/ipoib/Makefile @@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \ ipoib_multicast.o \ ipoib_verbs.o \ ipoib_vlan.o +ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 07deee8f81c..2594db2030b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -62,6 +62,10 @@ enum { IPOIB_ENCAP_LEN = 4, + IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */ + IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, + IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, + IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, IPOIB_RX_RING_SIZE = 128, IPOIB_TX_RING_SIZE = 64, IPOIB_MAX_QUEUE_SIZE = 8192, @@ -81,6 +85,8 @@ enum { IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_MCAST_STARTED = 8, + IPOIB_FLAG_NETIF_STOPPED = 9, + IPOIB_FLAG_ADMIN_CM = 10, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -90,6 +96,13 @@ enum { IPOIB_MCAST_FLAG_ATTACHED = 3, }; +#define IPOIB_OP_RECV (1ul << 31) +#ifdef CONFIG_INFINIBAND_IPOIB_CM +#define IPOIB_CM_OP_SRQ (1ul << 30) +#else +#define IPOIB_CM_OP_SRQ (0) +#endif + /* structs */ struct ipoib_header { @@ -113,6 +126,59 @@ struct ipoib_tx_buf { u64 mapping; }; +struct ib_cm_id; + +struct ipoib_cm_data { + __be32 qpn; /* High byte MUST be ignored on receive */ + __be32 mtu; +}; + +struct ipoib_cm_rx { + struct ib_cm_id *id; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + unsigned long jiffies; +}; + +struct ipoib_cm_tx { + struct ib_cm_id *id; + struct ib_cq *cq; + struct ib_qp *qp; + struct list_head list; + struct net_device *dev; + struct ipoib_neigh *neigh; + struct ipoib_path *path; + struct ipoib_tx_buf *tx_ring; + unsigned tx_head; + unsigned tx_tail; + unsigned long flags; + u32 mtu; + struct ib_wc ibwc[IPOIB_NUM_WC]; +}; + +struct ipoib_cm_rx_buf { + struct sk_buff *skb; + u64 mapping[IPOIB_CM_RX_SG]; +}; + +struct ipoib_cm_dev_priv { + struct ib_srq *srq; + struct ipoib_cm_rx_buf *srq_ring; + struct ib_cm_id *id; + struct list_head passive_ids; + struct work_struct start_task; + struct work_struct reap_task; + struct work_struct skb_task; + struct delayed_work stale_task; + struct sk_buff_head skb_queue; + struct list_head start_list; + struct list_head reap_list; + struct ib_wc ibwc[IPOIB_NUM_WC]; + struct ib_sge rx_sge[IPOIB_CM_RX_SG]; + struct ib_recv_wr rx_wr; +}; + /* * Device private locking: tx_lock protects members used in TX fast * path (and we use LLTX so upper layers don't do extra locking). 
@@ -179,6 +245,10 @@ struct ipoib_dev_priv { struct list_head child_intfs; struct list_head list; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_dev_priv cm; +#endif + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct list_head fs_list; struct dentry *mcg_dentry; @@ -212,6 +282,9 @@ struct ipoib_path { struct ipoib_neigh { struct ipoib_ah *ah; +#ifdef CONFIG_INFINIBAND_IPOIB_CM + struct ipoib_cm_tx *cm; +#endif union ib_gid dgid; struct sk_buff_head queue; @@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); +#ifdef CONFIG_INFINIBAND_IPOIB_CM + +#define IPOIB_FLAGS_RC 0x80 +#define IPOIB_FLAGS_UC 0x40 + +/* We don't support UC connections at the moment */ +#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC)) + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(dev->dev_addr) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return IPOIB_CM_SUPPORTED(n->ha) && + test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags); +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return neigh->cm; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ + neigh->cm = tx; +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx); +int ipoib_cm_dev_open(struct net_device *dev); +void ipoib_cm_dev_stop(struct net_device *dev); +int ipoib_cm_dev_init(struct net_device *dev); +int ipoib_cm_add_mode_attr(struct net_device *dev); +void ipoib_cm_dev_cleanup(struct net_device *dev); +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh); +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); +void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu); +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +#else + +struct ipoib_cm_tx; + +static inline int ipoib_cm_admin_enabled(struct net_device *dev) +{ + return 0; +} +static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) + +{ + return 0; +} + +static inline int ipoib_cm_up(struct ipoib_neigh *neigh) + +{ + return 0; +} + +static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx) +{ +} + +static inline +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int ipoib_cm_dev_open(struct net_device *dev) +{ + return 0; +} + +static inline +void ipoib_cm_dev_stop(struct net_device *dev) +{ + return; +} + +static inline +int ipoib_cm_dev_init(struct net_device *dev) +{ + return -ENOSYS; +} + +static inline +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + return; +} + +static inline +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + return NULL; +} + +static inline +void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + return; +} + +static inline +int 
ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return 0; +} + +static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu) +{ + dev_kfree_skb_any(skb); +} + +static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} + +#endif + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG void ipoib_create_debug_files(struct net_device *dev); void ipoib_delete_debug_files(struct net_device *dev); @@ -392,4 +605,6 @@ extern int ipoib_debug_level; #define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) +#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c new file mode 100644 index 00000000000..2d483874a58 --- /dev/null +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -0,0 +1,1237 @@ +/* + * Copyright (c) 2006 Mellanox Technologies. All rights reserved + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id$ + */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA +static int data_debug_level; + +module_param_named(cm_data_debug_level, data_debug_level, int, 0644); +MODULE_PARM_DESC(cm_data_debug_level, + "Enable data path debug tracing for connected mode if > 0"); +#endif + +#include "ipoib.h" + +#define IPOIB_CM_IETF_ID 0x1000000000000000ULL + +#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ) +#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ) +#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) +#define IPOIB_CM_RX_UPDATE_MASK (0x3) + +struct ipoib_cm_id { + struct ib_cm_id *id; + int flags; + u32 remote_qpn; + u32 remote_mtu; +}; + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event); + +static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, + u64 mapping[IPOIB_CM_RX_SG]) +{ + int i; + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i) + ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); +} + +static int ipoib_cm_post_receive(struct net_device *dev, int id) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_recv_wr *bad_wr; + int i, ret; + + priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; + + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + if (unlikely(ret)) { + ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[id].skb); + priv->cm.srq_ring[id].skb = NULL; + } + + return ret; +} + +static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, + u64 mapping[IPOIB_CM_RX_SG]) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sk_buff *skb; + int i; + + skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); + if (unlikely(!skb)) + return -ENOMEM; + + /* + * IPoIB adds a 4 byte header. So we need 12 more bytes to align the + * IP header to a multiple of 16. 
+ */ + skb_reserve(skb, 12); + + mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, + DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) { + dev_kfree_skb_any(skb); + return -EIO; + } + + for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) { + struct page *page = alloc_page(GFP_ATOMIC); + + if (!page) + goto partial_error; + skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE); + + mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page, + 0, PAGE_SIZE, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1]))) + goto partial_error; + } + + priv->cm.srq_ring[id].skb = skb; + return 0; + +partial_error: + + ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); + + for (; i >= 0; --i) + ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + + kfree_skb(skb); + return -ENOMEM; +} + +static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, + struct ipoib_cm_rx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = { + .send_cq = priv->cq, /* does not matter, we never send anything */ + .recv_cq = priv->cq, + .srq = priv->cm.srq, + .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */ + .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */ + .sq_sig_type = IB_SIGNAL_ALL_WR, + .qp_type = IB_QPT_RC, + .qp_context = p, + }; + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_modify_rx_qp(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + + qp_attr.qp_state = IB_QPS_INIT; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret); + return ret; + } + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret); + return ret; + } + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + qp_attr.rq_psn = psn; + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, + struct ib_qp *qp, struct ib_cm_req_event_param *req, + unsigned psn) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_rep_param rep = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + rep.private_data = &data; + rep.private_data_len = sizeof data; + rep.flow_control = 0; + rep.rnr_retry_count = req->rnr_retry_count; + rep.target_ack_delay = 20; /* FIXME */ + rep.srq = 1; + rep.qp_num = qp->qp_num; + rep.starting_psn = psn; + return ib_send_cm_rep(cm_id, &rep); +} + +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct net_device *dev = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long flags; + unsigned psn; + int ret; + + ipoib_dbg(priv, "REQ arrived\n"); + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + p->dev = dev; + p->id = cm_id; + p->qp = ipoib_cm_create_rx_qp(dev, p); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + goto err_qp; + } + + psn = random32() & 
0xffffff; + ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn); + if (ret) + goto err_modify; + + ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn); + if (ret) { + ipoib_warn(priv, "failed to send REP: %d\n", ret); + goto err_rep; + } + + cm_id->context = p; + p->jiffies = jiffies; + spin_lock_irqsave(&priv->lock, flags); + list_add(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + return 0; + +err_rep: +err_modify: + ib_destroy_qp(p->qp); +err_qp: + kfree(p); + return ret; +} + +static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_rx *p; + struct ipoib_dev_priv *priv; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_REQ_RECEIVED: + return ipoib_cm_req_handler(cm_id, event); + case IB_CM_DREQ_RECEIVED: + p = cm_id->context; + ib_send_cm_drep(cm_id, NULL, 0); + /* Fall through */ + case IB_CM_REJ_RECEIVED: + p = cm_id->context; + priv = netdev_priv(p->dev); + spin_lock_irqsave(&priv->lock, flags); + if (list_empty(&p->list)) + ret = 0; /* Connection is going away already. */ + else { + list_del_init(&p->list); + ret = -ECONNRESET; + } + spin_unlock_irqrestore(&priv->lock, flags); + if (ret) { + ib_destroy_qp(p->qp); + kfree(p); + return ret; + } + return 0; + default: + return 0; + } +} +/* Adjust length of skb with fragments to match received data */ +static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, + unsigned int length) +{ + int i, num_frags; + unsigned int size; + + /* put header into skb */ + size = min(length, hdr_space); + skb->tail += size; + skb->len += size; + length -= size; + + num_frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < num_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (length == 0) { + /* don't need this page */ + __free_page(frag->page); + --skb_shinfo(skb)->nr_frags; + } else { + size = min(length, (unsigned) PAGE_SIZE); + + frag->size = size; + skb->data_len += size; + skb->truesize += size; + skb->len += size; + length -= size; + } + } +} + +void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ; + struct sk_buff *skb; + struct ipoib_cm_rx *p; + unsigned long flags; + u64 mapping[IPOIB_CM_RX_SG]; + + ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n", + wr_id, wc->opcode, wc->status); + + if (unlikely(wr_id >= ipoib_recvq_size)) { + ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", + wr_id, ipoib_recvq_size); + return; + } + + skb = priv->cm.srq_ring[wr_id].skb; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + ipoib_dbg(priv, "cm recv error " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + ++priv->stats.rx_dropped; + goto repost; + } + + if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) { + p = wc->qp->qp_context; + if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { + spin_lock_irqsave(&priv->lock, flags); + p->jiffies = jiffies; + /* Move this entry to list head, but do + * not re-add it if it has been removed. 
*/ + if (!list_empty(&p->list)) + list_move(&p->list, &priv->cm.passive_ids); + spin_unlock_irqrestore(&priv->lock, flags); + queue_delayed_work(ipoib_workqueue, + &priv->cm.stale_task, IPOIB_CM_RX_DELAY); + } + } + + if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) { + /* + * If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id); + ++priv->stats.rx_dropped; + goto repost; + } + + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping); + memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping); + + ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", + wc->byte_len, wc->slid); + + skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len); + + skb->protocol = ((struct ipoib_header *) skb->data)->proto; + skb->mac.raw = skb->data; + skb_pull(skb, IPOIB_ENCAP_LEN); + + dev->last_rx = jiffies; + ++priv->stats.rx_packets; + priv->stats.rx_bytes += skb->len; + + skb->dev = dev; + /* XXX get correct PACKET_ type here */ + skb->pkt_type = PACKET_HOST; + netif_rx_ni(skb); + +repost: + if (unlikely(ipoib_cm_post_receive(dev, wr_id))) + ipoib_warn(priv, "ipoib_cm_post_receive failed " + "for buf %d\n", wr_id); +} + +static inline int post_send(struct ipoib_dev_priv *priv, + struct ipoib_cm_tx *tx, + unsigned int wr_id, + u64 addr, int len) +{ + struct ib_send_wr *bad_wr; + + priv->tx_sge.addr = addr; + priv->tx_sge.length = len; + + priv->tx_wr.wr_id = wr_id; + + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); +} + +void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_tx_buf *tx_req; + u64 addr; + + if (unlikely(skb->len > tx->mtu)) { + ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", + skb->len, tx->mtu); + ++priv->stats.tx_dropped; + ++priv->stats.tx_errors; + ipoib_cm_skb_too_long(dev, skb, tx->mtu - INFINIBAND_ALEN); + return; + } + + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", + tx->tx_head, skb->len, tx->qp->qp_num); + + /* + * We put the skb into the tx_ring _before_ we call post_send() + * because it's entirely possible that the completion handler will + * run before we execute anything after the post_send(). That + * means we have to make sure everything is properly recorded and + * our state is consistent before we call post_send(). 
+ */ + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req->skb = skb; + addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(priv->ca, addr))) { + ++priv->stats.tx_errors; + dev_kfree_skb_any(skb); + return; + } + + tx_req->mapping = addr; + + if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len))) { + ipoib_warn(priv, "post_send failed\n"); + ++priv->stats.tx_errors; + ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); + dev_kfree_skb_any(skb); + } else { + dev->trans_start = jiffies; + ++tx->tx_head; + + if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + } + } +} + +static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx, + struct ib_wc *wc) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned int wr_id = wc->wr_id; + struct ipoib_tx_buf *tx_req; + unsigned long flags; + + ipoib_dbg_data(priv, "cm send completion: id %d, op %d, status: %d\n", + wr_id, wc->opcode, wc->status); + + if (unlikely(wr_id >= ipoib_sendq_size)) { + ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", + wr_id, ipoib_sendq_size); + return; + } + + tx_req = &tx->tx_ring[wr_id]; + + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); + + /* FIXME: is this right? Shouldn't we only increment on success? */ + ++priv->stats.tx_packets; + priv->stats.tx_bytes += tx_req->skb->len; + + dev_kfree_skb_any(tx_req->skb); + + spin_lock_irqsave(&priv->tx_lock, flags); + ++tx->tx_tail; + if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) && + tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) { + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + netif_wake_queue(dev); + } + + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_neigh *neigh; + + ipoib_dbg(priv, "failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + wc->status, wr_id, wc->vendor_err); + + spin_lock(&priv->lock); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + /* queue would be re-started anyway when TX is destroyed, + * but it makes sense to do it ASAP here. 
*/ + if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) + netif_wake_queue(dev); + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); + + spin_unlock(&priv->lock); + } + + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr) +{ + struct ipoib_cm_tx *tx = tx_ptr; + int n, i; + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + do { + n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc); + for (i = 0; i < n; ++i) + ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i); + } while (n == IPOIB_NUM_WC); +} + +int ipoib_cm_dev_open(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return 0; + + priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); + if (IS_ERR(priv->cm.id)) { + printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + return IS_ERR(priv->cm.id); + } + + ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), + 0, NULL); + if (ret) { + printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); + ib_destroy_cm_id(priv->cm.id); + return ret; + } + return 0; +} + +void ipoib_cm_dev_stop(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_rx *p; + unsigned long flags; + + if (!IPOIB_CM_SUPPORTED(dev->dev_addr)) + return; + + ib_destroy_cm_id(priv->cm.id); + spin_lock_irqsave(&priv->lock, flags); + while (!list_empty(&priv->cm.passive_ids)) { + p = list_entry(priv->cm.passive_ids.next, typeof(*p), list); + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + ib_destroy_cm_id(p->id); + ib_destroy_qp(p->qp); + kfree(p); + spin_lock_irqsave(&priv->lock, flags); + } + spin_unlock_irqrestore(&priv->lock, flags); + + cancel_delayed_work(&priv->cm.stale_task); +} + +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +{ + struct ipoib_cm_tx *p = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_cm_data *data = event->private_data; + struct sk_buff_head skqueue; + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + struct sk_buff *skb; + unsigned long flags; + + p->mtu = be32_to_cpu(data->mtu); + + if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) { + ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n", + p->mtu, priv->dev->mtu); + return -EINVAL; + } + + qp_attr.qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret); + return ret; + } + + qp_attr.rq_psn = 0 /* FIXME */; + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret); + return ret; + } + ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret); + return ret; + } + + skb_queue_head_init(&skqueue); + + spin_lock_irqsave(&priv->lock, flags); + set_bit(IPOIB_FLAG_OPER_UP, &p->flags); + if (p->neigh) + while ((skb = __skb_dequeue(&p->neigh->queue))) + 
__skb_queue_tail(&skqueue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + + while ((skb = __skb_dequeue(&skqueue))) { + skb->dev = p->dev; + if (dev_queue_xmit(skb)) + ipoib_warn(priv, "dev_queue_xmit failed " + "to requeue packet\n"); + } + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + if (ret) { + ipoib_warn(priv, "failed to send RTU: %d\n", ret); + return ret; + } + return 0; +} + +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_init_attr attr = {}; + attr.recv_cq = priv->cq; + attr.srq = priv->cm.srq; + attr.cap.max_send_wr = ipoib_sendq_size; + attr.cap.max_send_sge = 1; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + attr.qp_type = IB_QPT_RC; + attr.send_cq = cq; + return ib_create_qp(priv->pd, &attr); +} + +static int ipoib_cm_send_req(struct net_device *dev, + struct ib_cm_id *id, struct ib_qp *qp, + u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_data data = {}; + struct ib_cm_req_param req = {}; + + data.qpn = cpu_to_be32(priv->qp->qp_num); + data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); + + req.primary_path = pathrec; + req.alternate_path = NULL; + req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn); + req.qp_num = qp->qp_num; + req.qp_type = qp->qp_type; + req.private_data = &data; + req.private_data_len = sizeof data; + req.flow_control = 0; + + req.starting_psn = 0; /* FIXME */ + + /* + * Pick some arbitrary defaults here; we could make these + * module parameters if anyone cared about setting them. + */ + req.responder_resources = 4; + req.remote_cm_response_timeout = 20; + req.local_cm_response_timeout = 20; + req.retry_count = 0; /* RFC draft warns against retries */ + req.rnr_retry_count = 0; /* RFC draft warns against retries */ + req.max_cm_retries = 15; + req.srq = 1; + return ib_send_cm_req(id, &req); +} + +static int ipoib_cm_modify_tx_init(struct net_device *dev, + struct ib_cm_id *cm_id, struct ib_qp *qp) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret; + ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index); + if (ret) { + ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret); + return ret; + } + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE; + qp_attr.port_num = priv->port; + qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + + ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); + if (ret) { + ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret); + return ret; + } + return 0; +} + +static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, + struct ib_sa_path_rec *pathrec) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + int ret; + + p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, + GFP_KERNEL); + if (!p->tx_ring) { + ipoib_warn(priv, "failed to allocate tx ring\n"); + ret = -ENOMEM; + goto err_tx; + } + + p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p, + ipoib_sendq_size + 1); + if (IS_ERR(p->cq)) { + ret = PTR_ERR(p->cq); + ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret); + goto err_cq; + } + + ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP); + if (ret) { + ipoib_warn(priv, "failed to request completion notification: %d\n", ret); + goto err_req_notify; + } + + p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq); + if (IS_ERR(p->qp)) { + ret = PTR_ERR(p->qp); + ipoib_warn(priv, "failed to 
allocate tx qp: %d\n", ret); + goto err_qp; + } + + p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p); + if (IS_ERR(p->id)) { + ret = PTR_ERR(p->id); + ipoib_warn(priv, "failed to create tx cm id: %d\n", ret); + goto err_id; + } + + ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp); + if (ret) { + ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret); + goto err_modify; + } + + ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec); + if (ret) { + ipoib_warn(priv, "failed to send cm req: %d\n", ret); + goto err_send_cm; + } + + ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n", + p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn); + + return 0; + +err_send_cm: +err_modify: + ib_destroy_cm_id(p->id); +err_id: + p->id = NULL; + ib_destroy_qp(p->qp); +err_req_notify: +err_qp: + p->qp = NULL; + ib_destroy_cq(p->cq); +err_cq: + p->cq = NULL; +err_tx: + return ret; +} + +static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) +{ + struct ipoib_dev_priv *priv = netdev_priv(p->dev); + struct ipoib_tx_buf *tx_req; + + ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", + p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); + + if (p->id) + ib_destroy_cm_id(p->id); + + if (p->qp) + ib_destroy_qp(p->qp); + + if (p->cq) + ib_destroy_cq(p->cq); + + if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags)) + netif_wake_queue(p->dev); + + if (p->tx_ring) { + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + } + + kfree(p->tx_ring); + } + + kfree(p); +} + +static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event) +{ + struct ipoib_cm_tx *tx = cm_id->context; + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + unsigned long flags; + int ret; + + switch (event->event) { + case IB_CM_DREQ_RECEIVED: + ipoib_dbg(priv, "DREQ received.\n"); + ib_send_cm_drep(cm_id, NULL, 0); + break; + case IB_CM_REP_RECEIVED: + ipoib_dbg(priv, "REP received.\n"); + ret = ipoib_cm_rep_handler(cm_id, event); + if (ret) + ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + break; + case IB_CM_REQ_ERROR: + case IB_CM_REJ_RECEIVED: + case IB_CM_TIMEWAIT_EXIT: + ipoib_dbg(priv, "CM error %d.\n", event->event); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + neigh = tx->neigh; + + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + + tx->neigh = NULL; + } + + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + } + + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + break; + default: + break; + } + + return 0; +} + +struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path, + struct ipoib_neigh *neigh) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_cm_tx *tx; + + tx = kzalloc(sizeof *tx, GFP_ATOMIC); + if (!tx) + return NULL; + + neigh->cm = tx; + tx->neigh = neigh; + tx->path = path; + tx->dev = dev; + list_add(&tx->list, &priv->cm.start_list); + set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); + queue_work(ipoib_workqueue, &priv->cm.start_task); + return tx; +} + +void 
ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) +{ + struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + list_move(&tx->list, &priv->cm.reap_list); + queue_work(ipoib_workqueue, &priv->cm.reap_task); + ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n", + IPOIB_GID_ARG(tx->neigh->dgid)); + tx->neigh = NULL; + } +} + +static void ipoib_cm_tx_start(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.start_task); + struct net_device *dev = priv->dev; + struct ipoib_neigh *neigh; + struct ipoib_cm_tx *p; + unsigned long flags; + int ret; + + struct ib_sa_path_rec pathrec; + u32 qpn; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while (!list_empty(&priv->cm.start_list)) { + p = list_entry(priv->cm.start_list.next, typeof(*p), list); + list_del_init(&p->list); + neigh = p->neigh; + qpn = IPOIB_QPN(neigh->neighbour->ha); + memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + ret = ipoib_cm_tx_init(p, qpn, &pathrec); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + if (ret) { + neigh = p->neigh; + if (neigh) { + neigh->cm = NULL; + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + } + list_del(&p->list); + kfree(p); + } + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_tx_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.reap_task); + struct ipoib_cm_tx *p; + unsigned long flags; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while (!list_empty(&priv->cm.reap_list)) { + p = list_entry(priv->cm.reap_list.next, typeof(*p), list); + list_del(&p->list); + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + ipoib_cm_tx_destroy(p); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +static void ipoib_cm_skb_reap(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.skb_task); + struct net_device *dev = priv->dev; + struct sk_buff *skb; + unsigned long flags; + + unsigned mtu = priv->mcast_mtu; + + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + while ((skb = skb_dequeue(&priv->cm.skb_queue))) { + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); + if (skb->protocol == htons(ETH_P_IP)) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (skb->protocol == htons(ETH_P_IPV6)) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); +#endif + dev_kfree_skb_any(skb); + spin_lock_irqsave(&priv->tx_lock, flags); + spin_lock(&priv->lock); + } + spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->tx_lock, flags); +} + +void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, + unsigned int mtu) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int e = skb_queue_empty(&priv->cm.skb_queue); + + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + skb_queue_tail(&priv->cm.skb_queue, skb); + if (e) + queue_work(ipoib_workqueue, &priv->cm.skb_task); +} + +static void ipoib_cm_stale_task(struct work_struct *work) +{ + struct 
ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, + cm.stale_task.work); + struct ipoib_cm_rx *p; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + while (!list_empty(&priv->cm.passive_ids)) { + /* List if sorted by LRU, start from tail, + * stop when we see a recently used entry */ + p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list); + if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT)) + break; + list_del_init(&p->list); + spin_unlock_irqrestore(&priv->lock, flags); + ib_destroy_cm_id(p->id); + ib_destroy_qp(p->qp); + kfree(p); + spin_lock_irqsave(&priv->lock, flags); + } + spin_unlock_irqrestore(&priv->lock, flags); +} + + +static ssize_t show_mode(struct device *d, struct device_attribute *attr, + char *buf) +{ + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d)); + + if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) + return sprintf(buf, "connected\n"); + else + return sprintf(buf, "datagram\n"); +} + +static ssize_t set_mode(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *dev = to_net_dev(d); + struct ipoib_dev_priv *priv = netdev_priv(dev); + + /* flush paths if we switch modes so that connections are restarted */ + if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { + set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + ipoib_warn(priv, "enabling connected mode " + "will cause multicast packet drops\n"); + ipoib_flush_paths(dev); + return count; + } + + if (!strcmp(buf, "datagram\n")) { + clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); + dev->mtu = min(priv->mcast_mtu, dev->mtu); + ipoib_flush_paths(dev); + return count; + } + + return -EINVAL; +} + +static DEVICE_ATTR(mode, S_IWUGO | S_IRUGO, show_mode, set_mode); + +int ipoib_cm_add_mode_attr(struct net_device *dev) +{ + return device_create_file(&dev->dev, &dev_attr_mode); +} + +int ipoib_cm_dev_init(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_srq_init_attr srq_init_attr = { + .attr = { + .max_wr = ipoib_recvq_size, + .max_sge = IPOIB_CM_RX_SG + } + }; + int ret, i; + + INIT_LIST_HEAD(&priv->cm.passive_ids); + INIT_LIST_HEAD(&priv->cm.reap_list); + INIT_LIST_HEAD(&priv->cm.start_list); + INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start); + INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap); + INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap); + INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task); + + skb_queue_head_init(&priv->cm.skb_queue); + + priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); + if (IS_ERR(priv->cm.srq)) { + ret = PTR_ERR(priv->cm.srq); + priv->cm.srq = NULL; + return ret; + } + + priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, + GFP_KERNEL); + if (!priv->cm.srq_ring) { + printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n", + priv->ca->name, ipoib_recvq_size); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + + for (i = 0; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].lkey = priv->mr->lkey; + + priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE; + for (i = 1; i < IPOIB_CM_RX_SG; ++i) + priv->cm.rx_sge[i].length = PAGE_SIZE; + priv->cm.rx_wr.next = NULL; + priv->cm.rx_wr.sg_list = priv->cm.rx_sge; + priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG; + + for (i = 0; i < ipoib_recvq_size; ++i) { + if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) { + ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -ENOMEM; + } + if 
(ipoib_cm_post_receive(dev, i)) { + ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i); + ipoib_cm_dev_cleanup(dev); + return -EIO; + } + } + + priv->dev->dev_addr[0] = IPOIB_FLAGS_RC; + return 0; +} + +void ipoib_cm_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int i, ret; + + if (!priv->cm.srq) + return; + + ipoib_dbg(priv, "Cleanup ipoib connected mode.\n"); + + ret = ib_destroy_srq(priv->cm.srq); + if (ret) + ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret); + + priv->cm.srq = NULL; + if (!priv->cm.srq_ring) + return; + for (i = 0; i < ipoib_recvq_size; ++i) + if (priv->cm.srq_ring[i].skb) { + ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping); + dev_kfree_skb_any(priv->cm.srq_ring[i].skb); + priv->cm.srq_ring[i].skb = NULL; + } + kfree(priv->cm.srq_ring); + priv->cm.srq_ring = NULL; +} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 59d9594ed6d..f2aa923ddbe 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -#define IPOIB_OP_RECV (1ul << 31) - static DEFINE_MUTEX(pkey_mutex); struct ipoib_ah *ipoib_create_ah(struct net_device *dev, @@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->tx_lock, flags); ++priv->tx_tail; - if (netif_queue_stopped(dev) && - test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && - priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) + if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) && + priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) { + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); netif_wake_queue(dev); + } spin_unlock_irqrestore(&priv->tx_lock, flags); if (wc->status != IB_WC_SUCCESS && @@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) { - if (wc->wr_id & IPOIB_OP_RECV) + if (wc->wr_id & IPOIB_CM_OP_SRQ) + ipoib_cm_handle_rx_wc(dev, wc); + else if (wc->wr_id & IPOIB_OP_RECV) ipoib_ib_handle_rx_wc(dev, wc); else ipoib_ib_handle_tx_wc(dev, wc); @@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; u64 addr; - if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { + if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, dev->mtu + INFINIBAND_ALEN); + skb->len, priv->mcast_mtu + INFINIBAND_ALEN); ++priv->stats.tx_dropped; ++priv->stats.tx_errors; - dev_kfree_skb_any(skb); + ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); return; } @@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); netif_stop_queue(dev); + set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); } } } @@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev) return -1; } + ret = ipoib_cm_dev_open(dev); + if (ret) { + ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); + ipoib_ib_dev_stop(dev); + return -1; + } + clear_bit(IPOIB_STOP_REAPER, &priv->flags); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); @@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev) 
clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); + ipoib_cm_dev_stop(dev); + /* * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index af5ee2ec449..18d27fd352a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -49,8 +49,6 @@ #include -#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff) - MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); @@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); + clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); + /* * Now flush workqueue to make sure a scheduled task doesn't * bring our internal state back up. @@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) + /* dev->mtu > 2K ==> connected mode */ + if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) { + if (new_mtu > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + dev->mtu = new_mtu; + return 0; + } + + if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) { return -EINVAL; + } priv->admin_mtu = new_mtu; @@ -414,6 +424,20 @@ static void path_rec_completion(int status, memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, sizeof(union ib_gid)); + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, + path, + neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + continue; + } + } + while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } @@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, sizeof(union ib_gid)); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); + if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (!ipoib_cm_get(neigh)) + ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); + if (!ipoib_cm_get(neigh)) { + list_del(&neigh->list); + if (neigh->ah) + ipoib_put_ah(neigh->ah); + ipoib_neigh_free(dev, neigh); + goto err_drop; + } + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else { + ipoib_warn(priv, "queue length limit %d. 
Packet drop.\n", + skb_queue_len(&neigh->queue)); + goto err_drop; + } + } else + ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); } else { neigh->ah = NULL; @@ -538,6 +580,7 @@ err_list: err_path: ipoib_neigh_free(dev, neigh); +err_drop: ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); @@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) neigh = *to_ipoib_neigh(skb->dst->neighbour); - if (likely(neigh->ah)) { + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto out; + } + } else if (neigh->ah) { if (unlikely(memcmp(&neigh->dgid.raw, skb->dst->neighbour->ha + 4, sizeof(union ib_gid)))) { @@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) neigh->neighbour = neighbour; *to_ipoib_neigh(neighbour) = neigh; skb_queue_head_init(&neigh->queue); + ipoib_cm_set(neigh, NULL); return neigh; } @@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) ++priv->stats.tx_dropped; dev_kfree_skb_any(skb); } + if (ipoib_cm_get(neigh)) + ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); kfree(neigh); } @@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format, ipoib_create_debug_files(priv->dev); + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index b04b72ca32e..fea737f520f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work) priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) - IPOIB_ENCAP_LEN; - dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + + if (!ipoib_cm_admin_enabled(dev)) + dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 7b717c648f7..3cb551b8875 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) .qp_type = IB_QPT_UD }; + int ret, size; + priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); return -ENODEV; } - priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, - ipoib_sendq_size + ipoib_recvq_size + 1); + priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(priv->mr)) { + printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); + goto out_free_pd; + } + + size = ipoib_sendq_size + ipoib_recvq_size + 1; + ret = ipoib_cm_dev_init(dev); + if (!ret) + size += ipoib_recvq_size; + + priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size); if (IS_ERR(priv->cq)) { printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); - goto out_free_pd; + goto out_free_mr; } if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP)) goto out_free_cq; - priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(priv->mr)) { - printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name); - goto out_free_cq; - } - init_attr.send_cq = priv->cq; init_attr.recv_cq = priv->cq, priv->qp = 
ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { printk(KERN_WARNING "%s: failed to create QP\n", ca->name); - goto out_free_mr; + goto out_free_cq; } priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; @@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) return 0; -out_free_mr: - ib_dereg_mr(priv->mr); - out_free_cq: ib_destroy_cq(priv->cq); +out_free_mr: + ib_dereg_mr(priv->mr); + out_free_pd: ib_dealloc_pd(priv->pd); return -ENODEV; @@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); } - if (ib_dereg_mr(priv->mr)) - ipoib_warn(priv, "ib_dereg_mr failed\n"); - if (ib_destroy_cq(priv->cq)) ipoib_warn(priv, "ib_cq_destroy failed\n"); + ipoib_cm_dev_cleanup(dev); + + if (ib_dereg_mr(priv->mr)) + ipoib_warn(priv, "ib_dereg_mr failed\n"); + if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 085eafe6667..6762988439d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ipoib_create_debug_files(priv->dev); + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) goto sysfs_failed; -- cgit v1.2.3-70-g09d2 From 6bdd61d876e6eacea5c59230b6b2d988b22793e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Feb 2007 16:21:08 -0800 Subject: IB/mthca: Work around gcc bug on sparc64 For some reason gcc-3.4.5 on sparc64 does: WARNING: "____ilog2_NaN" [drivers/infiniband/hw/mthca/ib_mthca.ko] undefined! Points to note: (1) The asm volatile flush/flushw are just markers for viewing what comes out in the assembly; removing them has no effect on the result. (2) Changing almost anything else in dwh__mthca_arbel_init_srq_context() or dwh__mthca_alloc_srq() causes the problem to go away. The compiler command line issued by the kernel build is: /opt/crosstool/gcc-3.4.5-glibc-2.3.6/sparc64-unknown-linux-gnu/bin/sparc64-unknown-linux-gnu-gcc -fno-strict-aliasing -fno-common -Os -m64 -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wa,--undeclared-regs -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls -fasynchronous-unwind-tables -g -c -o drivers/infiniband/hw/mthca/.tmp_mthca_srq.o drivers/infiniband/hw/mthca/mthca_srq.c This can be reduced to this whilst still retaining the problem: /opt/crosstool/gcc-3.4.5-glibc-2.3.6/sparc64-unknown-linux-gnu/bin/sparc64-unknown-linux-gnu-gcc -m64 -c -o drivers/infiniband/hw/mthca/mthca_srq.o drivers/infiniband/hw/mthca/mthca_srq.c -Os Removing -Os or changing it to -O or -O0 thru -O6 gets rid of the problem. This patch to the kernel code fixes the problem: Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_srq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 10684da33d5..61974b0296c 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -116,11 +116,16 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev, struct mthca_srq *srq, struct mthca_arbel_srq_context *context) { - int logsize; + int logsize, max; memset(context, 0, sizeof *context); - logsize = ilog2(srq->max); + /* + * Put max in a temporary variable to work around gcc bug + * triggered by ilog2() on sparc64. + */ + max = srq->max; + logsize = ilog2(max); context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn); context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); context->db_index = cpu_to_be32(srq->db_index); -- cgit v1.2.3-70-g09d2 From 65e5c0262169a92bdec71a8bb9edb32dab2d8d1f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 5 Feb 2007 16:21:09 -0800 Subject: IB/ehca: Fix memleak on module unloading Percpu data is not freed on module unloading. Cc: Heiko Carstens Cc: Christoph Raisch Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Acked-by: Hoang-Nam Nguyen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ehca/ehca_irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c index c069be8cbcb..6c4f9f91b15 100644 --- a/drivers/infiniband/hw/ehca/ehca_irq.c +++ b/drivers/infiniband/hw/ehca/ehca_irq.c @@ -756,6 +756,8 @@ void ehca_destroy_comp_pool(void) if (cpu_online(i)) destroy_comp_task(pool, i); } + free_percpu(pool->cpu_comp_tasks); + kfree(pool); #endif return; -- cgit v1.2.3-70-g09d2 From aedec08050255db1989a38b59616dd973dfe660b Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 29 Jan 2007 16:41:23 -0800 Subject: RDMA/cma: Increment port number after close to avoid re-use Randomize the starting port number and avoid re-using port values immediately after they are closed. Instead keep track of the last port value used and increment it every time a new port number is assigned, to better replicate other port spaces. Signed-off-by: Roland Dreier --- drivers/infiniband/core/cma.c | 66 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9e0ab048c87..bc31b54e9ca 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -71,6 +71,7 @@ static struct workqueue_struct *cma_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); +static int next_port; struct cma_device { struct list_head list; @@ -1722,33 +1723,74 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; - int port, start, ret; + int port, ret; bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; - start = snum ? 
snum : sysctl_local_port_range[0]; + do { + ret = idr_get_new_above(ps, bind_list, snum, &port); + } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); + + if (ret) + goto err1; + + if (port != snum) { + ret = -EADDRNOTAVAIL; + goto err2; + } + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); + return 0; +err2: + idr_remove(ps, port); +err1: + kfree(bind_list); + return ret; +} +static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +{ + struct rdma_bind_list *bind_list; + int port, ret; + + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; + +retry: do { - ret = idr_get_new_above(ps, bind_list, start, &port); + ret = idr_get_new_above(ps, bind_list, next_port, &port); } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); if (ret) - goto err; + goto err1; - if ((snum && port != snum) || - (!snum && port > sysctl_local_port_range[1])) { - idr_remove(ps, port); + if (port > sysctl_local_port_range[1]) { + if (next_port != sysctl_local_port_range[0]) { + idr_remove(ps, port); + next_port = sysctl_local_port_range[0]; + goto retry; + } ret = -EADDRNOTAVAIL; - goto err; + goto err2; } + if (port == sysctl_local_port_range[1]) + next_port = sysctl_local_port_range[0]; + else + next_port = port + 1; + bind_list->ps = ps; bind_list->port = (unsigned short) port; cma_bind_port(bind_list, id_priv); return 0; -err: +err2: + idr_remove(ps, port); +err1: kfree(bind_list); return ret; } @@ -1811,7 +1853,7 @@ static int cma_get_port(struct rdma_id_private *id_priv) mutex_lock(&lock); if (cma_any_port(&id_priv->id.route.addr.src_addr)) - ret = cma_alloc_port(ps, id_priv, 0); + ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); @@ -2448,6 +2490,10 @@ static int cma_init(void) { int ret; + get_random_bytes(&next_port, sizeof next_port); + next_port = (next_port % (sysctl_local_port_range[1] - + sysctl_local_port_range[0])) + + sysctl_local_port_range[0]; cma_wq = create_singlethread_workqueue("rdma_cm_wq"); if (!cma_wq) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From c7f743a669c27f9c392e78fda8829db9d6d50f43 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 1 Feb 2007 12:23:37 -0800 Subject: IB: Remove redundant "_wq" from workqueue names Signed-off-by: Roland Dreier --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/cma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index d2bb5a9a303..a91001c59b6 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -373,7 +373,7 @@ static struct notifier_block nb = { static int addr_init(void) { - addr_wq = create_singlethread_workqueue("ib_addr_wq"); + addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) return -ENOMEM; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index bc31b54e9ca..db88e609bf4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2494,7 +2494,7 @@ static int cma_init(void) next_port = (next_port % (sysctl_local_port_range[1] - sysctl_local_port_range[0])) + sysctl_local_port_range[0]; - cma_wq = create_singlethread_workqueue("rdma_cm_wq"); + cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From b038ced7b3705bf0ac9b30e118af0f56ab48b847 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 12 Feb 
2007 16:16:18 -0800 Subject: RDMA/cxgb3: Add driver for Chelsio T3 RNIC Add an RDMA/iWARP driver for the Chelsio T3 1GbE and 10GbE adapters. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/Makefile | 1 + drivers/infiniband/hw/cxgb3/Kconfig | 27 + drivers/infiniband/hw/cxgb3/Makefile | 12 + drivers/infiniband/hw/cxgb3/cxio_dbg.c | 207 +++ drivers/infiniband/hw/cxgb3/cxio_hal.c | 1280 ++++++++++++++++ drivers/infiniband/hw/cxgb3/cxio_hal.h | 201 +++ drivers/infiniband/hw/cxgb3/cxio_resource.c | 331 +++++ drivers/infiniband/hw/cxgb3/cxio_resource.h | 70 + drivers/infiniband/hw/cxgb3/cxio_wr.h | 685 +++++++++ drivers/infiniband/hw/cxgb3/iwch.c | 189 +++ drivers/infiniband/hw/cxgb3/iwch.h | 177 +++ drivers/infiniband/hw/cxgb3/iwch_cm.c | 2081 +++++++++++++++++++++++++++ drivers/infiniband/hw/cxgb3/iwch_cm.h | 223 +++ drivers/infiniband/hw/cxgb3/iwch_cq.c | 225 +++ drivers/infiniband/hw/cxgb3/iwch_ev.c | 231 +++ drivers/infiniband/hw/cxgb3/iwch_mem.c | 172 +++ drivers/infiniband/hw/cxgb3/iwch_provider.c | 1203 ++++++++++++++++ drivers/infiniband/hw/cxgb3/iwch_provider.h | 367 +++++ drivers/infiniband/hw/cxgb3/iwch_qp.c | 1007 +++++++++++++ drivers/infiniband/hw/cxgb3/iwch_user.h | 67 + drivers/infiniband/hw/cxgb3/tcb.h | 632 ++++++++ 22 files changed, 9389 insertions(+) create mode 100644 drivers/infiniband/hw/cxgb3/Kconfig create mode 100644 drivers/infiniband/hw/cxgb3/Makefile create mode 100644 drivers/infiniband/hw/cxgb3/cxio_dbg.c create mode 100644 drivers/infiniband/hw/cxgb3/cxio_hal.c create mode 100644 drivers/infiniband/hw/cxgb3/cxio_hal.h create mode 100644 drivers/infiniband/hw/cxgb3/cxio_resource.c create mode 100644 drivers/infiniband/hw/cxgb3/cxio_resource.h create mode 100644 drivers/infiniband/hw/cxgb3/cxio_wr.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cm.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cm.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_cq.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_ev.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_mem.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_provider.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_provider.h create mode 100644 drivers/infiniband/hw/cxgb3/iwch_qp.c create mode 100644 drivers/infiniband/hw/cxgb3/iwch_user.h create mode 100644 drivers/infiniband/hw/cxgb3/tcb.h (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 9edfacee7d8..66b36de9fa6 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -38,6 +38,7 @@ source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ipath/Kconfig" source "drivers/infiniband/hw/ehca/Kconfig" source "drivers/infiniband/hw/amso1100/Kconfig" +source "drivers/infiniband/hw/cxgb3/Kconfig" source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index 2b5d1098ef4..da2066c4f22 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ obj-$(CONFIG_INFINIBAND_IPATH) += hw/ipath/ obj-$(CONFIG_INFINIBAND_EHCA) += hw/ehca/ obj-$(CONFIG_INFINIBAND_AMSO1100) += hw/amso1100/ +obj-$(CONFIG_INFINIBAND_CXGB3) += hw/cxgb3/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ 
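A minimal sketch of how an RDMA hardware driver like this one typically plugs into the IB core, assuming the usual ib_alloc_device()/ib_register_device() flow of this kernel era; the structure and function names below are illustrative only and are not taken from this patch.

#include <rdma/ib_verbs.h>

/* Hypothetical per-adapter state wrapping the embedded ib_device. */
struct example_rnic_dev {
	struct ib_device ibdev;		/* kept first so the cast below is valid */
	void *hw_handle;		/* low-level adapter handle */
};

static int example_rnic_register(void *hw_handle)
{
	struct example_rnic_dev *dev;
	int ret;

	dev = (struct example_rnic_dev *) ib_alloc_device(sizeof(*dev));
	if (!dev)
		return -ENOMEM;
	dev->hw_handle = hw_handle;

	/*
	 * A real driver must also fill in the device name, owner and the
	 * mandatory verbs callbacks (query_device, query_port, ...) here,
	 * or registration is rejected by the core's mandatory-function
	 * check; they are omitted from this skeleton.
	 */
	dev->ibdev.node_type = RDMA_NODE_RNIC;

	ret = ib_register_device(&dev->ibdev);
	if (ret)
		ib_dealloc_device(&dev->ibdev);
	return ret;
}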
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig new file mode 100644 index 00000000000..77977f55dca --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Kconfig @@ -0,0 +1,27 @@ +config INFINIBAND_CXGB3 + tristate "Chelsio RDMA Driver" + depends on CHELSIO_T3 && INFINIBAND && INET + select GENERIC_ALLOCATOR + ---help--- + This is an iWARP/RDMA driver for the Chelsio T3 1GbE and + 10GbE adapters. + + For general information about Chelsio and our products, visit + our website at . + + For customer support, please visit our customer support page at + . + + Please send feedback to . + + To compile this driver as a module, choose M here: the module + will be called iw_cxgb3. + +config INFINIBAND_CXGB3_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_CXGB3 + default n + ---help--- + This option causes the Chelsio RDMA driver to produce copious + amounts of debug messages. Select this if you are developing + the driver or trying to diagnose a problem. diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile new file mode 100644 index 00000000000..0e110f32f12 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/Makefile @@ -0,0 +1,12 @@ +EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3 \ + -I$(TOPDIR)/drivers/infiniband/hw/cxgb3/core + +obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o + +iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \ + iwch_provider.o iwch.o cxio_hal.o cxio_resource.o + +ifdef CONFIG_INFINIBAND_CXGB3_DEBUG +EXTRA_CFLAGS += -DDEBUG +iw_cxgb3-y += cxio_dbg.o +endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c new file mode 100644 index 00000000000..5a7306f5efa --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_dbg.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifdef DEBUG +#include +#include "common.h" +#include "cxgb3_ioctl.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size = 32; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base; + m->len = size; + PDBG("%s TPT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift) +{ + struct ch_mem_range *m; + u64 *data; + int rc; + int size, npages; + + shift += 12; + npages = (len + (1ULL << shift) - 1) >> shift; + size = npages * sizeof(u64); + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = pbl_addr; + m->len = size; + PDBG("%s PBL addr 0x%x len %d depth %d\n", + __FUNCTION__, m->addr, m->len, npages); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_wqe(union t3_wr *wqe) +{ + __be64 *data = (__be64 *)wqe; + uint size = (uint)(be64_to_cpu(*data) & 0xff); + + if (size == 0) + size = 8; + while (size > 0) { + PDBG("WQE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size--; + data++; + } +} + +void cxio_dump_wce(struct t3_cqe *wce) +{ + __be64 *data = (__be64 *)wce; + int size = sizeof(*wce); + + while (size > 0) { + PDBG("WCE %p: %016llx\n", data, + (unsigned long long) be64_to_cpu(*data)); + size -= 8; + data++; + } +} + +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents) +{ + struct ch_mem_range *m; + int size = nents * 64; + u64 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_PMRX; + m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base; + m->len = size; + PDBG("%s RQT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + return; + } + + data = (u64 *)m->buf; + while (size > 0) { + PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data); + size -= 8; + data++; + m->addr += 8; + } + kfree(m); +} + +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid) +{ + struct ch_mem_range *m; + int size = TCB_SIZE; + u32 *data; + int rc; + + m = kmalloc(sizeof(*m) + size, GFP_ATOMIC); + if (!m) { + PDBG("%s couldn't allocate memory.\n", __FUNCTION__); + return; + } + m->mem_id = MEM_CM; + m->addr = hwtid * size; + m->len = size; + PDBG("%s TCB %d len %d\n", __FUNCTION__, m->addr, m->len); + rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m); + if (rc) { + PDBG("%s toectl returned error %d\n", __FUNCTION__, rc); + kfree(m); + 
return; + } + + data = (u32 *)m->buf; + while (size > 0) { + printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n", + m->addr, + *(data+2), *(data+3), *(data),*(data+1), + *(data+6), *(data+7), *(data+4), *(data+5)); + size -= 32; + data += 8; + m->addr += 32; + } + kfree(m); +} +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c new file mode 100644 index 00000000000..82fa7204198 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include +#include +#include +#include +#include + +#include "cxio_resource.h" +#include "cxio_hal.h" +#include "cxgb3_offload.h" +#include "sge_defs.h" + +static LIST_HEAD(rdev_list); +static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL; + +static inline struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (!strcmp(rdev->dev_name, dev_name)) + return rdev; + return NULL; +} + +static inline struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev + *tdev) +{ + struct cxio_rdev *rdev; + + list_for_each_entry(rdev, &rdev_list, entry) + if (rdev->t3cdev_p == tdev) + return rdev; + return NULL; +} + +int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit) +{ + int ret; + struct t3_cqe *cqe; + u32 rptr; + + struct rdma_cq_op setup; + setup.id = cq->cqid; + setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0; + setup.op = op; + ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup); + + if ((ret < 0) || (op == CQ_CREDIT_UPDATE)) + return ret; + + /* + * If the rearm returned an index other than our current index, + * then there might be CQE's in flight (being DMA'd). We must wait + * here for them to complete or the consumer can miss a notification. + */ + if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) { + int i=0; + + rptr = cq->rptr; + + /* + * Keep the generation correct by bumping rptr until it + * matches the index returned by the rearm - 1. 
+ */ + while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret) + rptr++; + + /* + * Now rptr is the index for the (last) cqe that was + * in-flight at the time the HW rearmed the CQ. We + * spin until that CQE is valid. + */ + cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2); + while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) { + udelay(1); + if (i++ > 1000000) { + BUG_ON(1); + printk(KERN_ERR "%s: stalled rnic\n", + rdev_p->dev_name); + return -EIO; + } + } + } + return 0; +} + +static inline int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid) +{ + struct rdma_cq_setup setup; + setup.id = cqid; + setup.base_addr = 0; /* NULL address */ + setup.size = 0; /* disaable the CQ */ + setup.credits = 0; + setup.credit_thres = 0; + setup.ovfl_mode = 0; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid) +{ + u64 sge_cmd; + struct t3_modify_qp_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + if (!skb) { + PDBG("%s alloc_skb failed\n", __FUNCTION__); + return -ENOMEM; + } + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 1, qpid, 7); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = qpid << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + skb->priority = CPL_PRIORITY_CONTROL; + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe); + + cq->cqid = cxio_hal_get_cqid(rdev_p->rscp); + if (!cq->cqid) + return -ENOMEM; + cq->sw_queue = kzalloc(size, GFP_KERNEL); + if (!cq->sw_queue) + return -ENOMEM; + cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) * + sizeof(struct t3_cqe), + &(cq->dma_addr), GFP_KERNEL); + if (!cq->queue) { + kfree(cq->sw_queue); + return -ENOMEM; + } + pci_unmap_addr_set(cq, mapping, cq->dma_addr); + memset(cq->queue, 0, size); + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = 65535; + setup.credit_thres = 1; + if (rdev_p->t3cdev_p->type == T3B) + setup.ovfl_mode = 0; + else + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + struct rdma_cq_setup setup; + setup.id = cq->cqid; + setup.base_addr = (u64) (cq->dma_addr); + setup.size = 1UL << cq->size_log2; + setup.credits = setup.size; + setup.credit_thres = setup.size; /* TBD: overflow recovery */ + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + u32 qpid; + int i; + + mutex_lock(&uctx->lock); + if (!list_empty(&uctx->qpids)) { + entry = list_entry(uctx->qpids.next, struct cxio_qpid_list, + entry); + list_del(&entry->entry); + qpid = entry->qpid; + kfree(entry); + } else { + qpid = cxio_hal_get_qpid(rdev_p->rscp); + if (!qpid) + goto out; + for (i = qpid+1; i & rdev_p->qpmask; i++) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + break; + entry->qpid = i; + list_add_tail(&entry->entry, &uctx->qpids); + } + } +out: + mutex_unlock(&uctx->lock); + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + return qpid; +} + +static void put_qpid(struct cxio_rdev *rdev_p, 
u32 qpid, + struct cxio_ucontext *uctx) +{ + struct cxio_qpid_list *entry; + + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) + return; + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + entry->qpid = qpid; + mutex_lock(&uctx->lock); + list_add_tail(&entry->entry, &uctx->qpids); + mutex_unlock(&uctx->lock); +} + +void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + struct list_head *pos, *nxt; + struct cxio_qpid_list *entry; + + mutex_lock(&uctx->lock); + list_for_each_safe(pos, nxt, &uctx->qpids) { + entry = list_entry(pos, struct cxio_qpid_list, entry); + list_del_init(&entry->entry); + if (!(entry->qpid & rdev_p->qpmask)) + cxio_hal_put_qpid(rdev_p->rscp, entry->qpid); + kfree(entry); + } + mutex_unlock(&uctx->lock); +} + +void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx) +{ + INIT_LIST_HEAD(&uctx->qpids); + mutex_init(&uctx->lock); +} + +int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain, + struct t3_wq *wq, struct cxio_ucontext *uctx) +{ + int depth = 1UL << wq->size_log2; + int rqsize = 1UL << wq->rq_size_log2; + + wq->qpid = get_qpid(rdev_p, uctx); + if (!wq->qpid) + return -ENOMEM; + + wq->rq = kzalloc(depth * sizeof(u64), GFP_KERNEL); + if (!wq->rq) + goto err1; + + wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize); + if (!wq->rq_addr) + goto err2; + + wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL); + if (!wq->sq) + goto err3; + + wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), + depth * sizeof(union t3_wr), + &(wq->dma_addr), GFP_KERNEL); + if (!wq->queue) + goto err4; + + memset(wq->queue, 0, depth * sizeof(union t3_wr)); + pci_unmap_addr_set(wq, mapping, wq->dma_addr); + wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + if (!kernel_domain) + wq->udb = (u64)rdev_p->rnic_info.udbell_physbase + + (wq->qpid << rdev_p->qpshift); + PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __FUNCTION__, + wq->qpid, wq->doorbell, (unsigned long long) wq->udb); + return 0; +err4: + kfree(wq->sq); +err3: + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize); +err2: + kfree(wq->rq); +err1: + put_qpid(rdev_p, wq->qpid, uctx); + return -ENOMEM; +} + +int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +{ + int err; + err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + kfree(cq->sw_queue); + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (cq->size_log2)) + * sizeof(struct t3_cqe), cq->queue, + pci_unmap_addr(cq, mapping)); + cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); + return err; +} + +int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, + struct cxio_ucontext *uctx) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << (wq->size_log2)) + * sizeof(union t3_wr), wq->queue, + pci_unmap_addr(wq, mapping)); + kfree(wq->sq); + cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2)); + kfree(wq->rq); + put_qpid(rdev_p, wq->qpid, uctx); + return 0; +} + +static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(T3_SEND) | + V_CQE_TYPE(0) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ 
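	/*
	 * Synthesize a TPT_ERR_SWFLUSH software CQE for every receive WR
	 * still outstanding beyond the first 'count' entries the caller has
	 * already accounted for, so the consumer can reap the flushed WRs
	 * from the software CQ.
	 */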
+ u32 ptr; + + PDBG("%s wq %p cq %p\n", __FUNCTION__, wq, cq); + + /* flush RQ */ + PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __FUNCTION__, + wq->rq_rptr, wq->rq_wptr, count); + ptr = wq->rq_rptr + count; + while (ptr++ != wq->rq_wptr) + insert_recv_cqe(wq, cq); +} + +static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq, + struct t3_swsq *sqp) +{ + struct t3_cqe cqe; + + PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__, + wq, cq, cq->sw_rptr, cq->sw_wptr); + memset(&cqe, 0, sizeof(cqe)); + cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) | + V_CQE_OPCODE(sqp->opcode) | + V_CQE_TYPE(1) | + V_CQE_SWCQE(1) | + V_CQE_QPID(wq->qpid) | + V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr, + cq->size_log2))); + cqe.u.scqe.wrid_hi = sqp->sq_wptr; + + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe; + cq->sw_wptr++; +} + +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count) +{ + __u32 ptr; + struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2); + + ptr = wq->sq_rptr + count; + sqp += count; + while (ptr != wq->sq_wptr) { + insert_sq_cqe(wq, cq, sqp); + sqp++; + ptr++; + } +} + +/* + * Move all CQEs from the HWCQ into the SWCQ. + */ +void cxio_flush_hw_cq(struct t3_cq *cq) +{ + struct t3_cqe *cqe, *swcqe; + + PDBG("%s cq %p cqid 0x%x\n", __FUNCTION__, cq, cq->cqid); + cqe = cxio_next_hw_cqe(cq); + while (cqe) { + PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n", + __FUNCTION__, cq->rptr, cq->sw_wptr); + swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2); + *swcqe = *cqe; + swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1)); + cq->sw_wptr++; + cq->rptr++; + cqe = cxio_next_hw_cqe(cq); + } +} + +static inline int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq) +{ + if (CQE_OPCODE(*cqe) == T3_TERMINATE) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe)) + return 0; + + if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) + return 0; + + return 1; +} + +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) && + (CQE_QPID(*cqe) == wq->qpid)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count); +} + +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count) +{ + struct t3_cqe *cqe; + u32 ptr; + + *count = 0; + PDBG("%s count zero %d\n", __FUNCTION__, *count); + ptr = cq->sw_rptr; + while (!Q_EMPTY(ptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2)); + if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) && + (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq)) + (*count)++; + ptr++; + } + PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count); +} + +static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p) +{ + struct rdma_cq_setup setup; + setup.id = 0; + setup.base_addr = 0; /* NULL address */ + setup.size = 1; /* enable the CQ */ + setup.credits = 0; + + /* force SGE to redirect to RspQ and interrupt */ + setup.credit_thres = 0; + setup.ovfl_mode = 1; + return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup)); +} + +static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p) +{ + int err; + u64 sge_cmd, ctx0, ctx1; + u64 base_addr; + struct t3_modify_qp_wr *wqe; + 
struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL); + + + if (!skb) { + PDBG("%s alloc_skb failed\n", __FUNCTION__); + return -ENOMEM; + } + err = cxio_hal_init_ctrl_cq(rdev_p); + if (err) { + PDBG("%s err %d initializing ctrl_cq\n", __FUNCTION__, err); + return err; + } + rdev_p->ctrl_qp.workq = dma_alloc_coherent( + &(rdev_p->rnic_info.pdev->dev), + (1 << T3_CTRL_QP_SIZE_LOG2) * + sizeof(union t3_wr), + &(rdev_p->ctrl_qp.dma_addr), + GFP_KERNEL); + if (!rdev_p->ctrl_qp.workq) { + PDBG("%s dma_alloc_coherent failed\n", __FUNCTION__); + return -ENOMEM; + } + pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping, + rdev_p->ctrl_qp.dma_addr); + rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr; + memset(rdev_p->ctrl_qp.workq, 0, + (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr)); + + mutex_init(&rdev_p->ctrl_qp.lock); + init_waitqueue_head(&rdev_p->ctrl_qp.waitq); + + /* update HW Ctrl QP context */ + base_addr = rdev_p->ctrl_qp.dma_addr; + base_addr >>= 12; + ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) | + V_EC_BASE_LO((u32) base_addr & 0xffff)); + ctx0 <<= 32; + ctx0 |= V_EC_CREDITS(FW_WR_NUM); + base_addr >>= 16; + ctx1 = (u32) base_addr; + base_addr >>= 32; + ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) | + V_EC_TYPE(0) | V_EC_GEN(1) | + V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32; + wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe)); + memset(wqe, 0, sizeof(*wqe)); + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 1, + T3_CTL_QP_TID, 7); + wqe->flags = cpu_to_be32(MODQP_WRITE_EC); + sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3; + wqe->sge_cmd = cpu_to_be64(sge_cmd); + wqe->ctx1 = cpu_to_be64(ctx1); + wqe->ctx0 = cpu_to_be64(ctx0); + PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n", + (unsigned long long) rdev_p->ctrl_qp.dma_addr, + rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2); + skb->priority = CPL_PRIORITY_CONTROL; + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) +{ + dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), + (1UL << T3_CTRL_QP_SIZE_LOG2) + * sizeof(union t3_wr), rdev_p->ctrl_qp.workq, + pci_unmap_addr(&rdev_p->ctrl_qp, mapping)); + return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID); +} + +/* write len bytes of data into addr (32B aligned address) + * If data is NULL, clear len byte of memory to zero. + * caller aquires the ctrl_qp lock before the call + */ +static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, + u32 len, void *data, int completion) +{ + u32 i, nr_wqe, copy_len; + u8 *copy_data; + u8 wr_len, utx_len; /* lenght in 8 byte flit */ + enum t3_wr_flags flag; + __be64 *wqe; + u64 utx_cmd; + addr &= 0x7FFFFFF; + nr_wqe = len % 96 ? 
len / 96 + 1 : len / 96; /* 96B max per WQE */ + PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n", + __FUNCTION__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len, + nr_wqe, data, addr); + utx_len = 3; /* in 32B unit */ + for (i = 0; i < nr_wqe; i++) { + if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2)) { + PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, " + "wait for more space i %d\n", __FUNCTION__, + rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i); + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + !Q_FULL(rdev_p->ctrl_qp.rptr, + rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2))) { + PDBG("%s ctrl_qp workq interrupted\n", + __FUNCTION__); + return -ERESTARTSYS; + } + PDBG("%s ctrl_qp wakeup, continue posting work request " + "i %d\n", __FUNCTION__, i); + } + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + flag = 0; + if (i == (nr_wqe - 1)) { + /* last WQE */ + flag = completion ? T3_COMPLETION_FLAG : 0; + if (len % 32) + utx_len = len / 32 + 1; + else + utx_len = len / 32; + } + + /* + * Force a CQE to return the credit to the workq in case + * we posted more than half the max QP size of WRs + */ + if ((i != 0) && + (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) { + flag = T3_COMPLETION_FLAG; + PDBG("%s force completion at i %d\n", __FUNCTION__, i); + } + + /* build the utx mem command */ + wqe += (sizeof(struct t3_bypass_wr) >> 3); + utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3); + utx_cmd <<= 32; + utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1); + *wqe = cpu_to_be64(utx_cmd); + wqe++; + copy_data = (u8 *) data + i * 96; + copy_len = len > 96 ? 96 : len; + + /* clear memory content if data is NULL */ + if (data) + memcpy(wqe, copy_data, copy_len); + else + memset(wqe, 0, copy_len); + if (copy_len % 32) + memset(((u8 *) wqe) + copy_len, 0, + 32 - (copy_len % 32)); + wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 + + (utx_len << 2); + wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr % + (1 << T3_CTRL_QP_SIZE_LOG2))); + + /* wptr in the WRID[31:0] */ + ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr; + + /* + * This must be the last write with a memory barrier + * for the genbit + */ + build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag, + Q_GENBIT(rdev_p->ctrl_qp.wptr, + T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID, + wr_len); + if (flag == T3_COMPLETION_FLAG) + ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID); + len -= 96; + rdev_p->ctrl_qp.wptr++; + } + return 0; +} + +/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size + * OUT: stag index, actual pbl_size, pbl_addr allocated. 
+ * TBD: shared memory region support + */ +static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, + u32 *stag, u8 stag_state, u32 pdid, + enum tpt_mem_type type, enum tpt_mem_perm perm, + u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, + u32 *pbl_size, u32 *pbl_addr) +{ + int err; + struct tpt_entry tpt; + u32 stag_idx; + u32 wptr; + int rereg = (*stag != T3_STAG_UNSET); + + stag_state = stag_state > 0; + stag_idx = (*stag) >> 8; + + if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) { + stag_idx = cxio_hal_get_stag(rdev_p->rscp); + if (!stag_idx) + return -ENOMEM; + *stag = (stag_idx << 8) | ((*stag) & 0xFF); + } + PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", + __FUNCTION__, stag_state, type, pdid, stag_idx); + + if (reset_tpt_entry) + cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); + else if (!rereg) { + *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); + if (!*pbl_addr) { + return -ENOMEM; + } + } + + mutex_lock(&rdev_p->ctrl_qp.lock); + + /* write PBL first if any - update pbl only if pbl list exist */ + if (pbl) { + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base, + *pbl_size); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + (*pbl_addr >> 5), + (*pbl_size << 3), pbl, 0); + if (err) + goto ret; + } + + /* write TPT entry */ + if (reset_tpt_entry) + memset(&tpt, 0, sizeof(tpt)); + else { + tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID | + V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) | + V_TPT_STAG_STATE(stag_state) | + V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid)); + BUG_ON(page_size >= 28); + tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) | + F_TPT_MW_BIND_ENABLE | + V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | + V_TPT_PAGE_SIZE(page_size)); + tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : + cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + tpt.len = cpu_to_be32(len); + tpt.va_hi = cpu_to_be32((u32) (to >> 32)); + tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); + tpt.rsvd_bind_cnt_or_pstag = 0; + tpt.rsvd_pbl_size = reset_tpt_entry ? 0 : + cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + } + err = cxio_hal_ctrl_qp_write_mem(rdev_p, + stag_idx + + (rdev_p->rnic_info.tpt_base >> 5), + sizeof(tpt), &tpt, 1); + + /* release the stag index to free pool */ + if (reset_tpt_entry) + cxio_hal_put_stag(rdev_p->rscp, stag_idx); +ret: + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (!err) + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + return err; +} + +/* IN : stag key, pdid, pbl_size + * Out: stag index, actaul pbl_size, and pbl_addr allocated. 
+ */ +int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr) +{ + *stag = T3_STAG_UNSET; + return (__cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR, + perm, 0, 0ULL, 0, 0, NULL, pbl_size, pbl_addr)); +} + +int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, + zbva, to, len, page_size, pbl, pbl_size, pbl_addr); +} + +int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, + u32 pbl_addr) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + &pbl_size, &pbl_addr); +} + +int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) +{ + u32 pbl_size = 0; + *stag = T3_STAG_UNSET; + return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, + NULL, &pbl_size, NULL); +} + +int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) +{ + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, + NULL, NULL); +} + +int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) +{ + struct t3_rdma_init_wr *wqe; + struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + PDBG("%s rdev_p %p\n", __FUNCTION__, rdev_p); + wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe)); + wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT)); + wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) | + V_FW_RIWR_LEN(sizeof(*wqe) >> 3)); + wqe->wrid.id1 = 0; + wqe->qpid = cpu_to_be32(attr->qpid); + wqe->pdid = cpu_to_be32(attr->pdid); + wqe->scqid = cpu_to_be32(attr->scqid); + wqe->rcqid = cpu_to_be32(attr->rcqid); + wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base); + wqe->rq_size = cpu_to_be32(attr->rq_size); + wqe->mpaattrs = attr->mpaattrs; + wqe->qpcaps = attr->qpcaps; + wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss); + wqe->flags = cpu_to_be32(attr->flags); + wqe->ord = cpu_to_be32(attr->ord); + wqe->ird = cpu_to_be32(attr->ird); + wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr); + wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size); + wqe->rsvd = 0; + skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */ + return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb)); +} + +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = ev_cb; +} + +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb) +{ + cxio_ev_cb = NULL; +} + +static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb) +{ + static int cnt; + struct cxio_rdev *rdev_p = NULL; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x" + " se %0x notify %0x cqbranch %0x creditth %0x\n", + cnt, __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg), + RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg), + RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg), + RSPQ_CREDIT_THRESH(rsp_msg)); + PDBG("CQE: QPID 0x%0x genbit %0x type 
0x%0x status 0x%0x opcode %d " + "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + rdev_p = (struct cxio_rdev *)t3cdev_p->ulp; + if (!rdev_p) { + PDBG("%s called by t3cdev %p with null ulp\n", __FUNCTION__, + t3cdev_p); + return 0; + } + if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) { + rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1; + wake_up_interruptible(&rdev_p->ctrl_qp.waitq); + dev_kfree_skb_irq(skb); + } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8) + dev_kfree_skb_irq(skb); + else if (cxio_ev_cb) + (*cxio_ev_cb) (rdev_p, skb); + else + dev_kfree_skb_irq(skb); + cnt++; + return 0; +} + +/* Caller takes care of locking if needed */ +int cxio_rdev_open(struct cxio_rdev *rdev_p) +{ + struct net_device *netdev_p = NULL; + int err = 0; + if (strlen(rdev_p->dev_name)) { + if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) { + return -EBUSY; + } + netdev_p = dev_get_by_name(rdev_p->dev_name); + if (!netdev_p) { + return -EINVAL; + } + dev_put(netdev_p); + } else if (rdev_p->t3cdev_p) { + if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) { + return -EBUSY; + } + netdev_p = rdev_p->t3cdev_p->lldev; + strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name, + T3_MAX_DEV_NAME_LEN); + } else { + PDBG("%s t3cdev_p or dev_name must be set\n", __FUNCTION__); + return -EINVAL; + } + + list_add_tail(&rdev_p->entry, &rdev_list); + + PDBG("%s opening rnic dev %s\n", __FUNCTION__, rdev_p->dev_name); + memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp)); + if (!rdev_p->t3cdev_p) + rdev_p->t3cdev_p = T3CDEV(netdev_p); + rdev_p->t3cdev_p->ulp = (void *) rdev_p; + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS, + &(rdev_p->rnic_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS, + &(rdev_p->port_info)); + if (err) { + printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n", + __FUNCTION__, rdev_p->t3cdev_p, err); + goto err1; + } + + /* + * qpshift is the number of bits to shift the qpid left in order + * to get the correct address of the doorbell for that qp. 
+ */ + cxio_init_ucontext(rdev_p, &rdev_p->uctx); + rdev_p->qpshift = PAGE_SHIFT - + ilog2(65536 >> + ilog2(rdev_p->rnic_info.udbell_len >> + PAGE_SHIFT)); + rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT; + rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1; + PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d " + "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n", + __FUNCTION__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base, + rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p), + rdev_p->rnic_info.pbl_base, + rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base, + rdev_p->rnic_info.rqt_top); + PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu " + "qpnr %d qpmask 0x%x\n", + rdev_p->rnic_info.udbell_len, + rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr, + rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask); + + err = cxio_hal_init_ctrl_qp(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing ctrl_qp.\n", + __FUNCTION__, err); + goto err1; + } + err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0, + 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ, + T3_MAX_NUM_PD); + if (err) { + printk(KERN_ERR "%s error %d initializing hal resources.\n", + __FUNCTION__, err); + goto err2; + } + err = cxio_hal_pblpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing pbl mem pool.\n", + __FUNCTION__, err); + goto err3; + } + err = cxio_hal_rqtpool_create(rdev_p); + if (err) { + printk(KERN_ERR "%s error %d initializing rqt mem pool.\n", + __FUNCTION__, err); + goto err4; + } + return 0; +err4: + cxio_hal_pblpool_destroy(rdev_p); +err3: + cxio_hal_destroy_resource(rdev_p->rscp); +err2: + cxio_hal_destroy_ctrl_qp(rdev_p); +err1: + list_del(&rdev_p->entry); + return err; +} + +void cxio_rdev_close(struct cxio_rdev *rdev_p) +{ + if (rdev_p) { + cxio_hal_pblpool_destroy(rdev_p); + cxio_hal_rqtpool_destroy(rdev_p); + list_del(&rdev_p->entry); + rdev_p->t3cdev_p->ulp = NULL; + cxio_hal_destroy_ctrl_qp(rdev_p); + cxio_hal_destroy_resource(rdev_p->rscp); + } +} + +int __init cxio_hal_init(void) +{ + if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI)) + return -ENOMEM; + t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler); + return 0; +} + +void __exit cxio_hal_exit(void) +{ + struct cxio_rdev *rdev, *tmp; + + t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL); + list_for_each_entry_safe(rdev, tmp, &rdev_list, entry) + cxio_rdev_close(rdev); + cxio_hal_destroy_rhdl_resource(); +} + +static inline void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq) +{ + struct t3_swsq *sqp; + __u32 ptr = wq->sq_rptr; + int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr); + + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + while (count--) + if (!sqp->signaled) { + ptr++; + sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2); + } else if (sqp->complete) { + + /* + * Insert this completed cqe into the swcq. 
+ */ + PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n", + __FUNCTION__, Q_PTR2IDX(ptr, wq->sq_size_log2), + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)); + sqp->cqe.header |= htonl(V_CQE_SWCQE(1)); + *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) + = sqp->cqe; + cq->sw_wptr++; + sqp->signaled = 0; + break; + } else + break; +} + +static inline void create_read_req_cqe(struct t3_wq *wq, + struct t3_cqe *hw_cqe, + struct t3_cqe *read_cqe) +{ + read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr; + read_cqe->len = wq->oldest_read->read_len; + read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) | + V_CQE_SWCQE(SW_CQE(*hw_cqe)) | + V_CQE_OPCODE(T3_READ_REQ) | + V_CQE_TYPE(1)); +} + +/* + * Return a ptr to the next read wr in the SWSQ or NULL. + */ +static inline void advance_oldest_read(struct t3_wq *wq) +{ + + u32 rptr = wq->oldest_read - wq->sq + 1; + u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2); + + while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) { + wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2); + + if (wq->oldest_read->opcode == T3_READ_REQ) + return; + rptr++; + } + wq->oldest_read = NULL; +} + +/* + * cxio_poll_cq + * + * Caller must: + * check the validity of the first CQE, + * supply the wq assicated with the qpid. + * + * credit: cq credit to return to sge. + * cqe_flushed: 1 iff the CQE is flushed. + * cqe: copy of the polled CQE. + * + * return value: + * 0 CQE returned, + * -1 CQE skipped, try again. + */ +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit) +{ + int ret = 0; + struct t3_cqe *hw_cqe, read_cqe; + + *cqe_flushed = 0; + *credit = 0; + hw_cqe = cxio_next_cqe(cq); + + PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x" + " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n", + __FUNCTION__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe), + CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe), + CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe), + CQE_WRID_LOW(*hw_cqe)); + + /* + * skip cqe's not affiliated with a QP. + */ + if (wq == NULL) { + ret = -1; + goto skip_cqe; + } + + /* + * Gotta tweak READ completions: + * 1) the cqe doesn't contain the sq_wptr from the wr. + * 2) opcode not reflected from the wr. + * 3) read_len not reflected from the wr. + * 4) cq_type is RQ_TYPE not SQ_TYPE. + */ + if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) { + + /* + * Don't write to the HWCQ, so create a new read req CQE + * in local memory. + */ + create_read_req_cqe(wq, hw_cqe, &read_cqe); + hw_cqe = &read_cqe; + advance_oldest_read(wq); + } + + /* + * T3A: Discard TERMINATE CQEs. + */ + if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) { + ret = -1; + wq->error = 1; + goto skip_cqe; + } + + if (CQE_STATUS(*hw_cqe) || wq->error) { + *cqe_flushed = wq->error; + wq->error = 1; + + /* + * T3A inserts errors into the CQE. We cannot return + * these as work completions. + */ + /* incoming write failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE) + && RQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + /* incoming read request failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) { + ret = -1; + goto skip_cqe; + } + + /* incoming SEND with no receive posted failures */ + if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) && + Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) { + ret = -1; + goto skip_cqe; + } + goto proc_cqe; + } + + /* + * RECV completion. + */ + if (RQ_TYPE(*hw_cqe)) { + + /* + * HW only validates 4 bits of MSN. 
So we must validate that + * the MSN in the SEND is the next expected MSN. If its not, + * then we complete this with TPT_ERR_MSN and mark the wq in + * error. + */ + if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) { + wq->error = 1; + hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN)); + goto proc_cqe; + } + goto proc_cqe; + } + + /* + * If we get here its a send completion. + * + * Handle out of order completion. These get stuffed + * in the SW SQ. Then the SW SQ is walked to move any + * now in-order completions into the SW CQ. This handles + * 2 cases: + * 1) reaping unsignaled WRs when the first subsequent + * signaled WR is completed. + * 2) out of order read completions. + */ + if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) { + struct t3_swsq *sqp; + + PDBG("%s out of order completion going in swsq at idx %ld\n", + __FUNCTION__, + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2)); + sqp = wq->sq + + Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2); + sqp->cqe = *hw_cqe; + sqp->complete = 1; + ret = -1; + goto flush_wq; + } + +proc_cqe: + *cqe = *hw_cqe; + + /* + * Reap the associated WR(s) that are freed up with this + * completion. + */ + if (SQ_TYPE(*hw_cqe)) { + wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe); + PDBG("%s completing sq idx %ld\n", __FUNCTION__, + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)); + *cookie = (wq->sq + + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id; + wq->sq_rptr++; + } else { + PDBG("%s completing rq idx %ld\n", __FUNCTION__, + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)); + wq->rq_rptr++; + } + +flush_wq: + /* + * Flush any completed cqes that are now in-order. + */ + flush_completed_wrs(wq, cq); + +skip_cqe: + if (SW_CQE(*hw_cqe)) { + PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n", + __FUNCTION__, cq, cq->cqid, cq->sw_rptr); + ++cq->sw_rptr; + } else { + PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n", + __FUNCTION__, cq, cq->cqid, cq->rptr); + ++cq->rptr; + + /* + * T3A: compute credits. + */ + if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1))) + || ((cq->rptr - cq->wptr) >= 128)) { + *credit = cq->rptr - cq->wptr; + cq->wptr = cq->rptr; + } + } + return ret; +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h new file mode 100644 index 00000000000..1b97e80b878 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_HAL_H__ +#define __CXIO_HAL_H__ + +#include +#include + +#include "t3_cpl.h" +#include "t3cdev.h" +#include "cxgb3_ctl_defs.h" +#include "cxio_wr.h" + +#define T3_CTRL_QP_ID FW_RI_SGEEC_START +#define T3_CTL_QP_TID FW_RI_TID_START +#define T3_CTRL_QP_SIZE_LOG2 8 +#define T3_CTRL_CQ_ID 0 + +/* TBD */ +#define T3_MAX_NUM_RI (1<<15) +#define T3_MAX_NUM_QP (1<<15) +#define T3_MAX_NUM_CQ (1<<15) +#define T3_MAX_NUM_PD (1<<15) +#define T3_MAX_PBL_SIZE 256 +#define T3_MAX_RQ_SIZE 1024 +#define T3_MAX_NUM_STAG (1<<15) + +#define T3_STAG_UNSET 0xffffffff + +#define T3_MAX_DEV_NAME_LEN 32 + +struct cxio_hal_ctrl_qp { + u32 wptr; + u32 rptr; + struct mutex lock; /* for the wtpr, can sleep */ + wait_queue_head_t waitq;/* wait for RspQ/CQE msg */ + union t3_wr *workq; /* the work request queue */ + dma_addr_t dma_addr; /* pci bus address of the workq */ + DECLARE_PCI_UNMAP_ADDR(mapping) + void __iomem *doorbell; +}; + +struct cxio_hal_resource { + struct kfifo *tpt_fifo; + spinlock_t tpt_fifo_lock; + struct kfifo *qpid_fifo; + spinlock_t qpid_fifo_lock; + struct kfifo *cqid_fifo; + spinlock_t cqid_fifo_lock; + struct kfifo *pdid_fifo; + spinlock_t pdid_fifo_lock; +}; + +struct cxio_qpid_list { + struct list_head entry; + u32 qpid; +}; + +struct cxio_ucontext { + struct list_head qpids; + struct mutex lock; +}; + +struct cxio_rdev { + char dev_name[T3_MAX_DEV_NAME_LEN]; + struct t3cdev *t3cdev_p; + struct rdma_info rnic_info; + struct adap_ports port_info; + struct cxio_hal_resource *rscp; + struct cxio_hal_ctrl_qp ctrl_qp; + void *ulp; + unsigned long qpshift; + u32 qpnr; + u32 qpmask; + struct cxio_ucontext uctx; + struct gen_pool *pbl_pool; + struct gen_pool *rqt_pool; + struct list_head entry; +}; + +static inline int cxio_num_stags(struct cxio_rdev *rdev_p) +{ + return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5)); +} + +typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p, + struct sk_buff * skb); + +#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff) +#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff) +#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1) +#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1) +#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1) +#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1) +#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1) +#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1) +#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1) + +struct respQ_msg_t { + __be32 flags; /* flit 0 */ + __be32 cq_ptrid; + __be64 rsvd; /* flit 1 */ + struct t3_cqe cqe; /* flits 2-3 */ +}; + +enum t3_cq_opcode { + CQ_ARM_AN = 0x2, + CQ_ARM_SE = 0x6, + CQ_FORCE_AN = 0x3, + CQ_CREDIT_UPDATE = 0x7 +}; + +int cxio_rdev_open(struct cxio_rdev *rdev); +void cxio_rdev_close(struct cxio_rdev *rdev); +int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, + enum t3_cq_opcode op, u32 credit); +int 
cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev, u32 qpid); +int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); +int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, + struct cxio_ucontext *uctx); +int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_allocate_stag(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr); +int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, + enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, + u8 page_size, __be64 *pbl, u32 *pbl_size, + u32 *pbl_addr); +int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, + u32 pbl_addr); +int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); +int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag); +int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr); +void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb); +u32 cxio_hal_get_rhdl(void); +void cxio_hal_put_rhdl(u32 rhdl); +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp); +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid); +int __init cxio_hal_init(void); +void __exit cxio_hal_exit(void); +void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count); +void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count); +void cxio_flush_hw_cq(struct t3_cq *cq); +int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe, + u8 *cqe_flushed, u64 *cookie, u32 *credit); + +#define MOD "iw_cxgb3: " +#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args) + +#ifdef DEBUG +void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag); +void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift); +void cxio_dump_wqe(union t3_wr *wqe); +void cxio_dump_wce(struct t3_cqe *wce); +void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents); +void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid); +#endif + +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c new file mode 100644 index 00000000000..997aa32cbf0 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* Crude resource management */ +#include +#include +#include +#include +#include +#include +#include "cxio_resource.h" +#include "cxio_hal.h" + +static struct kfifo *rhdl_fifo; +static spinlock_t rhdl_fifo_lock; + +#define RANDOM_SIZE 16 + +static int __cxio_init_resource_fifo(struct kfifo **fifo, + spinlock_t *fifo_lock, + u32 nr, u32 skip_low, + u32 skip_high, + int random) +{ + u32 i, j, entry = 0, idx; + u32 random_bytes; + u32 rarray[16]; + spin_lock_init(fifo_lock); + + *fifo = kfifo_alloc(nr * sizeof(u32), GFP_KERNEL, fifo_lock); + if (IS_ERR(*fifo)) + return -ENOMEM; + + for (i = 0; i < skip_low + skip_high; i++) + __kfifo_put(*fifo, (unsigned char *) &entry, sizeof(u32)); + if (random) { + j = 0; + random_bytes = random32(); + for (i = 0; i < RANDOM_SIZE; i++) + rarray[i] = i + skip_low; + for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) { + if (j >= RANDOM_SIZE) { + j = 0; + random_bytes = random32(); + } + idx = (random_bytes >> (j * 2)) & 0xF; + __kfifo_put(*fifo, + (unsigned char *) &rarray[idx], + sizeof(u32)); + rarray[idx] = i; + j++; + } + for (i = 0; i < RANDOM_SIZE; i++) + __kfifo_put(*fifo, + (unsigned char *) &rarray[i], + sizeof(u32)); + } else + for (i = skip_low; i < nr - skip_high; i++) + __kfifo_put(*fifo, (unsigned char *) &i, sizeof(u32)); + + for (i = 0; i < skip_low + skip_high; i++) + kfifo_get(*fifo, (unsigned char *) &entry, sizeof(u32)); + return 0; +} + +static int cxio_init_resource_fifo(struct kfifo **fifo, spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 0)); +} + +static int cxio_init_resource_fifo_random(struct kfifo **fifo, + spinlock_t * fifo_lock, + u32 nr, u32 skip_low, u32 skip_high) +{ + + return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low, + skip_high, 1)); +} + +static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p) +{ + u32 i; + + spin_lock_init(&rdev_p->rscp->qpid_fifo_lock); + + rdev_p->rscp->qpid_fifo = kfifo_alloc(T3_MAX_NUM_QP * sizeof(u32), + GFP_KERNEL, + &rdev_p->rscp->qpid_fifo_lock); + if (IS_ERR(rdev_p->rscp->qpid_fifo)) + return -ENOMEM; + + for (i = 16; i < T3_MAX_NUM_QP; i++) + if (!(i & rdev_p->qpmask)) + __kfifo_put(rdev_p->rscp->qpid_fifo, + (unsigned char *) &i, sizeof(u32)); + 
return 0; +} + +int cxio_hal_init_rhdl_resource(u32 nr_rhdl) +{ + return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1, + 0); +} + +void cxio_hal_destroy_rhdl_resource(void) +{ + kfifo_free(rhdl_fifo); +} + +/* nr_* must be power of 2 */ +int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid) +{ + int err = 0; + struct cxio_hal_resource *rscp; + + rscp = kmalloc(sizeof(*rscp), GFP_KERNEL); + if (!rscp) + return -ENOMEM; + rdev_p->rscp = rscp; + err = cxio_init_resource_fifo_random(&rscp->tpt_fifo, + &rscp->tpt_fifo_lock, + nr_tpt, 1, 0); + if (err) + goto tpt_err; + err = cxio_init_qpid_fifo(rdev_p); + if (err) + goto qpid_err; + err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, + nr_cqid, 1, 0); + if (err) + goto cqid_err; + err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, + nr_pdid, 1, 0); + if (err) + goto pdid_err; + return 0; +pdid_err: + kfifo_free(rscp->cqid_fifo); +cqid_err: + kfifo_free(rscp->qpid_fifo); +qpid_err: + kfifo_free(rscp->tpt_fifo); +tpt_err: + return -ENOMEM; +} + +/* + * returns 0 if no resource available + */ +static inline u32 cxio_hal_get_resource(struct kfifo *fifo) +{ + u32 entry; + if (kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32))) + return entry; + else + return 0; /* fifo emptry */ +} + +static inline void cxio_hal_put_resource(struct kfifo *fifo, u32 entry) +{ + BUG_ON(kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)) == 0); +} + +u32 cxio_hal_get_rhdl(void) +{ + return cxio_hal_get_resource(rhdl_fifo); +} + +void cxio_hal_put_rhdl(u32 rhdl) +{ + cxio_hal_put_resource(rhdl_fifo, rhdl); +} + +u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->tpt_fifo); +} + +void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag) +{ + cxio_hal_put_resource(rscp->tpt_fifo, stag); +} + +u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp) +{ + u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo); + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + return qpid; +} + +void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid) +{ + PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid); + cxio_hal_put_resource(rscp->qpid_fifo, qpid); +} + +u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->cqid_fifo); +} + +void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid) +{ + cxio_hal_put_resource(rscp->cqid_fifo, cqid); +} + +u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp) +{ + return cxio_hal_get_resource(rscp->pdid_fifo); +} + +void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid) +{ + cxio_hal_put_resource(rscp->pdid_fifo, pdid); +} + +void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) +{ + kfifo_free(rscp->tpt_fifo); + kfifo_free(rscp->cqid_fifo); + kfifo_free(rscp->qpid_fifo); + kfifo_free(rscp->pdid_fifo); + kfree(rscp); +} + +/* + * PBL Memory Manager. Uses Linux generic allocator. 
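All of the ID allocators above use a kfifo purely as a bag of free IDs: setup pushes every usable ID into the fifo, allocation pops one, and free pushes it back, with the skip_low/skip_high IDs held out so that a zero return can unambiguously mean "pool empty". A stripped-down version of the same recipe against the three-argument 2.6.x kfifo API the code uses; the id_pool_* names are invented for this sketch.

/* Sketch only: a free-ID pool built on the kfifo calls used above. */
static struct kfifo *id_pool;
static spinlock_t id_pool_lock;

static int id_pool_init(u32 nr_ids)
{
        u32 i;

        spin_lock_init(&id_pool_lock);
        id_pool = kfifo_alloc(nr_ids * sizeof(u32), GFP_KERNEL, &id_pool_lock);
        if (IS_ERR(id_pool))
                return -ENOMEM;
        /* hold back ID 0 so a 0 return can mean "nothing left" */
        for (i = 1; i < nr_ids; i++)
                kfifo_put(id_pool, (unsigned char *)&i, sizeof(u32));
        return 0;
}

static u32 id_alloc(void)
{
        u32 id;

        return kfifo_get(id_pool, (unsigned char *)&id, sizeof(u32)) ? id : 0;
}

static void id_free(u32 id)
{
        kfifo_put(id_pool, (unsigned char *)&id, sizeof(u32));
}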
+ */ + +#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ +#define PBL_CHUNK 2*1024*1024 + +u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size); + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size); + return (u32)addr; +} + +void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size); + gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size); +} + +int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); + if (rdev_p->pbl_pool) + for (i = rdev_p->rnic_info.pbl_base; + i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; + i += PBL_CHUNK) + gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); + return rdev_p->pbl_pool ? 0 : -ENOMEM; +} + +void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->pbl_pool); +} + +/* + * RQT Memory Manager. Uses Linux generic allocator. + */ + +#define MIN_RQT_SHIFT 10 /* 1KB == mini RQT size (16 entries) */ +#define RQT_CHUNK 2*1024*1024 + +u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size) +{ + unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6); + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size << 6); + return (u32)addr; +} + +void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) +{ + PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size << 6); + gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6); +} + +int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p) +{ + unsigned long i; + rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); + if (rdev_p->rqt_pool) + for (i = rdev_p->rnic_info.rqt_base; + i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1; + i += RQT_CHUNK) + gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1); + return rdev_p->rqt_pool ? 0 : -ENOMEM; +} + +void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p) +{ + gen_pool_destroy(rdev_p->rqt_pool); +} diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h new file mode 100644 index 00000000000..a6bbe8370d8 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
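Both pool managers here follow one gen_pool recipe: create a pool with a minimum allocation order, seed it with the adapter's PBL or RQT address range in fixed 2MB chunks, then carve sub-ranges out of it with gen_pool_alloc()/gen_pool_free(). The generic shape is sketched below, with base, top and CHUNK standing in for the rnic_info values.

/* Generic shape of the PBL/RQT pool setup above (values are placeholders). */
#define CHUNK           (2 * 1024 * 1024)
#define MIN_SHIFT       8               /* smallest allocation unit: 256B */

static int pool_create(struct gen_pool **pool, unsigned long base,
                       unsigned long top)
{
        unsigned long i;

        *pool = gen_pool_create(MIN_SHIFT, -1); /* -1: any NUMA node */
        if (!*pool)
                return -ENOMEM;
        for (i = base; i + CHUNK - 1 <= top; i += CHUNK)
                gen_pool_add(*pool, i, CHUNK, -1);
        return 0;
}
/* callers then use gen_pool_alloc(pool, size) / gen_pool_free(pool, addr, size) */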
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_RESOURCE_H__ +#define __CXIO_RESOURCE_H__ + +#include +#include +#include +#include +#include +#include +#include +#include "cxio_hal.h" + +extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl); +extern void cxio_hal_destroy_rhdl_resource(void); +extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p, + u32 nr_tpt, u32 nr_pbl, + u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, + u32 nr_pdid); +extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag); +extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid); +extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp); +extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid); +extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp); + +#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base ) +extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); + +#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base ) +extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p); +extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p); +extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size); +extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size); +#endif diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h new file mode 100644 index 00000000000..103fc42d697 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __CXIO_WR_H__ +#define __CXIO_WR_H__ + +#include +#include +#include +#include "firmware_exports.h" + +#define T3_MAX_SGE 4 + +#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr)) +#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \ + ((rptr)!=(wptr)) ) +#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1)) +#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<> S_FW_RIWR_OP)) & M_FW_RIWR_OP) + +#define S_FW_RIWR_SOPEOP 22 +#define M_FW_RIWR_SOPEOP 0x3 +#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP) + +#define S_FW_RIWR_FLAGS 8 +#define M_FW_RIWR_FLAGS 0x3fffff +#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS) +#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS) + +#define S_FW_RIWR_TID 8 +#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID) + +#define S_FW_RIWR_LEN 0 +#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN) + +#define S_FW_RIWR_GEN 31 +#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN) + +struct t3_sge { + __be32 stag; + __be32 len; + __be64 to; +}; + +/* If num_sgle is zero, flit 5+ contains immediate data.*/ +struct t3_send_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be32 plen; /* 3 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */ +}; + +struct t3_local_inv_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 stag; /* 2 */ + __be32 reserved3; +}; + +struct t3_rdma_write_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 stag_sink; + __be64 to_sink; /* 3 */ + __be32 plen; /* 4 */ + __be32 num_sgle; + struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */ +}; + +struct t3_rdma_read_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 rdmaop; /* 2 */ + u8 reserved[3]; + __be32 rem_stag; + __be64 rem_to; /* 3 */ + __be32 local_stag; /* 4 */ + __be32 local_len; + __be64 local_to; /* 5 */ +}; + +enum t3_addr_type { + T3_VA_BASED_TO = 0x0, + T3_ZERO_BASED_TO = 0x1 +} __attribute__ ((packed)); + +enum t3_mem_perms { + T3_MEM_ACCESS_LOCAL_READ = 0x1, + T3_MEM_ACCESS_LOCAL_WRITE = 0x2, + T3_MEM_ACCESS_REM_READ = 0x4, + T3_MEM_ACCESS_REM_WRITE = 0x8 +} __attribute__ ((packed)); + +struct t3_bind_mw_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u16 reserved; /* 2 */ + u8 type; + u8 perms; + __be32 mr_stag; + __be32 mw_stag; /* 3 */ + __be32 mw_len; + __be64 mw_va; /* 4 */ + __be32 mr_pbl_addr; /* 5 */ + u8 reserved2[3]; + u8 mr_pagesz; +}; + +struct t3_receive_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + u8 pagesz[T3_MAX_SGE]; + __be32 num_sgle; /* 2 */ + struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */ + __be32 pbl_addr[T3_MAX_SGE]; +}; + +struct t3_bypass_wr { + struct fw_riwrh wrh; + union t3_wrid wrid; /* 1 */ +}; + +struct t3_modify_qp_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 flags; /* 2 */ + __be32 quiesce; /* 2 */ + __be32 max_ird; /* 3 */ + __be32 max_ord; /* 3 */ + __be64 sge_cmd; /* 4 */ + __be64 ctx1; /* 5 */ + __be64 ctx0; /* 6 */ +}; + +enum t3_modify_qp_flags { + MODQP_QUIESCE = 0x01, + MODQP_MAX_IRD = 0x02, + MODQP_MAX_ORD = 0x04, + MODQP_WRITE_EC = 0x08, + MODQP_READ_EC = 0x10, +}; + + +enum 
t3_mpa_attrs { + uP_RI_MPA_RX_MARKER_ENABLE = 0x1, + uP_RI_MPA_TX_MARKER_ENABLE = 0x2, + uP_RI_MPA_CRC_ENABLE = 0x4, + uP_RI_MPA_IETF_ENABLE = 0x8 +} __attribute__ ((packed)); + +enum t3_qp_caps { + uP_RI_QP_RDMA_READ_ENABLE = 0x01, + uP_RI_QP_RDMA_WRITE_ENABLE = 0x02, + uP_RI_QP_BIND_ENABLE = 0x04, + uP_RI_QP_FAST_REGISTER_ENABLE = 0x08, + uP_RI_QP_STAG0_ENABLE = 0x10 +} __attribute__ ((packed)); + +struct t3_rdma_init_attr { + u32 tid; + u32 qpid; + u32 pdid; + u32 scqid; + u32 rcqid; + u32 rq_addr; + u32 rq_size; + enum t3_mpa_attrs mpaattrs; + enum t3_qp_caps qpcaps; + u16 tcp_emss; + u32 ord; + u32 ird; + u64 qp_dma_addr; + u32 qp_dma_size; + u32 flags; +}; + +struct t3_rdma_init_wr { + struct fw_riwrh wrh; /* 0 */ + union t3_wrid wrid; /* 1 */ + __be32 qpid; /* 2 */ + __be32 pdid; + __be32 scqid; /* 3 */ + __be32 rcqid; + __be32 rq_addr; /* 4 */ + __be32 rq_size; + u8 mpaattrs; /* 5 */ + u8 qpcaps; + __be16 ulpdu_size; + __be32 flags; /* bits 31-1 - reservered */ + /* bit 0 - set if RECV posted */ + __be32 ord; /* 6 */ + __be32 ird; + __be64 qp_dma_addr; /* 7 */ + __be32 qp_dma_size; /* 8 */ + u32 rsvd; +}; + +struct t3_genbit { + u64 flit[15]; + __be64 genbit; +}; + +enum rdma_init_wr_flags { + RECVS_POSTED = 1, +}; + +union t3_wr { + struct t3_send_wr send; + struct t3_rdma_write_wr write; + struct t3_rdma_read_wr read; + struct t3_receive_wr recv; + struct t3_local_inv_wr local_inv; + struct t3_bind_mw_wr bind; + struct t3_bypass_wr bypass; + struct t3_rdma_init_wr init; + struct t3_modify_qp_wr qp_mod; + struct t3_genbit genbit; + u64 flit[16]; +}; + +#define T3_SQ_CQE_FLIT 13 +#define T3_SQ_COOKIE_FLIT 14 + +#define T3_RQ_COOKIE_FLIT 13 +#define T3_RQ_CQE_FLIT 14 + +static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe) +{ + return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags)); +} + +static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op, + enum t3_wr_flags flags, u8 genbit, u32 tid, + u8 len) +{ + wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) | + V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) | + V_FW_RIWR_FLAGS(flags)); + wmb(); + wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) | + V_FW_RIWR_TID(tid) | + V_FW_RIWR_LEN(len)); + /* 2nd gen bit... 
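 * (A second copy of the generation bit: V_FW_RIWR_GEN() already put it in
 * flit 0 above, and the line below replicates it in the last flit of the
 * 16-flit WR (struct t3_genbit).  Presumably the consumer can then check
 * either end of the entry to see whether the whole WR has been written,
 * since the bit flips on every wrap of the queue -- see Q_GENBIT().)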
*/ + ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit); +} + +/* + * T3 ULP2_TX commands + */ +enum t3_utx_mem_op { + T3_UTX_MEM_READ = 2, + T3_UTX_MEM_WRITE = 3 +}; + +/* T3 MC7 RDMA TPT entry format */ + +enum tpt_mem_type { + TPT_NON_SHARED_MR = 0x0, + TPT_SHARED_MR = 0x1, + TPT_MW = 0x2, + TPT_MW_RELAXED_PROTECTION = 0x3 +}; + +enum tpt_addr_type { + TPT_ZBTO = 0, + TPT_VATO = 1 +}; + +enum tpt_mem_perm { + TPT_LOCAL_READ = 0x8, + TPT_LOCAL_WRITE = 0x4, + TPT_REMOTE_READ = 0x2, + TPT_REMOTE_WRITE = 0x1 +}; + +struct tpt_entry { + __be32 valid_stag_pdid; + __be32 flags_pagesize_qpid; + + __be32 rsvd_pbl_addr; + __be32 len; + __be32 va_hi; + __be32 va_low_or_fbo; + + __be32 rsvd_bind_cnt_or_pstag; + __be32 rsvd_pbl_size; +}; + +#define S_TPT_VALID 31 +#define V_TPT_VALID(x) ((x) << S_TPT_VALID) +#define F_TPT_VALID V_TPT_VALID(1U) + +#define S_TPT_STAG_KEY 23 +#define M_TPT_STAG_KEY 0xFF +#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY) +#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY) + +#define S_TPT_STAG_STATE 22 +#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE) +#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U) + +#define S_TPT_STAG_TYPE 20 +#define M_TPT_STAG_TYPE 0x3 +#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE) +#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE) + +#define S_TPT_PDID 0 +#define M_TPT_PDID 0xFFFFF +#define V_TPT_PDID(x) ((x) << S_TPT_PDID) +#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID) + +#define S_TPT_PERM 28 +#define M_TPT_PERM 0xF +#define V_TPT_PERM(x) ((x) << S_TPT_PERM) +#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM) + +#define S_TPT_REM_INV_DIS 27 +#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS) +#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U) + +#define S_TPT_ADDR_TYPE 26 +#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE) +#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U) + +#define S_TPT_MW_BIND_ENABLE 25 +#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE) +#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U) + +#define S_TPT_PAGE_SIZE 20 +#define M_TPT_PAGE_SIZE 0x1F +#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE) +#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE) + +#define S_TPT_PBL_ADDR 0 +#define M_TPT_PBL_ADDR 0x1FFFFFFF +#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR) +#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR) + +#define S_TPT_QPID 0 +#define M_TPT_QPID 0xFFFFF +#define V_TPT_QPID(x) ((x) << S_TPT_QPID) +#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID) + +#define S_TPT_PSTAG 0 +#define M_TPT_PSTAG 0xFFFFFF +#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG) +#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG) + +#define S_TPT_PBL_SIZE 0 +#define M_TPT_PBL_SIZE 0xFFFFF +#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE) +#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE) + +/* + * CQE defs + */ +struct t3_cqe { + __be32 header; + __be32 len; + union { + struct { + __be32 stag; + __be32 msn; + } rcqe; + struct { + u32 wrid_hi; + u32 wrid_low; + } scqe; + } u; +}; + +#define S_CQE_OOO 31 +#define M_CQE_OOO 0x1 +#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO) +#define V_CEQ_OOO(x) ((x)<> S_CQE_QPID)) & M_CQE_QPID) +#define V_CQE_QPID(x) ((x)<> S_CQE_SWCQE)) & M_CQE_SWCQE) +#define V_CQE_SWCQE(x) ((x)<> S_CQE_GENBIT) & M_CQE_GENBIT) +#define V_CQE_GENBIT(x) ((x)<> S_CQE_STATUS)) & M_CQE_STATUS) +#define V_CQE_STATUS(x) ((x)<> S_CQE_TYPE)) & M_CQE_TYPE) +#define 
V_CQE_TYPE(x) ((x)<> S_CQE_OPCODE)) & M_CQE_OPCODE) +#define V_CQE_OPCODE(x) ((x)<queue->flit[13] = 1; +} + +static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + return NULL; +} + +static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq) +{ + struct t3_cqe *cqe; + + if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) { + cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2)); + return cqe; + } + cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2)); + if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe)) + return cqe; + return NULL; +} + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c new file mode 100644 index 00000000000..4611afa5222 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
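cxio_next_hw_cqe() above needs no shared producer index: an entry at rptr counts as valid when the generation bit the hardware wrote into it matches the generation expected for the current pass over the ring, and Q_GENBIT() flips that expectation on every wrap. The test in isolation, with invented names (the driver spells it CQ_VLD_ENTRY()):

/* Sketch of the gen-bit validity test used when polling the HW CQ ring. */
struct ring {
        unsigned int rptr;              /* consumer index, free-running */
        unsigned int size_log2;
};

static inline int expected_genbit(const struct ring *r)
{
        return !((r->rptr >> r->size_log2) & 1);        /* == Q_GENBIT() */
}

static inline int entry_valid(const struct ring *r, int entry_genbit)
{
        return entry_genbit == expected_genbit(r);
}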
+ */ +#include +#include + +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" +#include "iwch_user.h" +#include "iwch.h" +#include "iwch_cm.h" + +#define DRV_VERSION "1.1" + +MODULE_AUTHOR("Boyd Faulkner, Steve Wise"); +MODULE_DESCRIPTION("Chelsio T3 RDMA Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; + +static void open_rnic_dev(struct t3cdev *); +static void close_rnic_dev(struct t3cdev *); + +struct cxgb3_client t3c_client = { + .name = "iw_cxgb3", + .add = open_rnic_dev, + .remove = close_rnic_dev, + .handlers = t3c_handlers, + .redirect = iwch_ep_redirect +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(dev_mutex); + +static void rnic_init(struct iwch_dev *rnicp) +{ + PDBG("%s iwch_dev %p\n", __FUNCTION__, rnicp); + idr_init(&rnicp->cqidr); + idr_init(&rnicp->qpidr); + idr_init(&rnicp->mmidr); + spin_lock_init(&rnicp->lock); + + rnicp->attr.vendor_id = 0x168; + rnicp->attr.vendor_part_id = 7; + rnicp->attr.max_qps = T3_MAX_NUM_QP - 32; + rnicp->attr.max_wrs = (1UL << 24) - 1; + rnicp->attr.max_sge_per_wr = T3_MAX_SGE; + rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE; + rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1; + rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1; + rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev); + rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE; + rnicp->attr.max_pds = T3_MAX_NUM_PD - 1; + rnicp->attr.mem_pgsizes_bitmask = 0x7FFF; /* 4KB-128MB */ + rnicp->attr.can_resize_wq = 0; + rnicp->attr.max_rdma_reads_per_qp = 8; + rnicp->attr.max_rdma_read_resources = + rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps; + rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */ + rnicp->attr.max_rdma_read_depth = + rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps; + rnicp->attr.rq_overflow_handled = 0; + rnicp->attr.can_modify_ird = 0; + rnicp->attr.can_modify_ord = 0; + rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1; + rnicp->attr.stag0_value = 1; + rnicp->attr.zbva_support = 1; + rnicp->attr.local_invalidate_fence = 1; + rnicp->attr.cq_overflow_detection = 1; + return; +} + +static void open_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *rnicp; + static int vers_printed; + + PDBG("%s t3cdev %p\n", __FUNCTION__, tdev); + if (!vers_printed++) + printk(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n", + DRV_VERSION); + rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + if (!rnicp) { + printk(KERN_ERR MOD "Cannot allocate ib device\n"); + return; + } + rnicp->rdev.ulp = rnicp; + rnicp->rdev.t3cdev_p = tdev; + + mutex_lock(&dev_mutex); + + if (cxio_rdev_open(&rnicp->rdev)) { + mutex_unlock(&dev_mutex); + printk(KERN_ERR MOD "Unable to open CXIO rdev\n"); + ib_dealloc_device(&rnicp->ibdev); + return; + } + + rnic_init(rnicp); + + list_add_tail(&rnicp->entry, &dev_list); + mutex_unlock(&dev_mutex); + + if (iwch_register_device(rnicp)) { + printk(KERN_ERR MOD "Unable to register device\n"); + close_rnic_dev(tdev); + } + printk(KERN_INFO MOD "Initialized device %s\n", + pci_name(rnicp->rdev.rnic_info.pdev)); + return; +} + +static void close_rnic_dev(struct t3cdev *tdev) +{ + struct iwch_dev *dev, *tmp; + PDBG("%s t3cdev %p\n", __FUNCTION__, tdev); + mutex_lock(&dev_mutex); + list_for_each_entry_safe(dev, tmp, &dev_list, entry) { + if (dev->rdev.t3cdev_p == tdev) { + list_del(&dev->entry); + iwch_unregister_device(dev); + cxio_rdev_close(&dev->rdev); + idr_destroy(&dev->cqidr); + idr_destroy(&dev->qpidr); + idr_destroy(&dev->mmidr); 
+ ib_dealloc_device(&dev->ibdev); + break; + } + } + mutex_unlock(&dev_mutex); +} + +static int __init iwch_init_module(void) +{ + int err; + + err = cxio_hal_init(); + if (err) + return err; + err = iwch_cm_init(); + if (err) + return err; + cxio_register_ev_cb(iwch_ev_dispatch); + cxgb3_register_client(&t3c_client); + return 0; +} + +static void __exit iwch_exit_module(void) +{ + cxgb3_unregister_client(&t3c_client); + cxio_unregister_ev_cb(iwch_ev_dispatch); + iwch_cm_term(); + cxio_hal_exit(); +} + +module_init(iwch_init_module); +module_exit(iwch_exit_module); diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h new file mode 100644 index 00000000000..6517ef85026 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_H__ +#define __IWCH_H__ + +#include +#include +#include +#include + +#include + +#include "cxio_hal.h" +#include "cxgb3_offload.h" + +struct iwch_pd; +struct iwch_cq; +struct iwch_qp; +struct iwch_mr; + +struct iwch_rnic_attributes { + u32 vendor_id; + u32 vendor_part_id; + u32 max_qps; + u32 max_wrs; /* Max for any SQ/RQ */ + u32 max_sge_per_wr; + u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */ + u32 max_cqs; + u32 max_cqes_per_cq; + u32 max_mem_regs; + u32 max_phys_buf_entries; /* for phys buf list */ + u32 max_pds; + + /* + * The memory page sizes supported by this RNIC. + * Bit position i in bitmap indicates page of + * size (4k)^i. Phys block list mode unsupported. + */ + u32 mem_pgsizes_bitmask; + u8 can_resize_wq; + + /* + * The maximum number of RDMA Reads that can be outstanding + * per QP with this RNIC as the target. + */ + u32 max_rdma_reads_per_qp; + + /* + * The maximum number of resources used for RDMA Reads + * by this RNIC with this RNIC as the target. + */ + u32 max_rdma_read_resources; + + /* + * The max depth per QP for initiation of RDMA Read + * by this RNIC. 
+ */ + u32 max_rdma_read_qp_depth; + + /* + * The maximum depth for initiation of RDMA Read + * operations by this RNIC on all QPs + */ + u32 max_rdma_read_depth; + u8 rq_overflow_handled; + u32 can_modify_ird; + u32 can_modify_ord; + u32 max_mem_windows; + u32 stag0_value; + u8 zbva_support; + u8 local_invalidate_fence; + u32 cq_overflow_detection; +}; + +struct iwch_dev { + struct ib_device ibdev; + struct cxio_rdev rdev; + u32 device_cap_flags; + struct iwch_rnic_attributes attr; + struct idr cqidr; + struct idr qpidr; + struct idr mmidr; + spinlock_t lock; + struct list_head entry; +}; + +static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct iwch_dev, ibdev); +} + +static inline int t3b_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3B; +} + +static inline int t3a_device(const struct iwch_dev *rhp) +{ + return rhp->rdev.t3cdev_p->type == T3A; +} + +static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid) +{ + return idr_find(&rhp->cqidr, cqid); +} + +static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid) +{ + return idr_find(&rhp->qpidr, qpid); +} + +static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid) +{ + return idr_find(&rhp->mmidr, mmid); +} + +static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, + void *handle, u32 id) +{ + int ret; + u32 newid; + + do { + if (!idr_pre_get(idr, GFP_KERNEL)) { + return -ENOMEM; + } + spin_lock_irq(&rhp->lock); + ret = idr_get_new_above(idr, handle, id, &newid); + BUG_ON(newid != id); + spin_unlock_irq(&rhp->lock); + } while (ret == -EAGAIN); + + return ret; +} + +static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id) +{ + spin_lock_irq(&rhp->lock); + idr_remove(idr, id); + spin_unlock_irq(&rhp->lock); +} + +extern struct cxgb3_client t3c_client; +extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS]; +extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb); + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c new file mode 100644 index 00000000000..a522b1baa3b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -0,0 +1,2081 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
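A note on the idr helpers just above: insert_handle() insists on getting back exactly the id it asked for (BUG_ON(newid != id)), which is what lets the CQID/QPID/STAG index reported by the adapter be used directly as the idr key later. A hypothetical lookup on an event path would then be nothing more than:

/* Hypothetical caller of the helpers above; 'cqid' is whatever id the
 * hardware event carries. */
static void handle_cq_event(struct iwch_dev *rnicp, u32 cqid)
{
        struct iwch_cq *chp = get_chp(rnicp, cqid);

        if (!chp)
                return;         /* CQ already gone; drop the event */
        /* ... run the completion handler for chp ... */
}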
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "tcb.h" +#include "cxgb3_offload.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" + +static char *states[] = { + "idle", + "listen", + "connecting", + "mpa_wait_req", + "mpa_req_sent", + "mpa_req_rcvd", + "mpa_rep_sent", + "fpdu_mode", + "aborting", + "closing", + "moribund", + "dead", + NULL, +}; + +static int ep_timeout_secs = 10; +module_param(ep_timeout_secs, int, 0444); +MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " + "in seconds (default=10)"); + +static int mpa_rev = 1; +module_param(mpa_rev, int, 0444); +MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " + "1 is spec compliant. (default=1)"); + +static int markers_enabled = 0; +module_param(markers_enabled, int, 0444); +MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)"); + +static int crc_enabled = 1; +module_param(crc_enabled, int, 0444); +MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)"); + +static int rcv_win = 256 * 1024; +module_param(rcv_win, int, 0444); +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)"); + +static int snd_win = 32 * 1024; +module_param(snd_win, int, 0444); +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)"); + +static unsigned int nocong = 0; +module_param(nocong, uint, 0444); +MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)"); + +static unsigned int cong_flavor = 1; +module_param(cong_flavor, uint, 0444); +MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)"); + +static void process_work(struct work_struct *work); +static struct workqueue_struct *workq; +static DECLARE_WORK(skb_work, process_work); + +static struct sk_buff_head rxq; +static cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS]; + +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); +static void ep_timeout(unsigned long arg); +static void connect_reply_upcall(struct iwch_ep *ep, int status); + +static void start_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __FUNCTION__, ep); + if (timer_pending(&ep->timer)) { + PDBG("%s stopped / restarted timer ep %p\n", __FUNCTION__, ep); + del_timer_sync(&ep->timer); + } else + get_ep(&ep->com); + ep->timer.expires = jiffies + ep_timeout_secs * HZ; + ep->timer.data = (unsigned long)ep; + ep->timer.function = ep_timeout; + add_timer(&ep->timer); +} + +static void stop_ep_timer(struct iwch_ep *ep) +{ + PDBG("%s ep %p\n", __FUNCTION__, ep); + del_timer_sync(&ep->timer); + put_ep(&ep->com); +} + +static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) +{ + struct cpl_tid_release *req; + + skb = get_skb(skb, sizeof *req, GFP_KERNEL); + if (!skb) + return; + req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid)); + skb->priority = CPL_PRIORITY_SETUP; + tdev->send(tdev, skb); + return; +} + +int iwch_quiesce_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct 
cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE); + + skb->priority = CPL_PRIORITY_DATA; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +int iwch_resume_tid(struct iwch_ep *ep) +{ + struct cpl_set_tcb_field *req; + struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + + if (!skb) + return -ENOMEM; + req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid)); + req->reply = 0; + req->cpu_idx = 0; + req->word = htons(W_TCB_RX_QUIESCE); + req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE); + req->val = 0; + + skb->priority = CPL_PRIORITY_DATA; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static void set_emss(struct iwch_ep *ep, u16 opt) +{ + PDBG("%s ep %p opt %u\n", __FUNCTION__, ep, opt); + ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40; + if (G_TCPOPT_TSTAMP(opt)) + ep->emss -= 12; + if (ep->emss < 128) + ep->emss = 128; + PDBG("emss=%d\n", ep->emss); +} + +static enum iwch_ep_state state_read(struct iwch_ep_common *epc) +{ + unsigned long flags; + enum iwch_ep_state state; + + spin_lock_irqsave(&epc->lock, flags); + state = epc->state; + spin_unlock_irqrestore(&epc->lock, flags); + return state; +} + +static inline void __state_set(struct iwch_ep_common *epc, + enum iwch_ep_state new) +{ + epc->state = new; +} + +static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new) +{ + unsigned long flags; + + spin_lock_irqsave(&epc->lock, flags); + PDBG("%s - %s -> %s\n", __FUNCTION__, states[epc->state], states[new]); + __state_set(epc, new); + spin_unlock_irqrestore(&epc->lock, flags); + return; +} + +static void *alloc_ep(int size, gfp_t gfp) +{ + struct iwch_ep_common *epc; + + epc = kmalloc(size, gfp); + if (epc) { + memset(epc, 0, size); + kref_init(&epc->kref); + spin_lock_init(&epc->lock); + init_waitqueue_head(&epc->waitq); + } + PDBG("%s alloc ep %p\n", __FUNCTION__, epc); + return epc; +} + +void __free_ep(struct kref *kref) +{ + struct iwch_ep_common *epc; + epc = container_of(kref, struct iwch_ep_common, kref); + PDBG("%s ep %p state %s\n", __FUNCTION__, epc, states[state_read(epc)]); + kfree(epc); +} + +static void release_ep_resources(struct iwch_ep *ep) +{ + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + if (ep->com.tdev->type == T3B) + release_tid(ep->com.tdev, ep->hwtid, NULL); + put_ep(&ep->com); +} + +static void process_work(struct work_struct *work) +{ + struct sk_buff *skb = NULL; + void *ep; + struct t3cdev *tdev; + int ret; + + while ((skb = skb_dequeue(&rxq))) { + ep = *((void **) (skb->cb)); + tdev = *((struct t3cdev **) (skb->cb + sizeof(void *))); + ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep); + if (ret & CPL_RET_BUF_DONE) + kfree_skb(skb); + + /* + * ep was referenced in sched(), and is freed here. 
+ */ + put_ep((struct iwch_ep_common *)ep); + } +} + +static int status2errno(int status) +{ + switch (status) { + case CPL_ERR_NONE: + return 0; + case CPL_ERR_CONN_RESET: + return -ECONNRESET; + case CPL_ERR_ARP_MISS: + return -EHOSTUNREACH; + case CPL_ERR_CONN_TIMEDOUT: + return -ETIMEDOUT; + case CPL_ERR_TCAM_FULL: + return -ENOMEM; + case CPL_ERR_CONN_EXIST: + return -EADDRINUSE; + default: + return -EIO; + } +} + +/* + * Try and reuse skbs already allocated... + */ +static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp) +{ + if (skb) { + BUG_ON(skb_cloned(skb)); + skb_trim(skb, 0); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + } + return skb; +} + +static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip, + __be32 peer_ip, __be16 local_port, + __be16 peer_port, u8 tos) +{ + struct rtable *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = peer_ip, + .saddr = local_ip, + .tos = tos} + }, + .proto = IPPROTO_TCP, + .uli_u = { + .ports = { + .sport = local_port, + .dport = peer_port} + } + }; + + if (ip_route_output_flow(&rt, &fl, NULL, 0)) + return NULL; + return rt; +} + +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) +{ + int i = 0; + + while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) + ++i; + return i; +} + +static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) +{ + PDBG("%s t3cdev %p\n", __FUNCTION__, dev); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for an active open. + */ +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + printk(KERN_ERR MOD "ARP failure duing connect\n"); + kfree_skb(skb); +} + +/* + * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant + * and send it along. 
+ */ +static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + + PDBG("%s t3cdev %p\n", __FUNCTION__, dev); + req->cmd = CPL_ABORT_NO_RST; + cxgb3_ofld_send(dev, skb); +} + +static int send_halfclose(struct iwch_ep *ep, gfp_t gfp) +{ + struct cpl_close_con_req *req; + struct sk_buff *skb; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid)); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + struct cpl_abort_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(skb, sizeof(*req), gfp); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, abort_arp_failure); + req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); + req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid)); + req->cmd = CPL_ABORT_SEND_RST; + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_connect(struct iwch_ep *ep) +{ + struct cpl_act_open_req *req; + struct sk_buff *skb; + u32 opt0h, opt0l, opt2; + unsigned int mtu_idx; + int wscale; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb.\n", + __FUNCTION__); + return -ENOMEM; + } + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor); + skb->priority = CPL_PRIORITY_SETUP; + set_arp_failure_handler(skb, act_open_req_arp_failure); + + req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid)); + req->local_port = ep->com.local_addr.sin_port; + req->peer_port = ep->com.remote_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_ip = ep->com.remote_addr.sin_addr.s_addr; + req->opt0h = htonl(opt0h); + req->opt0l = htonl(opt0l); + req->params = 0; + req->opt2 = htonl(opt2); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + + PDBG("%s ep %p pd_len %d\n", __FUNCTION__, ep, ep->plen); + + BUG_ON(skb_cloned(skb)); + + mpalen = sizeof(*mpa) + ep->plen; + if (skb->data + mpalen + sizeof(*req) > skb->end) { + kfree_skb(skb); + skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + connect_reply_upcall(ep, -ENOMEM); + 
return; + } + } + skb_trim(skb, 0); + skb_reserve(skb, sizeof(*req)); + skb_put(skb, mpalen); + skb->priority = CPL_PRIORITY_DATA; + mpa = (struct mpa_message *) skb->data; + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key)); + mpa->flags = (crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? MPA_MARKERS : 0); + mpa->private_data_size = htons(ep->plen); + mpa->revision = mpa_rev; + + if (ep->plen) + memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + l2t_send(ep->com.tdev, skb, ep->l2t); + start_ep_timer(ep); + state_set(&ep->com, MPA_REQ_SENT); + return; +} + +static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__); + return -ENOMEM; + } + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = MPA_REJECT; + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb again. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + skb->priority = CPL_PRIORITY_DATA; + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(mpalen); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + BUG_ON(ep->mpa_skb); + ep->mpa_skb = skb; + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen) +{ + int mpalen; + struct tx_data_wr *req; + struct mpa_message *mpa; + int len; + struct sk_buff *skb; + + PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen); + + mpalen = sizeof(*mpa) + plen; + + skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__); + return -ENOMEM; + } + skb->priority = CPL_PRIORITY_DATA; + skb_reserve(skb, sizeof(*req)); + mpa = (struct mpa_message *) skb_put(skb, mpalen); + memset(mpa, 0, sizeof(*mpa)); + memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key)); + mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) | + (markers_enabled ? 
MPA_MARKERS : 0); + mpa->revision = mpa_rev; + mpa->private_data_size = htons(plen); + if (plen) + memcpy(mpa->private_data, pdata, plen); + + /* + * Reference the mpa skb. This ensures the data area + * will remain in memory until the hw acks the tx. + * Function tx_ack() will deref it. + */ + skb_get(skb); + set_arp_failure_handler(skb, arp_failure_discard); + skb->h.raw = skb->data; + len = skb->len; + req = (struct tx_data_wr *) skb_push(skb, sizeof(*req)); + req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); + req->wr_lo = htonl(V_WR_TID(ep->hwtid)); + req->len = htonl(len); + req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) | + V_TX_SNDBUF(snd_win>>15)); + req->flags = htonl(F_TX_MORE | F_TX_IMM_ACK | F_TX_INIT); + req->sndseq = htonl(ep->snd_seq); + ep->mpa_skb = skb; + state_set(&ep->com, MPA_REP_SENT); + l2t_send(ep->com.tdev, skb, ep->l2t); + return 0; +} + +static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_establish *req = cplhdr(skb); + unsigned int tid = GET_TID(req); + + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, tid); + + dst_confirm(ep->dst); + + /* setup the hwtid for this connection */ + ep->hwtid = tid; + cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid); + + ep->snd_seq = ntohl(req->snd_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + /* dealloc the atid */ + cxgb3_free_atid(ep->com.tdev, ep->atid); + + /* start MPA negotiation */ + send_mpa_req(ep, skb); + + return 0; +} + +static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp) +{ + PDBG("%s ep %p\n", __FILE__, ep); + state_set(&ep->com, ABORTING); + send_abort(ep, skb, gfp); +} + +static void close_complete_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + if (ep->com.cm_id) { + PDBG("close complete delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void peer_close_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_DISCONNECT; + if (ep->com.cm_id) { + PDBG("peer close delivered ep %p cm_id %p tid %d\n", + ep, ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static void peer_abort_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CLOSE; + event.status = -ECONNRESET; + if (ep->com.cm_id) { + PDBG("abort delivered ep %p cm_id %p tid %d\n", ep, + ep->com.cm_id, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_reply_upcall(struct iwch_ep *ep, int status) +{ + struct iw_cm_event event; + + PDBG("%s ep %p status %d\n", __FUNCTION__, ep, status); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REPLY; + event.status = status; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + + if ((status == 0) || (status == -ECONNREFUSED)) { + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + } + if (ep->com.cm_id) { + PDBG("%s ep %p tid 
%d status %d\n", __FUNCTION__, ep, + ep->hwtid, status); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } + if (status < 0) { + ep->com.cm_id->rem_ref(ep->com.cm_id); + ep->com.cm_id = NULL; + ep->com.qp = NULL; + } +} + +static void connect_request_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_CONNECT_REQUEST; + event.local_addr = ep->com.local_addr; + event.remote_addr = ep->com.remote_addr; + event.private_data_len = ep->plen; + event.private_data = ep->mpa_pkt + sizeof(struct mpa_message); + event.provider_data = ep; + if (state_read(&ep->parent_ep->com) != DEAD) + ep->parent_ep->com.cm_id->event_handler( + ep->parent_ep->com.cm_id, + &event); + put_ep(&ep->parent_ep->com); + ep->parent_ep = NULL; +} + +static void established_upcall(struct iwch_ep *ep) +{ + struct iw_cm_event event; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + memset(&event, 0, sizeof(event)); + event.event = IW_CM_EVENT_ESTABLISHED; + if (ep->com.cm_id) { + PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid); + ep->com.cm_id->event_handler(ep->com.cm_id, &event); + } +} + +static int update_rx_credits(struct iwch_ep *ep, u32 credits) +{ + struct cpl_rx_data_ack *req; + struct sk_buff *skb; + + PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n"); + return 0; + } + + req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid)); + req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1)); + skb->priority = CPL_PRIORITY_ACK; + ep->com.tdev->send(ep->com.tdev, skb); + return credits; +} + +static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + int err; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_SENT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + err = -EINVAL; + goto err; + } + + /* + * copy the new data into our accumulation buffer. + */ + memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * if we don't even have the mpa message, then bail. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* Validate MPA header. */ + if (mpa->revision != mpa_rev) { + err = -EPROTO; + goto err; + } + if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { + err = -EPROTO; + goto err; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + err = -EPROTO; + goto err; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + err = -EPROTO; + goto err; + } + + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + * We'll continue process when more data arrives. 
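+ * (rx_data() calls process_mpa_reply() again when the next CPL_RX_DATA arrives.)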
+ */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + if (mpa->flags & MPA_REJECT) { + err = -ECONNREFUSED; + goto err; + } + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. And + * the MPA header is valid. + */ + state_set(&ep->com, FPDU_MODE); + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ird; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD; + + /* bind QP and TID with INIT_WR */ + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + if (!err) + goto out; +err: + abort_connection(ep, skb, GFP_KERNEL); +out: + connect_reply_upcall(ep, err); + return; +} + +static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb) +{ + struct mpa_message *mpa; + u16 plen; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + /* + * Stop mpa timer. If it expired, then the state has + * changed and we bail since ep_timeout already aborted + * the connection. + */ + stop_ep_timer(ep); + if (state_read(&ep->com) != MPA_REQ_WAIT) + return; + + /* + * If we get more than the supported amount of private data + * then we must fail this connection. + */ + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__); + + /* + * Copy the new data into our accumulation buffer. + */ + memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len); + ep->mpa_pkt_len += skb->len; + + /* + * If we don't even have the mpa message, then bail. + * We'll continue process when more data arrives. + */ + if (ep->mpa_pkt_len < sizeof(*mpa)) + return; + PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__); + mpa = (struct mpa_message *) ep->mpa_pkt; + + /* + * Validate MPA Header. + */ + if (mpa->revision != mpa_rev) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + plen = ntohs(mpa->private_data_size); + + /* + * Fail if there's too much private data. + */ + if (plen > MPA_MAX_PRIVATE_DATA) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + + /* + * If plen does not account for pkt size + */ + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { + abort_connection(ep, skb, GFP_KERNEL); + return; + } + ep->plen = (u8) plen; + + /* + * If we don't have all the pdata yet, then bail. + */ + if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) + return; + + /* + * If we get here we have accumulated the entire mpa + * start reply message including private data. + */ + ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0; + ep->mpa_attr.recv_marker_enabled = markers_enabled; + ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 
1 : 0; + ep->mpa_attr.version = mpa_rev; + PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, " + "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__, + ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled, + ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version); + + state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + connect_request_upcall(ep); + return; +} + +static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_rx_data *hdr = cplhdr(skb); + unsigned int dlen = ntohs(hdr->len); + + PDBG("%s ep %p dlen %u\n", __FUNCTION__, ep, dlen); + + skb_pull(skb, sizeof(*hdr)); + skb_trim(skb, dlen); + + switch (state_read(&ep->com)) { + case MPA_REQ_SENT: + process_mpa_reply(ep, skb); + break; + case MPA_REQ_WAIT: + process_mpa_request(ep, skb); + break; + case MPA_REP_SENT: + break; + default: + printk(KERN_ERR MOD "%s Unexpected streaming data." + " ep %p state %d tid %d\n", + __FUNCTION__, ep, state_read(&ep->com), ep->hwtid); + + /* + * The ep will timeout and inform the ULP of the failure. + * See ep_timeout(). + */ + break; + } + + /* update RX credits */ + update_rx_credits(ep, dlen); + + return CPL_RET_BUF_DONE; +} + +/* + * Upcall from the adapter indicating data has been transmitted. + * For us its just the single MPA request or reply. We can now free + * the skb holding the mpa message. + */ +static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_wr_ack *hdr = cplhdr(skb); + unsigned int credits = ntohs(hdr->credits); + enum iwch_qp_attr_mask mask; + + PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits); + + if (credits == 0) + return CPL_RET_BUF_DONE; + BUG_ON(credits != 1); + BUG_ON(ep->mpa_skb == NULL); + kfree_skb(ep->mpa_skb); + ep->mpa_skb = NULL; + dst_confirm(ep->dst); + if (state_read(&ep->com) == MPA_REP_SENT) { + struct iwch_qp_attributes attrs; + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + ep->com.rpl_err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (!ep->com.rpl_err) { + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + } + + ep->com.rpl_done = 1; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + } + return CPL_RET_BUF_DONE; +} + +static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + close_complete_upcall(ep); + state_set(&ep->com, DEAD); + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_act_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %u errno %d\n", __FUNCTION__, ep, rpl->status, + status2errno(rpl->status)); + connect_reply_upcall(ep, status2errno(rpl->status)); + state_set(&ep->com, DEAD); + if (ep->com.tdev->type == T3B) + release_tid(ep->com.tdev, GET_TID(rpl), NULL); + cxgb3_free_atid(ep->com.tdev, ep->atid); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; +} + +static int listen_start(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + 
struct cpl_pass_open_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n"); + return -ENOMEM; + } + + req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid)); + req->local_port = ep->com.local_addr.sin_port; + req->local_ip = ep->com.local_addr.sin_addr.s_addr; + req->peer_port = 0; + req->peer_ip = 0; + req->peer_netmask = 0; + req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS); + req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10)); + req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK)); + + skb->priority = 1; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_pass_open_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p status %d error %d\n", __FUNCTION__, ep, + rpl->status, status2errno(rpl->status)); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + + return CPL_RET_BUF_DONE; +} + +static int listen_stop(struct iwch_listen_ep *ep) +{ + struct sk_buff *skb; + struct cpl_close_listserv_req *req; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb = get_skb(NULL, sizeof(*req), GFP_KERNEL); + if (!skb) { + printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__); + return -ENOMEM; + } + req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req)); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid)); + skb->priority = 1; + ep->com.tdev->send(ep->com.tdev, skb); + return 0; +} + +static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb, + void *ctx) +{ + struct iwch_listen_ep *ep = ctx; + struct cpl_close_listserv_rpl *rpl = cplhdr(skb); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->com.rpl_err = status2errno(rpl->status); + ep->com.rpl_done = 1; + wake_up(&ep->com.waitq); + return CPL_RET_BUF_DONE; +} + +static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb) +{ + struct cpl_pass_accept_rpl *rpl; + unsigned int mtu_idx; + u32 opt0h, opt0l, opt2; + int wscale; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, sizeof(*rpl)); + skb_get(skb); + mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst)); + wscale = compute_wscale(rcv_win); + opt0h = V_NAGLE(0) | + V_NO_CONG(nocong) | + V_KEEP_ALIVE(1) | + F_TCAM_BYPASS | + V_WND_SCALE(wscale) | + V_MSS_IDX(mtu_idx) | + V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx); + opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10); + opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor); + + rpl = cplhdr(skb); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(opt0h); + rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT); + rpl->opt2 = htonl(opt2); + rpl->rsvd = rpl->opt2; /* workaround for HW bug */ + skb->priority = CPL_PRIORITY_SETUP; + l2t_send(ep->com.tdev, skb, ep->l2t); + + return; +} + +static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip, + struct sk_buff *skb) +{ + PDBG("%s t3cdev %p tid %u peer_ip %x\n", __FUNCTION__, tdev, hwtid, + peer_ip); + BUG_ON(skb_cloned(skb)); + skb_trim(skb, 
sizeof(struct cpl_tid_release)); + skb_get(skb); + + if (tdev->type == T3B) + release_tid(tdev, hwtid, skb); + else { + struct cpl_pass_accept_rpl *rpl; + + rpl = cplhdr(skb); + skb->priority = CPL_PRIORITY_SETUP; + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + hwtid)); + rpl->peer_ip = peer_ip; + rpl->opt0h = htonl(F_TCAM_BYPASS); + rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); + rpl->opt2 = 0; + rpl->rsvd = rpl->opt2; + tdev->send(tdev, skb); + } +} + +static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *child_ep, *parent_ep = ctx; + struct cpl_pass_accept_req *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct dst_entry *dst; + struct l2t_entry *l2t; + struct rtable *rt; + struct iff_mac tim; + + PDBG("%s parent ep %p tid %u\n", __FUNCTION__, parent_ep, hwtid); + + if (state_read(&parent_ep->com) != LISTEN) { + printk(KERN_ERR "%s - listening ep not in LISTEN\n", + __FUNCTION__); + goto reject; + } + + /* + * Find the netdev for this connection request. + */ + tim.mac_addr = req->dst_mac; + tim.vlan_tag = ntohs(req->vlan_tag); + if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { + printk(KERN_ERR + "%s bad dst mac %02x %02x %02x %02x %02x %02x\n", + __FUNCTION__, + req->dst_mac[0], + req->dst_mac[1], + req->dst_mac[2], + req->dst_mac[3], + req->dst_mac[4], + req->dst_mac[5]); + goto reject; + } + + /* Find output route */ + rt = find_route(tdev, + req->local_ip, + req->peer_ip, + req->local_port, + req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid))); + if (!rt) { + printk(KERN_ERR MOD "%s - failed to find dst entry!\n", + __FUNCTION__); + goto reject; + } + dst = &rt->u.dst; + l2t = t3_l2t_get(tdev, dst->neighbour, dst->neighbour->dev); + if (!l2t) { + printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n", + __FUNCTION__); + dst_release(dst); + goto reject; + } + child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL); + if (!child_ep) { + printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n", + __FUNCTION__); + l2t_release(L2DATA(tdev), l2t); + dst_release(dst); + goto reject; + } + state_set(&child_ep->com, CONNECTING); + child_ep->com.tdev = tdev; + child_ep->com.cm_id = NULL; + child_ep->com.local_addr.sin_family = PF_INET; + child_ep->com.local_addr.sin_port = req->local_port; + child_ep->com.local_addr.sin_addr.s_addr = req->local_ip; + child_ep->com.remote_addr.sin_family = PF_INET; + child_ep->com.remote_addr.sin_port = req->peer_port; + child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip; + get_ep(&parent_ep->com); + child_ep->parent_ep = parent_ep; + child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); + child_ep->l2t = l2t; + child_ep->dst = dst; + child_ep->hwtid = hwtid; + init_timer(&child_ep->timer); + cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid); + accept_cr(child_ep, req->peer_ip, skb); + goto out; +reject: + reject_cr(tdev, hwtid, req->peer_ip, skb); +out: + return CPL_RET_BUF_DONE; +} + +static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct cpl_pass_establish *req = cplhdr(skb); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->snd_seq = ntohl(req->snd_isn); + + set_emss(ep, ntohs(req->tcp_opt)); + + dst_confirm(ep->dst); + state_set(&ep->com, MPA_REQ_WAIT); + start_ep_timer(ep); + + return CPL_RET_BUF_DONE; +} + +static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct 
iwch_qp_attributes attrs; + unsigned long flags; + int disconnect = 1; + int release = 0; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + dst_confirm(ep->dst); + + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case MPA_REQ_WAIT: + __state_set(&ep->com, CLOSING); + break; + case MPA_REQ_SENT: + __state_set(&ep->com, CLOSING); + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. + */ + __state_set(&ep->com, CLOSING); + get_ep(&ep->com); + break; + case MPA_REP_SENT: + __state_set(&ep->com, CLOSING); + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case FPDU_MODE: + __state_set(&ep->com, CLOSING); + attrs.next_state = IWCH_QP_STATE_CLOSING; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + peer_close_upcall(ep); + break; + case ABORTING: + disconnect = 0; + break; + case CLOSING: + start_ep_timer(ep); + __state_set(&ep->com, MORIBUND); + disconnect = 0; + break; + case MORIBUND: + stop_ep_timer(ep); + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + disconnect = 0; + break; + case DEAD: + disconnect = 0; + break; + default: + BUG_ON(1); + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (disconnect) + iwch_ep_disconnect(ep, 0, GFP_KERNEL); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * Returns whether an ABORT_REQ_RSS message is a negative advice. + */ +static inline int is_neg_adv_abort(unsigned int status) +{ + return status == CPL_ERR_RTX_NEG_ADVICE || + status == CPL_ERR_PERSIST_NEG_ADVICE; +} + +static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct iwch_ep *ep = ctx; + struct cpl_abort_rpl *rpl; + struct sk_buff *rpl_skb; + struct iwch_qp_attributes attrs; + int ret; + int state; + + if (is_neg_adv_abort(req->status)) { + PDBG("%s neg_adv_abort ep %p tid %d\n", __FUNCTION__, ep, + ep->hwtid); + t3_l2t_send_event(ep->com.tdev, ep->l2t); + return CPL_RET_BUF_DONE; + } + + state = state_read(&ep->com); + PDBG("%s ep %p state %u\n", __FUNCTION__, ep, state); + switch (state) { + case CONNECTING: + break; + case MPA_REQ_WAIT: + break; + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ECONNRESET); + break; + case MPA_REP_SENT: + ep->com.rpl_done = 1; + ep->com.rpl_err = -ECONNRESET; + PDBG("waking up ep %p\n", ep); + wake_up(&ep->com.waitq); + break; + case MPA_REQ_RCVD: + + /* + * We're gonna mark this puppy DEAD, but keep + * the reference on it until the ULP accepts or + * rejects the CR. 
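+ * That reference is dropped when iwch_accept_cr() or iwch_reject_cr() finds the endpoint DEAD.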
+ */ + get_ep(&ep->com); + break; + case MORIBUND: + stop_ep_timer(ep); + case FPDU_MODE: + case CLOSING: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + ret = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + printk(KERN_ERR MOD + "%s - qp <- error failed!\n", + __FUNCTION__); + } + peer_abort_upcall(ep); + break; + case ABORTING: + break; + case DEAD: + PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __FUNCTION__); + return CPL_RET_BUF_DONE; + default: + BUG_ON(1); + break; + } + dst_confirm(ep->dst); + + rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + if (!rpl_skb) { + printk(KERN_ERR MOD "%s - cannot allocate skb!\n", + __FUNCTION__); + dst_release(ep->dst); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + put_ep(&ep->com); + return CPL_RET_BUF_DONE; + } + rpl_skb->priority = CPL_PRIORITY_DATA; + rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl)); + rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); + rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid)); + OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid)); + rpl->cmd = CPL_ABORT_NO_RST; + ep->com.tdev->send(ep->com.tdev, rpl_skb); + if (state != ABORTING) { + state_set(&ep->com, DEAD); + release_ep_resources(ep); + } + return CPL_RET_BUF_DONE; +} + +static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + struct iwch_qp_attributes attrs; + unsigned long flags; + int release = 0; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + BUG_ON(!ep); + + /* The cm_id may be null if we failed to connect */ + spin_lock_irqsave(&ep->com.lock, flags); + switch (ep->com.state) { + case CLOSING: + start_ep_timer(ep); + __state_set(&ep->com, MORIBUND); + break; + case MORIBUND: + stop_ep_timer(ep); + if ((ep->com.cm_id) && (ep->com.qp)) { + attrs.next_state = IWCH_QP_STATE_IDLE; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, + IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + close_complete_upcall(ep); + __state_set(&ep->com, DEAD); + release = 1; + break; + case DEAD: + default: + BUG_ON(1); + break; + } + spin_unlock_irqrestore(&ep->com.lock, flags); + if (release) + release_ep_resources(ep); + return CPL_RET_BUF_DONE; +} + +/* + * T3A does 3 things when a TERM is received: + * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet + * 2) generate an async event on the QP with the TERMINATE opcode + * 3) post a TERMINATE opcde cqe into the associated CQ. + * + * For (1), we save the message in the qp for later consumer consumption. + * For (2), we move the QP into TERMINATE, post a QP event and disconnect. + * For (3), we toss the CQE in cxio_poll_cq(). + * + * terminate() handles case (1)... 
+ */ +static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p\n", __FUNCTION__, ep); + skb_pull(skb, sizeof(struct cpl_rdma_terminate)); + PDBG("%s saving %d bytes of term msg\n", __FUNCTION__, skb->len); + memcpy(ep->com.qp->attr.terminate_buffer, skb->data, skb->len); + ep->com.qp->attr.terminate_msg_len = skb->len; + ep->com.qp->attr.is_terminate_local = 0; + return CPL_RET_BUF_DONE; +} + +static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct cpl_rdma_ec_status *rep = cplhdr(skb); + struct iwch_ep *ep = ctx; + + PDBG("%s ep %p tid %u status %d\n", __FUNCTION__, ep, ep->hwtid, + rep->status); + if (rep->status) { + struct iwch_qp_attributes attrs; + + printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n", + __FUNCTION__, ep->hwtid); + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + abort_connection(ep, NULL, GFP_KERNEL); + } + return CPL_RET_BUF_DONE; +} + +static void ep_timeout(unsigned long arg) +{ + struct iwch_ep *ep = (struct iwch_ep *)arg; + struct iwch_qp_attributes attrs; + unsigned long flags; + + spin_lock_irqsave(&ep->com.lock, flags); + PDBG("%s ep %p tid %u state %d\n", __FUNCTION__, ep, ep->hwtid, + ep->com.state); + switch (ep->com.state) { + case MPA_REQ_SENT: + connect_reply_upcall(ep, -ETIMEDOUT); + break; + case MPA_REQ_WAIT: + break; + case MORIBUND: + if (ep->com.cm_id && ep->com.qp) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + } + break; + default: + BUG(); + } + __state_set(&ep->com, CLOSING); + spin_unlock_irqrestore(&ep->com.lock, flags); + abort_connection(ep, NULL, GFP_ATOMIC); + put_ep(&ep->com); +} + +int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + struct iwch_ep *ep = to_ep(cm_id); + PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid); + + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + state_set(&ep->com, CLOSING); + if (mpa_rev == 0) + abort_connection(ep, NULL, GFP_KERNEL); + else { + err = send_mpa_reject(ep, pdata, pdata_len); + err = send_halfclose(ep, GFP_KERNEL); + } + return 0; +} + +int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err; + struct iwch_qp_attributes attrs; + enum iwch_qp_attr_mask mask; + struct iwch_ep *ep = to_ep(cm_id); + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_qp *qp = get_qhp(h, conn_param->qpn); + + PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid); + if (state_read(&ep->com) == DEAD) { + put_ep(&ep->com); + return -ECONNRESET; + } + + BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD); + BUG_ON(!qp); + + if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) || + (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) { + abort_connection(ep, NULL, GFP_KERNEL); + return -EINVAL; + } + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = qp; + + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + PDBG("%s %d ird %d ord %d\n", __FUNCTION__, __LINE__, ep->ird, ep->ord); + get_ep(&ep->com); + err = send_mpa_reply(ep, conn_param->private_data, + conn_param->private_data_len); + if (err) { + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + abort_connection(ep, NULL, GFP_KERNEL); + 
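/* drop the reference taken by get_ep() before send_mpa_reply() */ +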
put_ep(&ep->com); + return err; + } + + /* bind QP to EP and move to RTS */ + attrs.mpa_attr = ep->mpa_attr; + attrs.max_ird = ep->ord; + attrs.max_ord = ep->ord; + attrs.llp_stream_handle = ep; + attrs.next_state = IWCH_QP_STATE_RTS; + + /* bind QP and TID with INIT_WR */ + mask = IWCH_QP_ATTR_NEXT_STATE | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_MAX_ORD; + + err = iwch_modify_qp(ep->com.qp->rhp, + ep->com.qp, mask, &attrs, 1); + + if (err) { + ep->com.cm_id = NULL; + ep->com.qp = NULL; + cm_id->rem_ref(cm_id); + abort_connection(ep, NULL, GFP_KERNEL); + } else { + state_set(&ep->com, FPDU_MODE); + established_upcall(ep); + } + put_ep(&ep->com); + return err; +} + +int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_ep *ep; + struct rtable *rt; + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__); + err = -ENOMEM; + goto out; + } + init_timer(&ep->timer); + ep->plen = conn_param->private_data_len; + if (ep->plen) + memcpy(ep->mpa_pkt + sizeof(struct mpa_message), + conn_param->private_data, ep->plen); + ep->ird = conn_param->ird; + ep->ord = conn_param->ord; + ep->com.tdev = h->rdev.t3cdev_p; + + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->com.qp = get_qhp(h, conn_param->qpn); + BUG_ON(!ep->com.qp); + PDBG("%s qpn 0x%x qp %p cm_id %p\n", __FUNCTION__, conn_param->qpn, + ep->com.qp, cm_id); + + /* + * Allocate an active TID to initiate a TCP connection. + */ + ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->atid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__); + err = -ENOMEM; + goto fail2; + } + + /* find a route */ + rt = find_route(h->rdev.t3cdev_p, + cm_id->local_addr.sin_addr.s_addr, + cm_id->remote_addr.sin_addr.s_addr, + cm_id->local_addr.sin_port, + cm_id->remote_addr.sin_port, IPTOS_LOWDELAY); + if (!rt) { + printk(KERN_ERR MOD "%s - cannot find route.\n", __FUNCTION__); + err = -EHOSTUNREACH; + goto fail3; + } + ep->dst = &rt->u.dst; + + /* get a l2t entry */ + ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst->neighbour, + ep->dst->neighbour->dev); + if (!ep->l2t) { + printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __FUNCTION__); + err = -ENOMEM; + goto fail4; + } + + state_set(&ep->com, CONNECTING); + ep->tos = IPTOS_LOWDELAY; + ep->com.local_addr = cm_id->local_addr; + ep->com.remote_addr = cm_id->remote_addr; + + /* send connect request to rnic */ + err = send_connect(ep); + if (!err) + goto out; + + l2t_release(L2DATA(h->rdev.t3cdev_p), ep->l2t); +fail4: + dst_release(ep->dst); +fail3: + cxgb3_free_atid(ep->com.tdev, ep->atid); +fail2: + put_ep(&ep->com); +out: + return err; +} + +int iwch_create_listen(struct iw_cm_id *cm_id, int backlog) +{ + int err = 0; + struct iwch_dev *h = to_iwch_dev(cm_id->device); + struct iwch_listen_ep *ep; + + + might_sleep(); + + ep = alloc_ep(sizeof(*ep), GFP_KERNEL); + if (!ep) { + printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__); + err = -ENOMEM; + goto fail1; + } + PDBG("%s ep %p\n", __FUNCTION__, ep); + ep->com.tdev = h->rdev.t3cdev_p; + cm_id->add_ref(cm_id); + ep->com.cm_id = cm_id; + ep->backlog = backlog; + ep->com.local_addr = cm_id->local_addr; + + /* + * Allocate a server TID. 
+ */ + ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep); + if (ep->stid == -1) { + printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__); + err = -ENOMEM; + goto fail2; + } + + state_set(&ep->com, LISTEN); + err = listen_start(ep); + if (err) + goto fail3; + + /* wait for pass_open_rpl */ + wait_event(ep->com.waitq, ep->com.rpl_done); + err = ep->com.rpl_err; + if (!err) { + cm_id->provider_data = ep; + goto out; + } +fail3: + cxgb3_free_stid(ep->com.tdev, ep->stid); +fail2: + put_ep(&ep->com); +fail1: +out: + return err; +} + +int iwch_destroy_listen(struct iw_cm_id *cm_id) +{ + int err; + struct iwch_listen_ep *ep = to_listen_ep(cm_id); + + PDBG("%s ep %p\n", __FUNCTION__, ep); + + might_sleep(); + state_set(&ep->com, DEAD); + ep->com.rpl_done = 0; + ep->com.rpl_err = 0; + err = listen_stop(ep); + wait_event(ep->com.waitq, ep->com.rpl_done); + cxgb3_free_stid(ep->com.tdev, ep->stid); + err = ep->com.rpl_err; + cm_id->rem_ref(cm_id); + put_ep(&ep->com); + return err; +} + +int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp) +{ + int ret=0; + unsigned long flags; + int close = 0; + + spin_lock_irqsave(&ep->com.lock, flags); + + PDBG("%s ep %p state %s, abrupt %d\n", __FUNCTION__, ep, + states[ep->com.state], abrupt); + + if (ep->com.state == DEAD) { + PDBG("%s already dead ep %p\n", __FUNCTION__, ep); + goto out; + } + + if (abrupt) { + if (ep->com.state != ABORTING) { + ep->com.state = ABORTING; + close = 1; + } + goto out; + } + + switch (ep->com.state) { + case MPA_REQ_WAIT: + case MPA_REQ_SENT: + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: + ep->com.state = CLOSING; + close = 1; + break; + case CLOSING: + start_ep_timer(ep); + ep->com.state = MORIBUND; + close = 1; + break; + case MORIBUND: + break; + default: + BUG(); + break; + } +out: + spin_unlock_irqrestore(&ep->com.lock, flags); + if (close) { + if (abrupt) + ret = send_abort(ep, NULL, gfp); + else + ret = send_halfclose(ep, gfp); + } + return ret; +} + +int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, + struct l2t_entry *l2t) +{ + struct iwch_ep *ep = ctx; + + if (ep->dst != old) + return 0; + + PDBG("%s ep %p redirect to dst %p l2t %p\n", __FUNCTION__, ep, new, + l2t); + dst_hold(new); + l2t_release(L2DATA(ep->com.tdev), ep->l2t); + ep->l2t = l2t; + dst_release(old); + ep->dst = new; + return 1; +} + +/* + * All the CM events are handled on a work queue to have a safe context. + */ +static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx) +{ + struct iwch_ep_common *epc = ctx; + + get_ep(epc); + + /* + * Save ctx and tdev in the skb->cb area. + */ + *((void **) skb->cb) = ctx; + *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev; + + /* + * Queue the skb and schedule the worker thread. + */ + skb_queue_tail(&rxq, skb); + queue_work(workq, &skb_work); + return 0; +} + +int __init iwch_cm_init(void) +{ + skb_queue_head_init(&rxq); + + workq = create_singlethread_workqueue("iw_cxgb3"); + if (!workq) + return -ENOMEM; + + /* + * All upcalls from the T3 Core go to sched() to + * schedule the processing on a work queue. 
+ */ + t3c_handlers[CPL_ACT_ESTABLISH] = sched; + t3c_handlers[CPL_ACT_OPEN_RPL] = sched; + t3c_handlers[CPL_RX_DATA] = sched; + t3c_handlers[CPL_TX_DMA_ACK] = sched; + t3c_handlers[CPL_ABORT_RPL_RSS] = sched; + t3c_handlers[CPL_ABORT_RPL] = sched; + t3c_handlers[CPL_PASS_OPEN_RPL] = sched; + t3c_handlers[CPL_CLOSE_LISTSRV_RPL] = sched; + t3c_handlers[CPL_PASS_ACCEPT_REQ] = sched; + t3c_handlers[CPL_PASS_ESTABLISH] = sched; + t3c_handlers[CPL_PEER_CLOSE] = sched; + t3c_handlers[CPL_CLOSE_CON_RPL] = sched; + t3c_handlers[CPL_ABORT_REQ_RSS] = sched; + t3c_handlers[CPL_RDMA_TERMINATE] = sched; + t3c_handlers[CPL_RDMA_EC_STATUS] = sched; + + /* + * These are the real handlers that are called from a + * work queue. + */ + work_handlers[CPL_ACT_ESTABLISH] = act_establish; + work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl; + work_handlers[CPL_RX_DATA] = rx_data; + work_handlers[CPL_TX_DMA_ACK] = tx_ack; + work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl; + work_handlers[CPL_ABORT_RPL] = abort_rpl; + work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl; + work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl; + work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req; + work_handlers[CPL_PASS_ESTABLISH] = pass_establish; + work_handlers[CPL_PEER_CLOSE] = peer_close; + work_handlers[CPL_ABORT_REQ_RSS] = peer_abort; + work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl; + work_handlers[CPL_RDMA_TERMINATE] = terminate; + work_handlers[CPL_RDMA_EC_STATUS] = ec_status; + return 0; +} + +void __exit iwch_cm_term(void) +{ + flush_workqueue(workq); + destroy_workqueue(workq); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h new file mode 100644 index 00000000000..7c810d90427 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _IWCH_CM_H_ +#define _IWCH_CM_H_ + +#include +#include +#include +#include + +#include +#include + +#include "cxgb3_offload.h" +#include "iwch_provider.h" + +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" + +#define MPA_MAX_PRIVATE_DATA 256 +#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */ +#define MPA_REJECT 0x20 +#define MPA_CRC 0x40 +#define MPA_MARKERS 0x80 +#define MPA_FLAGS_MASK 0xE0 + +#define put_ep(ep) { \ + PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_put(&((ep)->kref), __free_ep); \ +} + +#define get_ep(ep) { \ + PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \ + ep, atomic_read(&((ep)->kref.refcount))); \ + kref_get(&((ep)->kref)); \ +} + +struct mpa_message { + u8 key[16]; + u8 flags; + u8 revision; + __be16 private_data_size; + u8 private_data[0]; +}; + +struct terminate_message { + u8 layer_etype; + u8 ecode; + __be16 hdrct_rsvd; + u8 len_hdrs[0]; +}; + +#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28) + +enum iwch_layers_types { + LAYER_RDMAP = 0x00, + LAYER_DDP = 0x10, + LAYER_MPA = 0x20, + RDMAP_LOCAL_CATA = 0x00, + RDMAP_REMOTE_PROT = 0x01, + RDMAP_REMOTE_OP = 0x02, + DDP_LOCAL_CATA = 0x00, + DDP_TAGGED_ERR = 0x01, + DDP_UNTAGGED_ERR = 0x02, + DDP_LLP = 0x03 +}; + +enum iwch_rdma_ecodes { + RDMAP_INV_STAG = 0x00, + RDMAP_BASE_BOUNDS = 0x01, + RDMAP_ACC_VIOL = 0x02, + RDMAP_STAG_NOT_ASSOC = 0x03, + RDMAP_TO_WRAP = 0x04, + RDMAP_INV_VERS = 0x05, + RDMAP_INV_OPCODE = 0x06, + RDMAP_STREAM_CATA = 0x07, + RDMAP_GLOBAL_CATA = 0x08, + RDMAP_CANT_INV_STAG = 0x09, + RDMAP_UNSPECIFIED = 0xff +}; + +enum iwch_ddp_ecodes { + DDPT_INV_STAG = 0x00, + DDPT_BASE_BOUNDS = 0x01, + DDPT_STAG_NOT_ASSOC = 0x02, + DDPT_TO_WRAP = 0x03, + DDPT_INV_VERS = 0x04, + DDPU_INV_QN = 0x01, + DDPU_INV_MSN_NOBUF = 0x02, + DDPU_INV_MSN_RANGE = 0x03, + DDPU_INV_MO = 0x04, + DDPU_MSG_TOOBIG = 0x05, + DDPU_INV_VERS = 0x06 +}; + +enum iwch_mpa_ecodes { + MPA_CRC_ERR = 0x02, + MPA_MARKER_ERR = 0x03 +}; + +enum iwch_ep_state { + IDLE = 0, + LISTEN, + CONNECTING, + MPA_REQ_WAIT, + MPA_REQ_SENT, + MPA_REQ_RCVD, + MPA_REP_SENT, + FPDU_MODE, + ABORTING, + CLOSING, + MORIBUND, + DEAD, +}; + +struct iwch_ep_common { + struct iw_cm_id *cm_id; + struct iwch_qp *qp; + struct t3cdev *tdev; + enum iwch_ep_state state; + struct kref kref; + spinlock_t lock; + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + wait_queue_head_t waitq; + int rpl_done; + int rpl_err; +}; + +struct iwch_listen_ep { + struct iwch_ep_common com; + unsigned int stid; + int backlog; +}; + +struct iwch_ep { + struct iwch_ep_common com; + struct iwch_ep *parent_ep; + struct timer_list timer; + unsigned int atid; + u32 hwtid; + u32 snd_seq; + struct l2t_entry *l2t; + struct dst_entry *dst; + struct sk_buff *mpa_skb; + struct iwch_mpa_attributes mpa_attr; + unsigned int mpa_pkt_len; + u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA]; + u8 tos; + u16 emss; + u16 plen; + u32 ird; + u32 ord; +}; + +static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id) +{ + return cm_id->provider_data; +} + +static inline int compute_wscale(int win) +{ + int wscale = 0; + + while (wscale < 14 && (65535<cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (!qhp) + wq = NULL; + else { + spin_lock(&qhp->lock); + wq = &(qhp->wq); 
+ } + ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, + &credit); + if (t3a_device(chp->rhp) && credit) { + PDBG("%s updating %d cq credits on id %d\n", __FUNCTION__, + credit, chp->cq.cqid); + cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit); + } + + if (ret) { + ret = -EAGAIN; + goto out; + } + ret = 1; + + wc->wr_id = cookie; + wc->qp = &qhp->ibqp; + wc->vendor_err = CQE_STATUS(cqe); + + PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x " + "lo 0x%x cookie 0x%llx\n", __FUNCTION__, + CQE_QPID(cqe), CQE_TYPE(cqe), + CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe), + CQE_WRID_LOW(cqe), (unsigned long long) cookie); + + if (CQE_TYPE(cqe) == 0) { + if (!CQE_STATUS(cqe)) + wc->byte_len = CQE_LEN(cqe); + else + wc->byte_len = 0; + wc->opcode = IB_WC_RECV; + } else { + switch (CQE_OPCODE(cqe)) { + case T3_RDMA_WRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case T3_READ_REQ: + wc->opcode = IB_WC_RDMA_READ; + wc->byte_len = CQE_LEN(cqe); + break; + case T3_SEND: + case T3_SEND_WITH_SE: + wc->opcode = IB_WC_SEND; + break; + case T3_BIND_MW: + wc->opcode = IB_WC_BIND_MW; + break; + + /* these aren't supported yet */ + case T3_SEND_WITH_INV: + case T3_SEND_WITH_SE_INV: + case T3_LOCAL_INV: + case T3_FAST_REGISTER: + default: + printk(KERN_ERR MOD "Unexpected opcode %d " + "in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + goto out; + } + } + + if (cqe_flushed) + wc->status = IB_WC_WR_FLUSH_ERR; + else { + + switch (CQE_STATUS(cqe)) { + case TPT_ERR_SUCCESS: + wc->status = IB_WC_SUCCESS; + break; + case TPT_ERR_STAG: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_PDID: + wc->status = IB_WC_LOC_PROT_ERR; + break; + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + wc->status = IB_WC_LOC_ACCESS_ERR; + break; + case TPT_ERR_WRAP: + wc->status = IB_WC_GENERAL_ERR; + break; + case TPT_ERR_BOUND: + wc->status = IB_WC_LOC_LEN_ERR; + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + wc->status = IB_WC_MW_BIND_ERR; + break; + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + case TPT_ERR_OPCODE: + wc->status = IB_WC_FATAL_ERR; + break; + case TPT_ERR_SWFLUSH: + wc->status = IB_WC_WR_FLUSH_ERR; + break; + default: + printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for " + "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe)); + ret = -EINVAL; + } + } +out: + if (wq) + spin_unlock(&qhp->lock); + return ret; +} + +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + unsigned long flags; + int npolled; + int err = 0; + + chp = to_iwch_cq(ibcq); + rhp = chp->rhp; + + spin_lock_irqsave(&chp->lock, flags); + for (npolled = 0; npolled < num_entries; ++npolled) { +#ifdef DEBUG + int i=0; +#endif + + /* + * Because T3 can post CQEs that are _not_ associated + * with a WR, we might have to poll again after removing + * one of these. 
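+ * iwch_poll_cq_one() returns -EAGAIN in that case, so we simply poll again.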
+ */ + do { + err = iwch_poll_cq_one(rhp, chp, wc + npolled); +#ifdef DEBUG + BUG_ON(++i > 1000); +#endif + } while (err == -EAGAIN); + if (err <= 0) + break; + } + spin_unlock_irqrestore(&chp->lock, flags); + + if (err < 0) + return err; + else { + return npolled; + } +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c new file mode 100644 index 00000000000..a6efa8fe15d --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" +#include "cxio_wr.h" + +static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp, + struct respQ_msg_t *rsp_msg, + enum ib_event_type ib_event, + int send_term) +{ + struct ib_event event; + struct iwch_qp_attributes attrs; + struct iwch_qp *qhp; + + printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + + spin_lock(&rnicp->lock); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + + if (!qhp) { + printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n", + __FUNCTION__, CQE_STATUS(rsp_msg->cqe), + CQE_QPID(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + if ((qhp->attr.state == IWCH_QP_STATE_ERROR) || + (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) { + PDBG("%s AE received after RTS - " + "qp state %d qpid 0x%x status 0x%x\n", __FUNCTION__, + qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + return; + } + + atomic_inc(&qhp->refcnt); + spin_unlock(&rnicp->lock); + + event.event = ib_event; + event.device = chp->ibcq.device; + if (ib_event == IB_EVENT_CQ_ERR) + event.element.cq = &chp->ibcq; + else + event.element.qp = &qhp->ibqp; + + if (qhp->ibqp.event_handler) + (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context); + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_TERMINATE; + iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (send_term) + iwch_post_terminate(qhp, rsp_msg); + } + + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); +} + +void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb) +{ + struct iwch_dev *rnicp; + struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data; + struct iwch_cq *chp; + struct iwch_qp *qhp; + u32 cqid = RSPQ_CQID(rsp_msg); + + rnicp = (struct iwch_dev *) rdev_p->ulp; + spin_lock(&rnicp->lock); + chp = get_chp(rnicp, cqid); + qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe)); + if (!chp || !qhp) { + printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d " + "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n", + cqid, CQE_QPID(rsp_msg->cqe), + CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe), + CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe), + CQE_WRID_LOW(rsp_msg->cqe)); + spin_unlock(&rnicp->lock); + goto out; + } + iwch_qp_add_ref(&qhp->ibqp); + atomic_inc(&chp->refcnt); + spin_unlock(&rnicp->lock); + + /* + * 1) completion of our sending a TERMINATE. + * 2) incoming TERMINATE message. 
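+ * SQ_TYPE() tells them apart: it is set for (1) and clear for (2).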
+ */ + if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) && + (CQE_STATUS(rsp_msg->cqe) == 0)) { + if (SQ_TYPE(rsp_msg->cqe)) { + PDBG("%s QPID 0x%x ep %p disconnecting\n", + __FUNCTION__, qhp->wq.qpid, qhp->ep); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } else { + PDBG("%s post REQ_ERR AE QPID 0x%x\n", __FUNCTION__, + qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, + IB_EVENT_QP_REQ_ERR, 0); + iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC); + } + goto done; + } + + /* Bad incoming Read request */ + if (SQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + /* Bad incoming write */ + if (RQ_TYPE(rsp_msg->cqe) && + (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) { + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1); + goto done; + } + + switch (CQE_STATUS(rsp_msg->cqe)) { + + /* Completion Events */ + case TPT_ERR_SUCCESS: + + /* + * Confirm the destination entry if this is a RECV completion. + */ + if (qhp->ep && SQ_TYPE(rsp_msg->cqe)) + dst_confirm(qhp->ep->dst); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + break; + + case TPT_ERR_STAG: + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + case TPT_ERR_WRAP: + case TPT_ERR_BOUND: + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + printk(KERN_ERR "%s - CQE Err qpid 0x%x opcode %d status 0x%x " + "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__, + CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe), + CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe), + CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe)); + (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1); + break; + + /* Device Fatal Errors */ + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: + case TPT_ERR_INTERNAL_ERR: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1); + break; + + /* QP Fatal Errors */ + case TPT_ERR_OUT_OF_RQE: + case TPT_ERR_PBL_ADDR_BOUND: + case TPT_ERR_CRC: + case TPT_ERR_MARKER: + case TPT_ERR_PDU_LEN_ERR: + case TPT_ERR_DDP_VERSION: + case TPT_ERR_RDMA_VERSION: + case TPT_ERR_OPCODE: + case TPT_ERR_DDP_QUEUE_NUM: + case TPT_ERR_MSN: + case TPT_ERR_TBIT: + case TPT_ERR_MO: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_RQE_ADDR_BOUND: + case TPT_ERR_IRD_OVERFLOW: + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + + default: + printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n", + CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid); + post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1); + break; + } +done: + if (atomic_dec_and_test(&chp->refcnt)) + wake_up(&chp->wait); + iwch_qp_rem_ref(&qhp->ibqp); +out: + dev_kfree_skb_irq(skb); +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c new file mode 100644 index 00000000000..2b6cd53bb3f --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include +#include + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list) +{ + u32 stag; + u32 mmid; + + + if (cxio_register_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return -ENOMEM; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp); + return 0; +} + +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages) +{ + u32 stag; + u32 mmid; + + + /* We could support this... 
*/ + if (npages > mhp->attr.pbl_size) + return -ENOMEM; + + stag = mhp->attr.stag; + if (cxio_reregister_phys_mem(&rhp->rdev, + &stag, mhp->attr.pdid, + mhp->attr.perms, + mhp->attr.zbva, + mhp->attr.va_fbo, + mhp->attr.len, + shift-12, + page_list, + &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + return -ENOMEM; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp); + return 0; +} + +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list) +{ + u64 mask; + int i, j, n; + + mask = 0; + *total_size = 0; + for (i = 0; i < num_phys_buf; ++i) { + if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) + return -EINVAL; + if (i != 0 && i != num_phys_buf - 1 && + (buffer_list[i].size & ~PAGE_MASK)) + return -EINVAL; + *total_size += buffer_list[i].size; + if (i > 0) + mask |= buffer_list[i].addr; + } + + if (*total_size > 0xFFFFFFFFULL) + return -ENOMEM; + + /* Find largest page shift we can use to cover buffers */ + for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) + if (num_phys_buf > 1) { + if ((1ULL << *shift) & mask) + break; + } else + if (1ULL << *shift >= + buffer_list[0].size + + (buffer_list[0].addr & ((1ULL << *shift) - 1))) + break; + + buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); + buffer_list[0].addr &= ~0ull << *shift; + + *npages = 0; + for (i = 0; i < num_phys_buf; ++i) + *npages += (buffer_list[i].size + + (1ULL << *shift) - 1) >> *shift; + + if (!*npages) + return -EINVAL; + + *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); + if (!*page_list) + return -ENOMEM; + + n = 0; + for (i = 0; i < num_phys_buf; ++i) + for (j = 0; + j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; + ++j) + (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + + ((u64) j << *shift)); + + PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", + __FUNCTION__, (unsigned long long) *iova_start, + (unsigned long long) mask, *shift, (unsigned long long) *total_size, + *npages); + + return 0; + +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c new file mode 100644 index 00000000000..6861087d776 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -0,0 +1,1203 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "cxio_hal.h" +#include "iwch.h" +#include "iwch_provider.h" +#include "iwch_cm.h" +#include "iwch_user.h" + +static int iwch_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + return -ENOSYS; +} + +static struct ib_ah *iwch_ah_create(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + return ERR_PTR(-ENOSYS); +} + +static int iwch_ah_destroy(struct ib_ah *ah) +{ + return -ENOSYS; +} + +static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + return -ENOSYS; +} + +static int iwch_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + struct ib_wc *in_wc, + struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + return -ENOSYS; +} + +static int iwch_dealloc_ucontext(struct ib_ucontext *context) +{ + struct iwch_dev *rhp = to_iwch_dev(context->device); + struct iwch_ucontext *ucontext = to_iwch_ucontext(context); + struct iwch_mm_entry *mm, *tmp; + + PDBG("%s context %p\n", __FUNCTION__, context); + list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) + kfree(mm); + cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); + kfree(ucontext); + return 0; +} + +static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct iwch_ucontext *context; + struct iwch_dev *rhp = to_iwch_dev(ibdev); + + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + cxio_init_ucontext(&rhp->rdev, &context->uctx); + INIT_LIST_HEAD(&context->mmaps); + spin_lock_init(&context->mmap_lock); + return &context->ibucontext; +} + +static int iwch_destroy_cq(struct ib_cq *ib_cq) +{ + struct iwch_cq *chp; + + PDBG("%s ib_cq %p\n", __FUNCTION__, ib_cq); + chp = to_iwch_cq(ib_cq); + + remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); + atomic_dec(&chp->refcnt); + wait_event(chp->wait, !atomic_read(&chp->refcnt)); + + cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); + kfree(chp); + return 0; +} + +static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, + struct ib_ucontext *ib_context, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_cq *chp; + struct iwch_create_cq_resp uresp; + struct iwch_create_cq_req ureq; + struct iwch_ucontext *ucontext = NULL; + + PDBG("%s ib_dev %p entries %d\n", __FUNCTION__, ibdev, entries); + rhp = to_iwch_dev(ibdev); + chp = kzalloc(sizeof(*chp), GFP_KERNEL); + if (!chp) + return ERR_PTR(-ENOMEM); + + if (ib_context) { + ucontext = to_iwch_ucontext(ib_context); + if (!t3a_device(rhp)) { + if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) { + kfree(chp); + return ERR_PTR(-EFAULT); + } + chp->user_rptr_addr = (u32 __user 
*)(unsigned long)ureq.user_rptr_addr; + } + } + + if (t3a_device(rhp)) { + + /* + * T3A: Add some fluff to handle extra CQEs inserted + * for various errors. + * Additional CQE possibilities: + * TERMINATE, + * incoming RDMA WRITE Failures + * incoming RDMA READ REQUEST FAILUREs + * NOTE: We cannot ensure the CQ won't overflow. + */ + entries += 16; + } + entries = roundup_pow_of_two(entries); + chp->cq.size_log2 = ilog2(entries); + + if (cxio_create_cq(&rhp->rdev, &chp->cq)) { + kfree(chp); + return ERR_PTR(-ENOMEM); + } + chp->rhp = rhp; + chp->ibcq.cqe = (1 << chp->cq.size_log2) - 1; + spin_lock_init(&chp->lock); + atomic_set(&chp->refcnt, 1); + init_waitqueue_head(&chp->wait); + insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); + + if (ucontext) { + struct iwch_mm_entry *mm; + + mm = kmalloc(sizeof *mm, GFP_KERNEL); + if (!mm) { + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-ENOMEM); + } + uresp.cqid = chp->cq.cqid; + uresp.size_log2 = chp->cq.size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm); + iwch_destroy_cq(&chp->ibcq); + return ERR_PTR(-EFAULT); + } + mm->key = uresp.key; + mm->addr = virt_to_phys(chp->cq.queue); + mm->len = PAGE_ALIGN((1UL << uresp.size_log2) * + sizeof (struct t3_cqe)); + insert_mmap(ucontext, mm); + } + PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n", + chp->cq.cqid, chp, (1 << chp->cq.size_log2), + (unsigned long long) chp->cq.dma_addr); + return &chp->ibcq; +} + +static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ +#ifdef notyet + struct iwch_cq *chp = to_iwch_cq(cq); + struct t3_cq oldcq, newcq; + int ret; + + PDBG("%s ib_cq %p cqe %d\n", __FUNCTION__, cq, cqe); + + /* We don't downsize... 
*/
+	if (cqe <= cq->cqe)
+		return 0;
+
+	/* create new t3_cq with new size */
+	cqe = roundup_pow_of_two(cqe+1);
+	newcq.size_log2 = ilog2(cqe);
+
+	/* Don't allow resize to less than the current wce count */
+	if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
+		return -ENOMEM;
+	}
+
+	/* Quiesce all QPs using this CQ */
+	ret = iwch_quiesce_qps(chp);
+	if (ret) {
+		return ret;
+	}
+
+	ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
+	if (ret) {
+		return ret;
+	}
+
+	/* copy CQEs */
+	memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
+	       sizeof(struct t3_cqe));
+
+	/* old iwch_qp gets new t3_cq but keeps old cqid */
+	oldcq = chp->cq;
+	chp->cq = newcq;
+	chp->cq.cqid = oldcq.cqid;
+
+	/* resize new t3_cq to update the HW context */
+	ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
+	if (ret) {
+		chp->cq = oldcq;
+		return ret;
+	}
+	chp->ibcq.cqe = (1 << chp->cq.size_log2) - 1;
+
+	/* destroy old t3_cq */
+	oldcq.cqid = newcq.cqid;
+	ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
+	if (ret) {
+		printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n",
+			__FUNCTION__, ret);
+	}
+
+	/* add user hooks here */
+
+	/* resume qps */
+	ret = iwch_resume_qps(chp);
+	return ret;
+#else
+	return -ENOSYS;
+#endif
+}
+
+static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+	struct iwch_dev *rhp;
+	struct iwch_cq *chp;
+	enum t3_cq_opcode cq_op;
+	int err;
+	unsigned long flag;
+	u32 rptr;
+
+	chp = to_iwch_cq(ibcq);
+	rhp = chp->rhp;
+	if (notify == IB_CQ_SOLICITED)
+		cq_op = CQ_ARM_SE;
+	else
+		cq_op = CQ_ARM_AN;
+	if (chp->user_rptr_addr) {
+		if (get_user(rptr, chp->user_rptr_addr))
+			return -EFAULT;
+		spin_lock_irqsave(&chp->lock, flag);
+		chp->cq.rptr = rptr;
+	} else
+		spin_lock_irqsave(&chp->lock, flag);
+	PDBG("%s rptr 0x%x\n", __FUNCTION__, chp->cq.rptr);
+	err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0);
+	spin_unlock_irqrestore(&chp->lock, flag);
+	if (err)
+		printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err,
+		       chp->cq.cqid);
+	return err;
+}
+
+static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+	int len = vma->vm_end - vma->vm_start;
+	u32 key = vma->vm_pgoff << PAGE_SHIFT;
+	struct cxio_rdev *rdev_p;
+	int ret = 0;
+	struct iwch_mm_entry *mm;
+	struct iwch_ucontext *ucontext;
+
+	PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __FUNCTION__, vma->vm_pgoff,
+	     key, len);
+
+	if (vma->vm_start & (PAGE_SIZE-1)) {
+		return -EINVAL;
+	}
+
+	rdev_p = &(to_iwch_dev(context->device)->rdev);
+	ucontext = to_iwch_ucontext(context);
+
+	mm = remove_mmap(ucontext, key, len);
+	if (!mm)
+		return -EINVAL;
+	kfree(mm);
+
+	if ((mm->addr >= rdev_p->rnic_info.udbell_physbase) &&
+	    (mm->addr < (rdev_p->rnic_info.udbell_physbase +
+		       rdev_p->rnic_info.udbell_len))) {
+
+		/*
+		 * Map T3 DB register.
+		 */
+		if (vma->vm_flags & VM_READ) {
+			return -EPERM;
+		}
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+		vma->vm_flags &= ~VM_MAYREAD;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 mm->addr >> PAGE_SHIFT,
+					 len, vma->vm_page_prot);
+	} else {
+
+		/*
+		 * Map WQ or CQ contig dma memory...
+ */ + ret = remap_pfn_range(vma, vma->vm_start, + mm->addr >> PAGE_SHIFT, + len, vma->vm_page_prot); + } + + return ret; +} + +static int iwch_deallocate_pd(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + + php = to_iwch_pd(pd); + rhp = php->rhp; + PDBG("%s ibpd %p pdid 0x%x\n", __FUNCTION__, pd, php->pdid); + cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); + kfree(php); + return 0; +} + +static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct iwch_pd *php; + u32 pdid; + struct iwch_dev *rhp; + + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + rhp = (struct iwch_dev *) ibdev; + pdid = cxio_hal_get_pdid(rhp->rdev.rscp); + if (!pdid) + return ERR_PTR(-EINVAL); + php = kzalloc(sizeof(*php), GFP_KERNEL); + if (!php) { + cxio_hal_put_pdid(rhp->rdev.rscp, pdid); + return ERR_PTR(-ENOMEM); + } + php->pdid = pdid; + php->rhp = rhp; + if (context) { + if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) { + iwch_deallocate_pd(&php->ibpd); + return ERR_PTR(-EFAULT); + } + } + PDBG("%s pdid 0x%0x ptr 0x%p\n", __FUNCTION__, pdid, php); + return &php->ibpd; +} + +static int iwch_dereg_mr(struct ib_mr *ib_mr) +{ + struct iwch_dev *rhp; + struct iwch_mr *mhp; + u32 mmid; + + PDBG("%s ib_mr %p\n", __FUNCTION__, ib_mr); + /* There can be no memory windows */ + if (atomic_read(&ib_mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(ib_mr); + rhp = mhp->rhp; + mmid = mhp->attr.stag >> 8; + cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, + mhp->attr.pbl_addr); + remove_handle(rhp, &rhp->mmidr, mmid); + if (mhp->kva) + kfree((void *) (unsigned long) mhp->kva); + PDBG("%s mmid 0x%x ptr %p\n", __FUNCTION__, mmid, mhp); + kfree(mhp); + return 0; +} + +static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, + u64 *iova_start) +{ + __be64 *page_list; + int shift; + u64 total_size; + int npages; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + int ret; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + php = to_iwch_pd(pd); + rhp = php->rhp; + + acc = iwch_convert_access(acc); + + + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + /* First check that we have enough alignment */ + if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + if (num_phys_buf > 1 && + ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { + ret = -EINVAL; + goto err; + } + + ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, + &total_size, &npages, &shift, &page_list); + if (ret) + goto err; + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + + /* NOTE: TPT perms are backwards from BIND WR perms! 
*/ + mhp->attr.perms = (acc & 0x1) << 3; + mhp->attr.perms |= (acc & 0x2) << 1; + mhp->attr.perms |= (acc & 0x4) >> 1; + mhp->attr.perms |= (acc & 0x8) >> 3; + + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + ret = iwch_register_mem(rhp, php, mhp, shift, page_list); + kfree(page_list); + if (ret) { + goto err; + } + return &mhp->ibmr; +err: + kfree(mhp); + return ERR_PTR(ret); + +} + +static int iwch_reregister_phys_mem(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, + int acc, u64 * iova_start) +{ + + struct iwch_mr mh, *mhp; + struct iwch_pd *php; + struct iwch_dev *rhp; + int new_acc; + __be64 *page_list = NULL; + int shift = 0; + u64 total_size; + int npages; + int ret; + + PDBG("%s ib_mr %p ib_pd %p\n", __FUNCTION__, mr, pd); + + /* There can be no memory windows */ + if (atomic_read(&mr->usecnt)) + return -EINVAL; + + mhp = to_iwch_mr(mr); + rhp = mhp->rhp; + php = to_iwch_pd(mr->pd); + + /* make sure we are on the same adapter */ + if (rhp != php->rhp) + return -EINVAL; + + new_acc = mhp->attr.perms; + + memcpy(&mh, mhp, sizeof *mhp); + + if (mr_rereg_mask & IB_MR_REREG_PD) + php = to_iwch_pd(pd); + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mh.attr.perms = iwch_convert_access(acc); + if (mr_rereg_mask & IB_MR_REREG_TRANS) + ret = build_phys_page_list(buffer_list, num_phys_buf, + iova_start, + &total_size, &npages, + &shift, &page_list); + + ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + kfree(page_list); + if (ret) { + return ret; + } + if (mr_rereg_mask & IB_MR_REREG_PD) + mhp->attr.pdid = php->pdid; + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + mhp->attr.perms = acc; + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + mhp->attr.zbva = 0; + mhp->attr.va_fbo = *iova_start; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) total_size; + mhp->attr.pbl_size = npages; + } + + return 0; +} + + +static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, + int acc, struct ib_udata *udata) +{ + __be64 *pages; + int shift, n, len; + int i, j, k; + int err = 0; + struct ib_umem_chunk *chunk; + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mr *mhp; + struct iwch_reg_user_mr_resp uresp; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + shift = ffs(region->page_size) - 1; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + + n = 0; + list_for_each_entry(chunk, ®ion->chunk_list, list) + n += chunk->nents; + + pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err; + } + + acc = iwch_convert_access(acc); + + i = n = 0; + + list_for_each_entry(chunk, ®ion->chunk_list, list) + for (j = 0; j < chunk->nmap; ++j) { + len = sg_dma_len(&chunk->page_list[j]) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = cpu_to_be64(sg_dma_address( + &chunk->page_list[j]) + + region->page_size * k); + } + } + + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.zbva = 0; + mhp->attr.perms = (acc & 0x1) << 3; + mhp->attr.perms |= (acc & 0x2) << 1; + mhp->attr.perms |= (acc & 0x4) >> 1; + mhp->attr.perms |= (acc & 0x8) >> 3; + mhp->attr.va_fbo = region->virt_base; + mhp->attr.page_size = shift - 12; + mhp->attr.len = (u32) region->length; + mhp->attr.pbl_size = i; + err = iwch_register_mem(rhp, php, mhp, shift, pages); + kfree(pages); + if (err) + goto err; + + if (udata && t3b_device(rhp)) 
{ + uresp.pbl_addr = (mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3; + PDBG("%s user resp pbl_addr 0x%x\n", __FUNCTION__, + uresp.pbl_addr); + + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + iwch_dereg_mr(&mhp->ibmr); + err = -EFAULT; + goto err; + } + } + + return &mhp->ibmr; + +err: + kfree(mhp); + return ERR_PTR(err); +} + +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva; + struct ib_mr *ibmr; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + + /* + * T3 only supports 32 bits of size. + */ + bl.size = 0xffffffff; + bl.addr = 0; + kva = 0; + ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); + return ibmr; +} + +static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd) +{ + struct iwch_dev *rhp; + struct iwch_pd *php; + struct iwch_mw *mhp; + u32 mmid; + u32 stag = 0; + int ret; + + php = to_iwch_pd(pd); + rhp = php->rhp; + mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); + if (!mhp) + return ERR_PTR(-ENOMEM); + ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid); + if (ret) { + kfree(mhp); + return ERR_PTR(ret); + } + mhp->rhp = rhp; + mhp->attr.pdid = php->pdid; + mhp->attr.type = TPT_MW; + mhp->attr.stag = stag; + mmid = (stag) >> 8; + insert_handle(rhp, &rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __FUNCTION__, mmid, mhp, stag); + return &(mhp->ibmw); +} + +static int iwch_dealloc_mw(struct ib_mw *mw) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + u32 mmid; + + mhp = to_iwch_mw(mw); + rhp = mhp->rhp; + mmid = (mw->rkey) >> 8; + cxio_deallocate_window(&rhp->rdev, mhp->attr.stag); + remove_handle(rhp, &rhp->mmidr, mmid); + kfree(mhp); + PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __FUNCTION__, mw, mmid, mhp); + return 0; +} + +static int iwch_destroy_qp(struct ib_qp *ib_qp) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_qp_attributes attrs; + struct iwch_ucontext *ucontext; + + qhp = to_iwch_qp(ib_qp); + rhp = qhp->rhp; + + if (qhp->attr.state == IWCH_QP_STATE_RTS) { + attrs.next_state = IWCH_QP_STATE_ERROR; + iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0); + } + wait_event(qhp->wait, !qhp->ep); + + remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid); + + atomic_dec(&qhp->refcnt); + wait_event(qhp->wait, !atomic_read(&qhp->refcnt)); + + ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context) + : NULL; + cxio_destroy_qp(&rhp->rdev, &qhp->wq, + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); + + PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __FUNCTION__, + ib_qp, qhp->wq.qpid, qhp); + kfree(qhp); + return 0; +} + +static struct ib_qp *iwch_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + struct iwch_pd *php; + struct iwch_cq *schp; + struct iwch_cq *rchp; + struct iwch_create_qp_resp uresp; + int wqsize, sqsize, rqsize; + struct iwch_ucontext *ucontext; + + PDBG("%s ib_pd %p\n", __FUNCTION__, pd); + if (attrs->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + php = to_iwch_pd(pd); + rhp = php->rhp; + schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid); + rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid); + if (!schp || !rchp) + return ERR_PTR(-EINVAL); + + /* The RQT size must be # of entries + 1 rounded up to a power of two */ + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr); + if (rqsize == attrs->cap.max_recv_wr) + rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1); + + /* T3 doesn't support RQT depth < 16 */ + if (rqsize < 16) + rqsize = 16; + + if (rqsize > T3_MAX_RQ_SIZE) + return ERR_PTR(-EINVAL); + + /* + * NOTE: The SQ and total WQ sizes don't need to be + * a power of two. However, all the code assumes + * they are. EG: Q_FREECNT() and friends. + */ + sqsize = roundup_pow_of_two(attrs->cap.max_send_wr); + wqsize = roundup_pow_of_two(rqsize + sqsize); + PDBG("%s wqsize %d sqsize %d rqsize %d\n", __FUNCTION__, + wqsize, sqsize, rqsize); + qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); + if (!qhp) + return ERR_PTR(-ENOMEM); + qhp->wq.size_log2 = ilog2(wqsize); + qhp->wq.rq_size_log2 = ilog2(rqsize); + qhp->wq.sq_size_log2 = ilog2(sqsize); + ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL; + if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq, + ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) { + kfree(qhp); + return ERR_PTR(-ENOMEM); + } + attrs->cap.max_recv_wr = rqsize - 1; + attrs->cap.max_send_wr = sqsize; + qhp->rhp = rhp; + qhp->attr.pd = php->pdid; + qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid; + qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid; + qhp->attr.sq_num_entries = attrs->cap.max_send_wr; + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.sq_max_sges = attrs->cap.max_send_sge; + qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.next_state = IWCH_QP_STATE_IDLE; + + /* + * XXX - These don't get passed in from the openib user + * at create time. The CM sets them via a QP modify. + * Need to fix... 
I think the CM should + */ + qhp->attr.enable_rdma_read = 1; + qhp->attr.enable_rdma_write = 1; + qhp->attr.enable_bind = 1; + qhp->attr.max_ord = 1; + qhp->attr.max_ird = 1; + + spin_lock_init(&qhp->lock); + init_waitqueue_head(&qhp->wait); + atomic_set(&qhp->refcnt, 1); + insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid); + + if (udata) { + + struct iwch_mm_entry *mm1, *mm2; + + mm1 = kmalloc(sizeof *mm1, GFP_KERNEL); + if (!mm1) { + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); + if (!mm2) { + kfree(mm1); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-ENOMEM); + } + + uresp.qpid = qhp->wq.qpid; + uresp.size_log2 = qhp->wq.size_log2; + uresp.sq_size_log2 = qhp->wq.sq_size_log2; + uresp.rq_size_log2 = qhp->wq.rq_size_log2; + spin_lock(&ucontext->mmap_lock); + uresp.key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.db_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) { + kfree(mm1); + kfree(mm2); + iwch_destroy_qp(&qhp->ibqp); + return ERR_PTR(-EFAULT); + } + mm1->key = uresp.key; + mm1->addr = virt_to_phys(qhp->wq.queue); + mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr)); + insert_mmap(ucontext, mm1); + mm2->key = uresp.db_key; + mm2->addr = qhp->wq.udb & PAGE_MASK; + mm2->len = PAGE_SIZE; + insert_mmap(ucontext, mm2); + } + qhp->ibqp.qp_num = qhp->wq.qpid; + init_timer(&(qhp->timer)); + PDBG("%s sq_num_entries %d, rq_num_entries %d " + "qpid 0x%0x qhp %p dma_addr 0x%llx size %d\n", + __FUNCTION__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries, + qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr, + 1 << qhp->wq.size_log2); + return &qhp->ibqp; +} + +static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct iwch_dev *rhp; + struct iwch_qp *qhp; + enum iwch_qp_attr_mask mask = 0; + struct iwch_qp_attributes attrs; + + PDBG("%s ib_qp %p\n", __FUNCTION__, ibqp); + + /* iwarp does not support the RTR state */ + if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR)) + attr_mask &= ~IB_QP_STATE; + + /* Make sure we still have something left to do */ + if (!attr_mask) + return 0; + + memset(&attrs, 0, sizeof attrs); + qhp = to_iwch_qp(ibqp); + rhp = qhp->rhp; + + attrs.next_state = iwch_convert_state(attr->qp_state); + attrs.enable_rdma_read = (attr->qp_access_flags & + IB_ACCESS_REMOTE_READ) ? 1 : 0; + attrs.enable_rdma_write = (attr->qp_access_flags & + IB_ACCESS_REMOTE_WRITE) ? 1 : 0; + attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0; + + + mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0; + mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ? 
+ (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0; + + return iwch_modify_qp(rhp, qhp, mask, &attrs, 0); +} + +void iwch_qp_add_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __FUNCTION__, qp); + atomic_inc(&(to_iwch_qp(qp)->refcnt)); +} + +void iwch_qp_rem_ref(struct ib_qp *qp) +{ + PDBG("%s ib_qp %p\n", __FUNCTION__, qp); + if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt))) + wake_up(&(to_iwch_qp(qp)->wait)); +} + +struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn) +{ + PDBG("%s ib_dev %p qpn 0x%x\n", __FUNCTION__, dev, qpn); + return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn); +} + + +static int iwch_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + *pkey = 0; + return 0; +} + +static int iwch_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct iwch_dev *dev; + + PDBG("%s ibdev %p, port %d, index %d, gid %p\n", + __FUNCTION__, ibdev, port, index, gid); + dev = to_iwch_dev(ibdev); + BUG_ON(port == 0 || port > 2); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6); + return 0; +} + +static int iwch_query_device(struct ib_device *ibdev, + struct ib_device_attr *props) +{ + + struct iwch_dev *dev; + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + + dev = to_iwch_dev(ibdev); + memset(props, 0, sizeof *props); + memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + props->device_cap_flags = dev->device_cap_flags; + props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor; + props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device; + props->max_mr_size = ~0ull; + props->max_qp = dev->attr.max_qps; + props->max_qp_wr = dev->attr.max_wrs; + props->max_sge = dev->attr.max_sge_per_wr; + props->max_sge_rd = 1; + props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; + props->max_cq = dev->attr.max_cqs; + props->max_cqe = dev->attr.max_cqes_per_cq; + props->max_mr = dev->attr.max_mem_regs; + props->max_pd = dev->attr.max_pds; + props->local_ca_ack_delay = 0; + + return 0; +} + +static int iwch_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + PDBG("%s ibdev %p\n", __FUNCTION__, ibdev); + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 2; + props->active_speed = 2; + props->max_msg_sz = -1; + + return 0; +} + +static ssize_t show_rev(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type); +} + +static ssize_t show_fw_ver(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.fw_version); +} + +static ssize_t show_hca(struct class_device *cdev, char *buf) 
+{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + struct ethtool_drvinfo info; + struct net_device *lldev = dev->rdev.t3cdev_p->lldev; + + PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev); + lldev->ethtool_ops->get_drvinfo(lldev, &info); + return sprintf(buf, "%s\n", info.driver); +} + +static ssize_t show_board(struct class_device *cdev, char *buf) +{ + struct iwch_dev *dev = container_of(cdev, struct iwch_dev, + ibdev.class_dev); + PDBG("%s class dev 0x%p\n", __FUNCTION__, dev); + return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor, + dev->rdev.rnic_info.pdev->device); +} + +static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct class_device_attribute *iwch_class_attributes[] = { + &class_device_attr_hw_rev, + &class_device_attr_fw_ver, + &class_device_attr_hca_type, + &class_device_attr_board_id +}; + +int iwch_register_device(struct iwch_dev *dev) +{ + int ret; + int i; + + PDBG("%s iwch_dev %p\n", __FUNCTION__, dev); + strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); + dev->ibdev.owner = THIS_MODULE; + dev->device_cap_flags = + (IB_DEVICE_ZERO_STAG | + IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW); + + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + dev->ibdev.node_type = RDMA_NODE_RNIC; + memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC)); + dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; + dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.class_dev.dev = &(dev->rdev.rnic_info.pdev->dev); + dev->ibdev.query_device = iwch_query_device; + dev->ibdev.query_port = iwch_query_port; + dev->ibdev.modify_port = iwch_modify_port; + dev->ibdev.query_pkey = iwch_query_pkey; + dev->ibdev.query_gid = iwch_query_gid; + dev->ibdev.alloc_ucontext = iwch_alloc_ucontext; + dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext; + dev->ibdev.mmap = iwch_mmap; + dev->ibdev.alloc_pd = iwch_allocate_pd; + dev->ibdev.dealloc_pd = iwch_deallocate_pd; + dev->ibdev.create_ah = iwch_ah_create; + dev->ibdev.destroy_ah = iwch_ah_destroy; + dev->ibdev.create_qp = iwch_create_qp; + dev->ibdev.modify_qp = iwch_ib_modify_qp; + dev->ibdev.destroy_qp = iwch_destroy_qp; + dev->ibdev.create_cq = iwch_create_cq; + dev->ibdev.destroy_cq = iwch_destroy_cq; + dev->ibdev.resize_cq = iwch_resize_cq; + dev->ibdev.poll_cq = iwch_poll_cq; + dev->ibdev.get_dma_mr = iwch_get_dma_mr; + dev->ibdev.reg_phys_mr = iwch_register_phys_mem; + dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; + 
dev->ibdev.reg_user_mr = iwch_reg_user_mr; + dev->ibdev.dereg_mr = iwch_dereg_mr; + dev->ibdev.alloc_mw = iwch_alloc_mw; + dev->ibdev.bind_mw = iwch_bind_mw; + dev->ibdev.dealloc_mw = iwch_dealloc_mw; + + dev->ibdev.attach_mcast = iwch_multicast_attach; + dev->ibdev.detach_mcast = iwch_multicast_detach; + dev->ibdev.process_mad = iwch_process_mad; + + dev->ibdev.req_notify_cq = iwch_arm_cq; + dev->ibdev.post_send = iwch_post_send; + dev->ibdev.post_recv = iwch_post_receive; + + + dev->ibdev.iwcm = + (struct iw_cm_verbs *) kmalloc(sizeof(struct iw_cm_verbs), + GFP_KERNEL); + dev->ibdev.iwcm->connect = iwch_connect; + dev->ibdev.iwcm->accept = iwch_accept_cr; + dev->ibdev.iwcm->reject = iwch_reject_cr; + dev->ibdev.iwcm->create_listen = iwch_create_listen; + dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen; + dev->ibdev.iwcm->add_ref = iwch_qp_add_ref; + dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; + dev->ibdev.iwcm->get_qp = iwch_get_qp; + + ret = ib_register_device(&dev->ibdev); + if (ret) + goto bail1; + + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { + ret = class_device_create_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + if (ret) { + goto bail2; + } + } + return 0; +bail2: + ib_unregister_device(&dev->ibdev); +bail1: + return ret; +} + +void iwch_unregister_device(struct iwch_dev *dev) +{ + int i; + + PDBG("%s iwch_dev %p\n", __FUNCTION__, dev); + for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) + class_device_remove_file(&dev->ibdev.class_dev, + iwch_class_attributes[i]); + ib_unregister_device(&dev->ibdev); + return; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h new file mode 100644 index 00000000000..61e3278fd7a --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __IWCH_PROVIDER_H__ +#define __IWCH_PROVIDER_H__ + +#include +#include +#include +#include +#include "t3cdev.h" +#include "iwch.h" +#include "cxio_wr.h" +#include "cxio_hal.h" + +struct iwch_pd { + struct ib_pd ibpd; + u32 pdid; + struct iwch_dev *rhp; +}; + +static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct iwch_pd, ibpd); +} + +struct tpt_attributes { + u32 stag; + u32 state:1; + u32 type:2; + u32 rsvd:1; + enum tpt_mem_perm perms; + u32 remote_invaliate_disable:1; + u32 zbva:1; + u32 mw_bind_enable:1; + u32 page_size:5; + + u32 pdid; + u32 qpid; + u32 pbl_addr; + u32 len; + u64 va_fbo; + u32 pbl_size; +}; + +struct iwch_mr { + struct ib_mr ibmr; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +typedef struct iwch_mw iwch_mw_handle; + +static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct iwch_mr, ibmr); +} + +struct iwch_mw { + struct ib_mw ibmw; + struct iwch_dev *rhp; + u64 kva; + struct tpt_attributes attr; +}; + +static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct iwch_mw, ibmw); +} + +struct iwch_cq { + struct ib_cq ibcq; + struct iwch_dev *rhp; + struct t3_cq cq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + u32 __user *user_rptr_addr; +}; + +static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct iwch_cq, ibcq); +} + +enum IWCH_QP_FLAGS { + QP_QUIESCED = 0x01 +}; + +struct iwch_mpa_attributes { + u8 recv_marker_enabled; + u8 xmit_marker_enabled; /* iWARP: enable inbound Read Resp. */ + u8 crc_enabled; + u8 version; /* 0 or 1 */ +}; + +struct iwch_qp_attributes { + u32 scq; + u32 rcq; + u32 sq_num_entries; + u32 rq_num_entries; + u32 sq_max_sges; + u32 sq_max_sges_rdma_write; + u32 rq_max_sges; + u32 state; + u8 enable_rdma_read; + u8 enable_rdma_write; /* enable inbound Read Resp. */ + u8 enable_bind; + u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */ + /* + * Next QP state. If specify the current state, only the + * QP attributes will be modified. + */ + u32 max_ord; + u32 max_ird; + u32 pd; /* IN */ + u32 next_state; + char terminate_buffer[52]; + u32 terminate_msg_len; + u8 is_terminate_local; + struct iwch_mpa_attributes mpa_attr; /* IN-OUT */ + struct iwch_ep *llp_stream_handle; + char *stream_msg_buf; /* Last stream msg. 
before Idle -> RTS */ + u32 stream_msg_buf_len; /* Only on Idle -> RTS */ +}; + +struct iwch_qp { + struct ib_qp ibqp; + struct iwch_dev *rhp; + struct iwch_ep *ep; + struct iwch_qp_attributes attr; + struct t3_wq wq; + spinlock_t lock; + atomic_t refcnt; + wait_queue_head_t wait; + enum IWCH_QP_FLAGS flags; + struct timer_list timer; +}; + +static inline int qp_quiesced(struct iwch_qp *qhp) +{ + return qhp->flags & QP_QUIESCED; +} + +static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct iwch_qp, ibqp); +} + +void iwch_qp_add_ref(struct ib_qp *qp); +void iwch_qp_rem_ref(struct ib_qp *qp); +struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn); + +struct iwch_ucontext { + struct ib_ucontext ibucontext; + struct cxio_ucontext uctx; + u32 key; + spinlock_t mmap_lock; + struct list_head mmaps; +}; + +static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c) +{ + return container_of(c, struct iwch_ucontext, ibucontext); +} + +struct iwch_mm_entry { + struct list_head entry; + u64 addr; + u32 key; + unsigned len; +}; + +static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext, + u32 key, unsigned len) +{ + struct list_head *pos, *nxt; + struct iwch_mm_entry *mm; + + spin_lock(&ucontext->mmap_lock); + list_for_each_safe(pos, nxt, &ucontext->mmaps) { + + mm = list_entry(pos, struct iwch_mm_entry, entry); + if (mm->key == key && mm->len == len) { + list_del_init(&mm->entry); + spin_unlock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + key, (unsigned long long) mm->addr, mm->len); + return mm; + } + } + spin_unlock(&ucontext->mmap_lock); + return NULL; +} + +static inline void insert_mmap(struct iwch_ucontext *ucontext, + struct iwch_mm_entry *mm) +{ + spin_lock(&ucontext->mmap_lock); + PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__, + mm->key, (unsigned long long) mm->addr, mm->len); + list_add_tail(&mm->entry, &ucontext->mmaps); + spin_unlock(&ucontext->mmap_lock); +} + +enum iwch_qp_attr_mask { + IWCH_QP_ATTR_NEXT_STATE = 1 << 0, + IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7, + IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8, + IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9, + IWCH_QP_ATTR_MAX_ORD = 1 << 11, + IWCH_QP_ATTR_MAX_IRD = 1 << 12, + IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22, + IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23, + IWCH_QP_ATTR_MPA_ATTR = 1 << 24, + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25, + IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ | + IWCH_QP_ATTR_ENABLE_RDMA_WRITE | + IWCH_QP_ATTR_MAX_ORD | + IWCH_QP_ATTR_MAX_IRD | + IWCH_QP_ATTR_LLP_STREAM_HANDLE | + IWCH_QP_ATTR_STREAM_MSG_BUFFER | + IWCH_QP_ATTR_MPA_ATTR | + IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE) +}; + +int iwch_modify_qp(struct iwch_dev *rhp, + struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal); + +enum iwch_qp_state { + IWCH_QP_STATE_IDLE, + IWCH_QP_STATE_RTS, + IWCH_QP_STATE_ERROR, + IWCH_QP_STATE_TERMINATE, + IWCH_QP_STATE_CLOSING, + IWCH_QP_STATE_TOT +}; + +static inline int iwch_convert_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + case IB_QPS_INIT: + return IWCH_QP_STATE_IDLE; + case IB_QPS_RTS: + return IWCH_QP_STATE_RTS; + case IB_QPS_SQD: + return IWCH_QP_STATE_CLOSING; + case IB_QPS_SQE: + return IWCH_QP_STATE_TERMINATE; + case IB_QPS_ERR: + return IWCH_QP_STATE_ERROR; + default: + return -1; + } +} + +enum iwch_mem_perms { + IWCH_MEM_ACCESS_LOCAL_READ = 1 << 0, + IWCH_MEM_ACCESS_LOCAL_WRITE = 1 << 1, + 
IWCH_MEM_ACCESS_REMOTE_READ = 1 << 2, + IWCH_MEM_ACCESS_REMOTE_WRITE = 1 << 3, + IWCH_MEM_ACCESS_ATOMICS = 1 << 4, + IWCH_MEM_ACCESS_BINDING = 1 << 5, + IWCH_MEM_ACCESS_LOCAL = + (IWCH_MEM_ACCESS_LOCAL_READ | IWCH_MEM_ACCESS_LOCAL_WRITE), + IWCH_MEM_ACCESS_REMOTE = + (IWCH_MEM_ACCESS_REMOTE_WRITE | IWCH_MEM_ACCESS_REMOTE_READ) + /* cannot go beyond 1 << 31 */ +} __attribute__ ((packed)); + +static inline u32 iwch_convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? IWCH_MEM_ACCESS_REMOTE_WRITE : 0) + | (acc & IB_ACCESS_REMOTE_READ ? IWCH_MEM_ACCESS_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? IWCH_MEM_ACCESS_LOCAL_WRITE : 0) | + (acc & IB_ACCESS_MW_BIND ? IWCH_MEM_ACCESS_BINDING : 0) | + IWCH_MEM_ACCESS_LOCAL_READ; +} + +enum iwch_mmid_state { + IWCH_STAG_STATE_VALID, + IWCH_STAG_STATE_INVALID +}; + +enum iwch_qp_query_flags { + IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */ + IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */ + IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */ + + /* + * Quiesce QP context; Consumer + * will NOT replay outstanding WR + */ + IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4, + IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8, + IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */ +}; + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +int iwch_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind); +int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); +int iwch_register_device(struct iwch_dev *dev); +void iwch_unregister_device(struct iwch_dev *dev); +int iwch_quiesce_qps(struct iwch_cq *chp); +int iwch_resume_qps(struct iwch_cq *chp); +void stop_read_rep_timer(struct iwch_qp *qhp); +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list); +int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, + int shift, + __be64 *page_list, + int npages); +int build_phys_page_list(struct ib_phys_buf *buffer_list, + int num_phys_buf, + u64 *iova_start, + u64 *total_size, + int *npages, + int *shift, + __be64 **page_list); + + +#define IWCH_NODE_DESC "cxgb3 Chelsio Communications" + +#endif diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c new file mode 100644 index 00000000000..e066727504b --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "iwch_provider.h" +#include "iwch.h" +#include "iwch_cm.h" +#include "cxio_hal.h" + +#define NO_SUPPORT -1 + +static inline int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, + u8 * flit_cnt) +{ + int i; + u32 plen; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->send.rdmaop = T3_SEND_WITH_SE; + else + wqe->send.rdmaop = T3_SEND; + wqe->send.rem_stag = 0; + break; +#if 0 /* Not currently supported */ + case TYPE_SEND_INVALIDATE: + case TYPE_SEND_INVALIDATE_IMMEDIATE: + wqe->send.rdmaop = T3_SEND_WITH_INV; + wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + break; + case TYPE_SEND_SE_INVALIDATE: + wqe->send.rdmaop = T3_SEND_WITH_SE_INV; + wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + break; +#endif + default: + break; + } + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->send.reserved[0] = 0; + wqe->send.reserved[1] = 0; + wqe->send.reserved[2] = 0; + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + plen = 4; + wqe->send.sgl[0].stag = wr->imm_data; + wqe->send.sgl[0].len = __constant_cpu_to_be32(0); + wqe->send.num_sgle = __constant_cpu_to_be32(0); + *flit_cnt = 5; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->send.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->send.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr); + } + wqe->send.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 4 + ((wr->num_sge) << 1); + } + wqe->send.plen = cpu_to_be32(plen); + return 0; +} + +static inline int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + int i; + u32 plen; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + wqe->write.rdmaop = T3_RDMA_WRITE; + wqe->write.reserved[0] = 0; + wqe->write.reserved[1] = 0; + wqe->write.reserved[2] = 0; + wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); + wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); + + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + plen = 4; + wqe->write.sgl[0].stag = wr->imm_data; + wqe->write.sgl[0].len = __constant_cpu_to_be32(0); + wqe->write.num_sgle = __constant_cpu_to_be32(0); + *flit_cnt = 6; + } else { + plen = 0; + for (i = 0; i < wr->num_sge; i++) { + if ((plen + wr->sg_list[i].length) < plen) { + return -EMSGSIZE; + } + plen += wr->sg_list[i].length; + wqe->write.sgl[i].stag = + cpu_to_be32(wr->sg_list[i].lkey); + wqe->write.sgl[i].len = + cpu_to_be32(wr->sg_list[i].length); + wqe->write.sgl[i].to = + cpu_to_be64(wr->sg_list[i].addr); + } + wqe->write.num_sgle = cpu_to_be32(wr->num_sge); + *flit_cnt = 5 + ((wr->num_sge) << 1); + } + wqe->write.plen = cpu_to_be32(plen); + 
return 0; +} + +static inline int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, + u8 *flit_cnt) +{ + if (wr->num_sge > 1) + return -EINVAL; + wqe->read.rdmaop = T3_READ_REQ; + wqe->read.reserved[0] = 0; + wqe->read.reserved[1] = 0; + wqe->read.reserved[2] = 0; + wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey); + wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr); + wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey); + wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length); + wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr); + *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3; + return 0; +} + +/* + * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now. + */ +static inline int iwch_sgl2pbl_map(struct iwch_dev *rhp, + struct ib_sge *sg_list, u32 num_sgle, + u32 * pbl_addr, u8 * page_size) +{ + int i; + struct iwch_mr *mhp; + u32 offset; + for (i = 0; i < num_sgle; i++) { + + mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8); + if (!mhp) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + if (!mhp->attr.state) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + if (mhp->attr.zbva) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EIO; + } + + if (sg_list[i].addr < mhp->attr.va_fbo) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) < + sg_list[i].addr) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + if (sg_list[i].addr + ((u64) sg_list[i].length) > + mhp->attr.va_fbo + ((u64) mhp->attr.len)) { + PDBG("%s %d\n", __FUNCTION__, __LINE__); + return -EINVAL; + } + offset = sg_list[i].addr - mhp->attr.va_fbo; + offset += ((u32) mhp->attr.va_fbo) % + (1UL << (12 + mhp->attr.page_size)); + pbl_addr[i] = ((mhp->attr.pbl_addr - + rhp->rdev.rnic_info.pbl_base) >> 3) + + (offset >> (12 + mhp->attr.page_size)); + page_size[i] = mhp->attr.page_size; + } + return 0; +} + +static inline int iwch_build_rdma_recv(struct iwch_dev *rhp, + union t3_wr *wqe, + struct ib_recv_wr *wr) +{ + int i, err = 0; + u32 pbl_addr[4]; + u8 page_size[4]; + if (wr->num_sge > T3_MAX_SGE) + return -EINVAL; + err = iwch_sgl2pbl_map(rhp, wr->sg_list, wr->num_sge, pbl_addr, + page_size); + if (err) + return err; + wqe->recv.pagesz[0] = page_size[0]; + wqe->recv.pagesz[1] = page_size[1]; + wqe->recv.pagesz[2] = page_size[2]; + wqe->recv.pagesz[3] = page_size[3]; + wqe->recv.num_sgle = cpu_to_be32(wr->num_sge); + for (i = 0; i < wr->num_sge; i++) { + wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey); + wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length); + + /* to in the WQE == the offset into the page */ + wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) % + (1UL << (12 + page_size[i]))); + + /* pbl_addr is the adapters address in the PBL */ + wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]); + } + for (; i < T3_MAX_SGE; i++) { + wqe->recv.sgl[i].stag = 0; + wqe->recv.sgl[i].len = 0; + wqe->recv.sgl[i].to = 0; + wqe->recv.pbl_addr[i] = 0; + } + return 0; +} + +int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + int err = 0; + u8 t3_wr_flit_cnt; + enum t3_wr_opcode t3_wr_opcode = 0; + enum t3_wr_flags t3_wr_flags; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + 
} + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if (num_wrs <= 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + while (wr) { + if (num_wrs == 0) { + err = -ENOMEM; + *bad_wr = wr; + break; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + t3_wr_flags = 0; + if (wr->send_flags & IB_SEND_SOLICITED) + t3_wr_flags |= T3_SOLICITED_EVENT_FLAG; + if (wr->send_flags & IB_SEND_FENCE) + t3_wr_flags |= T3_READ_FENCE_FLAG; + if (wr->send_flags & IB_SEND_SIGNALED) + t3_wr_flags |= T3_COMPLETION_FLAG; + sqp = qhp->wq.sq + + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + t3_wr_opcode = T3_WR_SEND; + err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + t3_wr_opcode = T3_WR_WRITE; + err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt); + break; + case IB_WR_RDMA_READ: + t3_wr_opcode = T3_WR_READ; + t3_wr_flags = 0; /* T3 reads are always signaled */ + err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt); + if (err) + break; + sqp->read_len = wqe->read.local_len; + if (!qhp->wq.oldest_read) + qhp->wq.oldest_read = sqp; + break; + default: + PDBG("%s post of type=%d TBD!\n", __FUNCTION__, + wr->opcode); + err = -EINVAL; + } + if (err) { + *bad_wr = wr; + break; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp->wr_id = wr->wr_id; + sqp->opcode = wr2opcode(t3_wr_opcode); + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED); + + build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, t3_wr_flit_cnt); + PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n", + __FUNCTION__, (unsigned long long) wr->wr_id, idx, + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2), + sqp->opcode); + wr = wr->next; + num_wrs--; + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + } + spin_unlock_irqrestore(&qhp->lock, flag); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct iwch_qp *qhp; + u32 idx; + union t3_wr *wqe; + u32 num_wrs; + unsigned long flag; + + qhp = to_iwch_qp(ibqp); + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr, + qhp->wq.rq_size_log2) - 1; + if (!wr) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + while (wr) { + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + if (num_wrs) + err = iwch_build_rdma_recv(qhp->rhp, wqe, wr); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] = + wr->wr_id; + build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), + 0, sizeof(struct t3_receive_wr) >> 3); + PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x " + "wqe %p \n", __FUNCTION__, (unsigned long long) wr->wr_id, + idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe); + ++(qhp->wq.rq_wptr); + ++(qhp->wq.wptr); + wr = wr->next; + num_wrs--; + } + spin_unlock_irqrestore(&qhp->lock, flag); + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + return err; +} + +int iwch_bind_mw(struct 
ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + struct iwch_dev *rhp; + struct iwch_mw *mhp; + struct iwch_qp *qhp; + union t3_wr *wqe; + u32 pbl_addr; + u8 page_size; + u32 num_wrs; + unsigned long flag; + struct ib_sge sgl; + int err=0; + enum t3_wr_flags t3_wr_flags; + u32 idx; + struct t3_swsq *sqp; + + qhp = to_iwch_qp(qp); + mhp = to_iwch_mw(mw); + rhp = qhp->rhp; + + spin_lock_irqsave(&qhp->lock, flag); + if (qhp->attr.state > IWCH_QP_STATE_RTS) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -EINVAL; + } + num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, + qhp->wq.sq_size_log2); + if ((num_wrs) <= 0) { + spin_unlock_irqrestore(&qhp->lock, flag); + return -ENOMEM; + } + idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); + PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __FUNCTION__, idx, + mw, mw_bind); + wqe = (union t3_wr *) (qhp->wq.queue + idx); + + t3_wr_flags = 0; + if (mw_bind->send_flags & IB_SEND_SIGNALED) + t3_wr_flags = T3_COMPLETION_FLAG; + + sgl.addr = mw_bind->addr; + sgl.lkey = mw_bind->mr->lkey; + sgl.length = mw_bind->length; + wqe->bind.reserved = 0; + wqe->bind.type = T3_VA_BASED_TO; + + /* TBD: check perms */ + wqe->bind.perms = iwch_convert_access(mw_bind->mw_access_flags); + wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey); + wqe->bind.mw_stag = cpu_to_be32(mw->rkey); + wqe->bind.mw_len = cpu_to_be32(mw_bind->length); + wqe->bind.mw_va = cpu_to_be64(mw_bind->addr); + err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); + if (err) { + spin_unlock_irqrestore(&qhp->lock, flag); + return err; + } + wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; + sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); + sqp->wr_id = mw_bind->wr_id; + sqp->opcode = T3_BIND_MW; + sqp->sq_wptr = qhp->wq.sq_wptr; + sqp->complete = 0; + sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); + wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); + wqe->bind.mr_pagesz = page_size; + wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id; + build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, + Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, + sizeof(struct t3_bind_mw_wr) >> 3); + ++(qhp->wq.wptr); + ++(qhp->wq.sq_wptr); + spin_unlock_irqrestore(&qhp->lock, flag); + + ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); + + return err; +} + +static inline void build_term_codes(int t3err, u8 *layer_type, u8 *ecode, + int tagged) +{ + switch (t3err) { + case TPT_ERR_STAG: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_STAG; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_INV_STAG; + } + break; + case TPT_ERR_PDID: + case TPT_ERR_QPID: + case TPT_ERR_ACCESS: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_STAG_NOT_ASSOC; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_STAG_NOT_ASSOC; + } + break; + case TPT_ERR_WRAP: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_TO_WRAP; + break; + case TPT_ERR_BOUND: + if (tagged == 1) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + } else if (tagged == 2) { + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; + *ecode = RDMAP_BASE_BOUNDS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + } + break; + case TPT_ERR_INVALIDATE_SHARED_MR: + case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_CANT_INV_STAG; + break; + case TPT_ERR_ECC: + case TPT_ERR_ECC_PSTAG: 
+ case TPT_ERR_INTERNAL_ERR: + *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_OUT_OF_RQE: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_NOBUF; + break; + case TPT_ERR_PBL_ADDR_BOUND: + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_BASE_BOUNDS; + break; + case TPT_ERR_CRC: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_CRC_ERR; + break; + case TPT_ERR_MARKER: + *layer_type = LAYER_MPA|DDP_LLP; + *ecode = MPA_MARKER_ERR; + break; + case TPT_ERR_PDU_LEN_ERR: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_MSG_TOOBIG; + break; + case TPT_ERR_DDP_VERSION: + if (tagged) { + *layer_type = LAYER_DDP|DDP_TAGGED_ERR; + *ecode = DDPT_INV_VERS; + } else { + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_VERS; + } + break; + case TPT_ERR_RDMA_VERSION: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_VERS; + break; + case TPT_ERR_OPCODE: + *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; + *ecode = RDMAP_INV_OPCODE; + break; + case TPT_ERR_DDP_QUEUE_NUM: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_QN; + break; + case TPT_ERR_MSN: + case TPT_ERR_MSN_GAP: + case TPT_ERR_MSN_RANGE: + case TPT_ERR_IRD_OVERFLOW: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MSN_RANGE; + break; + case TPT_ERR_TBIT: + *layer_type = LAYER_DDP|DDP_LOCAL_CATA; + *ecode = 0; + break; + case TPT_ERR_MO: + *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; + *ecode = DDPU_INV_MO; + break; + default: + *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; + *ecode = 0; + break; + } +} + +/* + * This posts a TERMINATE with layer=RDMA, type=catastrophic. + */ +int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg) +{ + union t3_wr *wqe; + struct terminate_message *term; + int status; + int tagged = 0; + struct sk_buff *skb; + + PDBG("%s %d\n", __FUNCTION__, __LINE__); + skb = alloc_skb(40, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "%s cannot send TERMINATE!\n", __FUNCTION__); + return -ENOMEM; + } + wqe = (union t3_wr *)skb_put(skb, 40); + memset(wqe, 0, 40); + wqe->send.rdmaop = T3_TERMINATE; + + /* immediate data length */ + wqe->send.plen = htonl(4); + + /* immediate data starts here. */ + term = (struct terminate_message *)wqe->send.sgl; + if (rsp_msg) { + status = CQE_STATUS(rsp_msg->cqe); + if (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE) + tagged = 1; + if ((CQE_OPCODE(rsp_msg->cqe) == T3_READ_REQ) || + (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) + tagged = 2; + } else { + status = TPT_ERR_INTERNAL_ERR; + } + build_term_codes(status, &term->layer_etype, &term->ecode, tagged); + build_fw_riwrh((void *)wqe, T3_WR_SEND, + T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 1, + qhp->ep->hwtid, 5); + skb->priority = CPL_PRIORITY_DATA; + return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb); +} + +/* + * Assumes qhp lock is held. + */ +static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag) +{ + struct iwch_cq *rchp, *schp; + int count; + + rchp = get_chp(qhp->rhp, qhp->attr.rcq); + schp = get_chp(qhp->rhp, qhp->attr.scq); + + PDBG("%s qhp %p rchp %p schp %p\n", __FUNCTION__, qhp, rchp, schp); + /* take a ref on the qhp since we must release the lock */ + atomic_inc(&qhp->refcnt); + spin_unlock_irqrestore(&qhp->lock, *flag); + + /* locking heirarchy: cq lock first, then qp lock. 
*/ + spin_lock_irqsave(&rchp->lock, *flag); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&rchp->cq); + cxio_count_rcqes(&rchp->cq, &qhp->wq, &count); + cxio_flush_rq(&qhp->wq, &rchp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, *flag); + + /* locking heirarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&schp->lock, *flag); + spin_lock(&qhp->lock); + cxio_flush_hw_cq(&schp->cq); + cxio_count_scqes(&schp->cq, &qhp->wq, &count); + cxio_flush_sq(&qhp->wq, &schp->cq, count); + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&schp->lock, *flag); + + /* deref */ + if (atomic_dec_and_test(&qhp->refcnt)) + wake_up(&qhp->wait); + + spin_lock_irqsave(&qhp->lock, *flag); +} + +static inline void flush_qp(struct iwch_qp *qhp, unsigned long *flag) +{ + if (t3b_device(qhp->rhp)) + cxio_set_wq_in_error(&qhp->wq); + else + __flush_qp(qhp, flag); +} + + +/* + * Return non zero if at least one RECV was pre-posted. + */ +static inline int rqes_posted(struct iwch_qp *qhp) +{ + return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV; +} + +static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs) +{ + struct t3_rdma_init_attr init_attr; + int ret; + + init_attr.tid = qhp->ep->hwtid; + init_attr.qpid = qhp->wq.qpid; + init_attr.pdid = qhp->attr.pd; + init_attr.scqid = qhp->attr.scq; + init_attr.rcqid = qhp->attr.rcq; + init_attr.rq_addr = qhp->wq.rq_addr; + init_attr.rq_size = 1 << qhp->wq.rq_size_log2; + init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE | + qhp->attr.mpa_attr.recv_marker_enabled | + (qhp->attr.mpa_attr.xmit_marker_enabled << 1) | + (qhp->attr.mpa_attr.crc_enabled << 2); + + /* + * XXX - The IWCM doesn't quite handle getting these + * attrs set before going into RTS. For now, just turn + * them on always... + */ +#if 0 + init_attr.qpcaps = qhp->attr.enableRdmaRead | + (qhp->attr.enableRdmaWrite << 1) | + (qhp->attr.enableBind << 2) | + (qhp->attr.enable_stag0_fastreg << 3) | + (qhp->attr.enable_stag0_fastreg << 4); +#else + init_attr.qpcaps = 0x1f; +#endif + init_attr.tcp_emss = qhp->ep->emss; + init_attr.ord = qhp->attr.max_ord; + init_attr.ird = qhp->attr.max_ird; + init_attr.qp_dma_addr = qhp->wq.dma_addr; + init_attr.qp_dma_size = (1UL << qhp->wq.size_log2); + init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0; + PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d " + "flags 0x%x qpcaps 0x%x\n", __FUNCTION__, + init_attr.rq_addr, init_attr.rq_size, + init_attr.flags, init_attr.qpcaps); + ret = cxio_rdma_init(&rhp->rdev, &init_attr); + PDBG("%s ret %d\n", __FUNCTION__, ret); + return ret; +} + +int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp, + enum iwch_qp_attr_mask mask, + struct iwch_qp_attributes *attrs, + int internal) +{ + int ret = 0; + struct iwch_qp_attributes newattr = qhp->attr; + unsigned long flag; + int disconnect = 0; + int terminate = 0; + int abort = 0; + int free = 0; + struct iwch_ep *ep = NULL; + + PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __FUNCTION__, + qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state, + (mask & IWCH_QP_ATTR_NEXT_STATE) ? 
attrs->next_state : -1); + + spin_lock_irqsave(&qhp->lock, flag); + + /* Process attr changes if in IDLE */ + if (mask & IWCH_QP_ATTR_VALID_MODIFY) { + if (qhp->attr.state != IWCH_QP_STATE_IDLE) { + ret = -EIO; + goto out; + } + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ) + newattr.enable_rdma_read = attrs->enable_rdma_read; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE) + newattr.enable_rdma_write = attrs->enable_rdma_write; + if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND) + newattr.enable_bind = attrs->enable_bind; + if (mask & IWCH_QP_ATTR_MAX_ORD) { + if (attrs->max_ord > + rhp->attr.max_rdma_read_qp_depth) { + ret = -EINVAL; + goto out; + } + newattr.max_ord = attrs->max_ord; + } + if (mask & IWCH_QP_ATTR_MAX_IRD) { + if (attrs->max_ird > + rhp->attr.max_rdma_reads_per_qp) { + ret = -EINVAL; + goto out; + } + newattr.max_ird = attrs->max_ird; + } + qhp->attr = newattr; + } + + if (!(mask & IWCH_QP_ATTR_NEXT_STATE)) + goto out; + if (qhp->attr.state == attrs->next_state) + goto out; + + switch (qhp->attr.state) { + case IWCH_QP_STATE_IDLE: + switch (attrs->next_state) { + case IWCH_QP_STATE_RTS: + if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) { + ret = -EINVAL; + goto out; + } + if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) { + ret = -EINVAL; + goto out; + } + qhp->attr.mpa_attr = attrs->mpa_attr; + qhp->attr.llp_stream_handle = attrs->llp_stream_handle; + qhp->ep = qhp->attr.llp_stream_handle; + qhp->attr.state = IWCH_QP_STATE_RTS; + + /* + * Ref the endpoint here and deref when we + * disassociate the endpoint from the QP. This + * happens in CLOSING->IDLE transition or *->ERROR + * transition. + */ + get_ep(&qhp->ep->com); + spin_unlock_irqrestore(&qhp->lock, flag); + ret = rdma_init(rhp, qhp, mask, attrs); + spin_lock_irqsave(&qhp->lock, flag); + if (ret) + goto err; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + flush_qp(qhp, &flag); + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_RTS: + switch (attrs->next_state) { + case IWCH_QP_STATE_CLOSING: + BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2); + qhp->attr.state = IWCH_QP_STATE_CLOSING; + if (!internal) { + abort=0; + disconnect = 1; + ep = qhp->ep; + } + break; + case IWCH_QP_STATE_TERMINATE: + qhp->attr.state = IWCH_QP_STATE_TERMINATE; + if (!internal) + terminate = 1; + break; + case IWCH_QP_STATE_ERROR: + qhp->attr.state = IWCH_QP_STATE_ERROR; + if (!internal) { + abort=1; + disconnect = 1; + ep = qhp->ep; + } + goto err; + break; + default: + ret = -EINVAL; + goto out; + } + break; + case IWCH_QP_STATE_CLOSING: + if (!internal) { + ret = -EINVAL; + goto out; + } + switch (attrs->next_state) { + case IWCH_QP_STATE_IDLE: + qhp->attr.state = IWCH_QP_STATE_IDLE; + qhp->attr.llp_stream_handle = NULL; + put_ep(&qhp->ep->com); + qhp->ep = NULL; + wake_up(&qhp->wait); + break; + case IWCH_QP_STATE_ERROR: + goto err; + default: + ret = -EINVAL; + goto err; + } + break; + case IWCH_QP_STATE_ERROR: + if (attrs->next_state != IWCH_QP_STATE_IDLE) { + ret = -EINVAL; + goto out; + } + + if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) || + !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) { + ret = -EINVAL; + goto out; + } + qhp->attr.state = IWCH_QP_STATE_IDLE; + memset(&qhp->attr, 0, sizeof(qhp->attr)); + break; + case IWCH_QP_STATE_TERMINATE: + if (!internal) { + ret = -EINVAL; + goto out; + } + goto err; + break; + default: + printk(KERN_ERR "%s in a bad state %d\n", + __FUNCTION__, qhp->attr.state); + ret = -EINVAL; + goto err; + break; + } + goto out; +err: + PDBG("%s disassociating ep 
%p qpid 0x%x\n", __FUNCTION__, qhp->ep, + qhp->wq.qpid); + + /* disassociate the LLP connection */ + qhp->attr.llp_stream_handle = NULL; + ep = qhp->ep; + qhp->ep = NULL; + qhp->attr.state = IWCH_QP_STATE_ERROR; + free=1; + wake_up(&qhp->wait); + BUG_ON(!ep); + flush_qp(qhp, &flag); +out: + spin_unlock_irqrestore(&qhp->lock, flag); + + if (terminate) + iwch_post_terminate(qhp, NULL); + + /* + * If disconnect is 1, then we need to initiate a disconnect + * on the EP. This can be a normal close (RTS->CLOSING) or + * an abnormal close (RTS/CLOSING->ERROR). + */ + if (disconnect) + iwch_ep_disconnect(ep, abort, GFP_KERNEL); + + /* + * If free is 1, then we've disassociated the EP from the QP + * and we need to dereference the EP. + */ + if (free) + put_ep(&ep->com); + + PDBG("%s exit state %d\n", __FUNCTION__, qhp->attr.state); + return ret; +} + +static int quiesce_qp(struct iwch_qp *qhp) +{ + spin_lock_irq(&qhp->lock); + iwch_quiesce_tid(qhp->ep); + qhp->flags |= QP_QUIESCED; + spin_unlock_irq(&qhp->lock); + return 0; +} + +static int resume_qp(struct iwch_qp *qhp) +{ + spin_lock_irq(&qhp->lock); + iwch_resume_tid(qhp->ep); + qhp->flags &= ~QP_QUIESCED; + spin_unlock_irq(&qhp->lock); + return 0; +} + +int iwch_quiesce_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) { + quiesce_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp)) + quiesce_qp(qhp); + } + return 0; +} + +int iwch_resume_qps(struct iwch_cq *chp) +{ + int i; + struct iwch_qp *qhp; + + for (i=0; i < T3_MAX_NUM_QP; i++) { + qhp = get_qhp(chp->rhp, i); + if (!qhp) + continue; + if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) { + resume_qp(qhp); + continue; + } + if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp)) + resume_qp(qhp); + } + return 0; +} diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h new file mode 100644 index 00000000000..c4e7fbea8bb --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/iwch_user.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2006 Chelsio, Inc. All rights reserved. + * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __IWCH_USER_H__ +#define __IWCH_USER_H__ + +#define IWCH_UVERBS_ABI_VERSION 1 + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ +struct iwch_create_cq_req { + __u64 user_rptr_addr; +}; + +struct iwch_create_cq_resp { + __u64 key; + __u32 cqid; + __u32 size_log2; +}; + +struct iwch_create_qp_resp { + __u64 key; + __u64 db_key; + __u32 qpid; + __u32 size_log2; + __u32 sq_size_log2; + __u32 rq_size_log2; +}; + +struct iwch_reg_user_mr_resp { + __u32 pbl_addr; +}; +#endif diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h new file mode 100644 index 00000000000..c702dc199e1 --- /dev/null +++ b/drivers/infiniband/hw/cxgb3/tcb.h @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2007 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _TCB_DEFS_H +#define _TCB_DEFS_H + +#define W_TCB_T_STATE 0 +#define S_TCB_T_STATE 0 +#define M_TCB_T_STATE 0xfULL +#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE) + +#define W_TCB_TIMER 0 +#define S_TCB_TIMER 4 +#define M_TCB_TIMER 0x1ULL +#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER) + +#define W_TCB_DACK_TIMER 0 +#define S_TCB_DACK_TIMER 5 +#define M_TCB_DACK_TIMER 0x1ULL +#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER) + +#define W_TCB_DEL_FLAG 0 +#define S_TCB_DEL_FLAG 6 +#define M_TCB_DEL_FLAG 0x1ULL +#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG) + +#define W_TCB_L2T_IX 0 +#define S_TCB_L2T_IX 7 +#define M_TCB_L2T_IX 0x7ffULL +#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX) + +#define W_TCB_SMAC_SEL 0 +#define S_TCB_SMAC_SEL 18 +#define M_TCB_SMAC_SEL 0x3ULL +#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL) + +#define W_TCB_TOS 0 +#define S_TCB_TOS 20 +#define M_TCB_TOS 0x3fULL +#define V_TCB_TOS(x) ((x) << S_TCB_TOS) + +#define W_TCB_MAX_RT 0 +#define S_TCB_MAX_RT 26 +#define M_TCB_MAX_RT 0xfULL +#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT) + +#define W_TCB_T_RXTSHIFT 0 +#define S_TCB_T_RXTSHIFT 30 +#define M_TCB_T_RXTSHIFT 0xfULL +#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT) + +#define W_TCB_T_DUPACKS 1 +#define S_TCB_T_DUPACKS 2 +#define M_TCB_T_DUPACKS 0xfULL +#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS) + +#define W_TCB_T_MAXSEG 1 +#define S_TCB_T_MAXSEG 6 +#define M_TCB_T_MAXSEG 0xfULL +#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG) + +#define W_TCB_T_FLAGS1 1 +#define S_TCB_T_FLAGS1 10 +#define M_TCB_T_FLAGS1 0xffffffffULL +#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1) + +#define W_TCB_T_MIGRATION 1 +#define S_TCB_T_MIGRATION 20 +#define M_TCB_T_MIGRATION 0x1ULL +#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION) + +#define W_TCB_T_FLAGS2 2 +#define S_TCB_T_FLAGS2 10 +#define M_TCB_T_FLAGS2 0x7fULL +#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2) + +#define W_TCB_SND_SCALE 2 +#define S_TCB_SND_SCALE 17 +#define M_TCB_SND_SCALE 0xfULL +#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE) + +#define W_TCB_RCV_SCALE 2 +#define S_TCB_RCV_SCALE 21 +#define M_TCB_RCV_SCALE 0xfULL +#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE) + +#define W_TCB_SND_UNA_RAW 2 +#define S_TCB_SND_UNA_RAW 25 +#define M_TCB_SND_UNA_RAW 0x7ffffffULL +#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW) + +#define W_TCB_SND_NXT_RAW 3 +#define S_TCB_SND_NXT_RAW 20 +#define M_TCB_SND_NXT_RAW 0x7ffffffULL +#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW) + +#define W_TCB_RCV_NXT 4 +#define S_TCB_RCV_NXT 15 +#define M_TCB_RCV_NXT 0xffffffffULL +#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT) + +#define W_TCB_RCV_ADV 5 +#define S_TCB_RCV_ADV 15 +#define M_TCB_RCV_ADV 0xffffULL +#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV) + +#define W_TCB_SND_MAX_RAW 5 +#define S_TCB_SND_MAX_RAW 31 +#define M_TCB_SND_MAX_RAW 0x7ffffffULL +#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW) + +#define W_TCB_SND_CWND 6 +#define S_TCB_SND_CWND 26 +#define M_TCB_SND_CWND 0x7ffffffULL +#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND) + +#define W_TCB_SND_SSTHRESH 7 +#define S_TCB_SND_SSTHRESH 21 +#define M_TCB_SND_SSTHRESH 0x7ffffffULL +#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH) + +#define W_TCB_T_RTT_TS_RECENT_AGE 8 +#define S_TCB_T_RTT_TS_RECENT_AGE 16 +#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL +#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE) + +#define W_TCB_T_RTSEQ_RECENT 9 +#define S_TCB_T_RTSEQ_RECENT 16 
+#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL +#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT) + +#define W_TCB_T_SRTT 10 +#define S_TCB_T_SRTT 16 +#define M_TCB_T_SRTT 0xffffULL +#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT) + +#define W_TCB_T_RTTVAR 11 +#define S_TCB_T_RTTVAR 0 +#define M_TCB_T_RTTVAR 0xffffULL +#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR) + +#define W_TCB_TS_LAST_ACK_SENT_RAW 11 +#define S_TCB_TS_LAST_ACK_SENT_RAW 16 +#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL +#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW) + +#define W_TCB_DIP 12 +#define S_TCB_DIP 11 +#define M_TCB_DIP 0xffffffffULL +#define V_TCB_DIP(x) ((x) << S_TCB_DIP) + +#define W_TCB_SIP 13 +#define S_TCB_SIP 11 +#define M_TCB_SIP 0xffffffffULL +#define V_TCB_SIP(x) ((x) << S_TCB_SIP) + +#define W_TCB_DP 14 +#define S_TCB_DP 11 +#define M_TCB_DP 0xffffULL +#define V_TCB_DP(x) ((x) << S_TCB_DP) + +#define W_TCB_SP 14 +#define S_TCB_SP 27 +#define M_TCB_SP 0xffffULL +#define V_TCB_SP(x) ((x) << S_TCB_SP) + +#define W_TCB_TIMESTAMP 15 +#define S_TCB_TIMESTAMP 11 +#define M_TCB_TIMESTAMP 0xffffffffULL +#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP) + +#define W_TCB_TIMESTAMP_OFFSET 16 +#define S_TCB_TIMESTAMP_OFFSET 11 +#define M_TCB_TIMESTAMP_OFFSET 0xfULL +#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET) + +#define W_TCB_TX_MAX 16 +#define S_TCB_TX_MAX 15 +#define M_TCB_TX_MAX 0xffffffffULL +#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX) + +#define W_TCB_TX_HDR_PTR_RAW 17 +#define S_TCB_TX_HDR_PTR_RAW 15 +#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL +#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW) + +#define W_TCB_TX_LAST_PTR_RAW 18 +#define S_TCB_TX_LAST_PTR_RAW 0 +#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL +#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW) + +#define W_TCB_TX_COMPACT 18 +#define S_TCB_TX_COMPACT 17 +#define M_TCB_TX_COMPACT 0x1ULL +#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT) + +#define W_TCB_RX_COMPACT 18 +#define S_TCB_RX_COMPACT 18 +#define M_TCB_RX_COMPACT 0x1ULL +#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT) + +#define W_TCB_RCV_WND 18 +#define S_TCB_RCV_WND 19 +#define M_TCB_RCV_WND 0x7ffffffULL +#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND) + +#define W_TCB_RX_HDR_OFFSET 19 +#define S_TCB_RX_HDR_OFFSET 14 +#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL +#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET) + +#define W_TCB_RX_FRAG0_START_IDX_RAW 20 +#define S_TCB_RX_FRAG0_START_IDX_RAW 9 +#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW) + +#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21 +#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4 +#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL +#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET) + +#define W_TCB_RX_FRAG0_LEN 21 +#define S_TCB_RX_FRAG0_LEN 31 +#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN) + +#define W_TCB_RX_FRAG1_LEN 22 +#define S_TCB_RX_FRAG1_LEN 26 +#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL +#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN) + +#define W_TCB_NEWRENO_RECOVER 23 +#define S_TCB_NEWRENO_RECOVER 21 +#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL +#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER) + +#define W_TCB_PDU_HAVE_LEN 24 +#define S_TCB_PDU_HAVE_LEN 16 +#define M_TCB_PDU_HAVE_LEN 0x1ULL +#define V_TCB_PDU_HAVE_LEN(x) ((x) << 
S_TCB_PDU_HAVE_LEN) + +#define W_TCB_PDU_LEN 24 +#define S_TCB_PDU_LEN 17 +#define M_TCB_PDU_LEN 0xffffULL +#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN) + +#define W_TCB_RX_QUIESCE 25 +#define S_TCB_RX_QUIESCE 1 +#define M_TCB_RX_QUIESCE 0x1ULL +#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE) + +#define W_TCB_RX_PTR_RAW 25 +#define S_TCB_RX_PTR_RAW 2 +#define M_TCB_RX_PTR_RAW 0x1ffffULL +#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW) + +#define W_TCB_CPU_NO 25 +#define S_TCB_CPU_NO 19 +#define M_TCB_CPU_NO 0x7fULL +#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO) + +#define W_TCB_ULP_TYPE 25 +#define S_TCB_ULP_TYPE 26 +#define M_TCB_ULP_TYPE 0xfULL +#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE) + +#define W_TCB_RX_FRAG1_PTR_RAW 25 +#define S_TCB_RX_FRAG1_PTR_RAW 30 +#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW) + +#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26 +#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15 +#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW) + +#define W_TCB_RX_FRAG2_PTR_RAW 27 +#define S_TCB_RX_FRAG2_PTR_RAW 10 +#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW) + +#define W_TCB_RX_FRAG2_LEN_RAW 27 +#define S_TCB_RX_FRAG2_LEN_RAW 27 +#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW) + +#define W_TCB_RX_FRAG3_PTR_RAW 28 +#define S_TCB_RX_FRAG3_PTR_RAW 22 +#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL +#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW) + +#define W_TCB_RX_FRAG3_LEN_RAW 29 +#define S_TCB_RX_FRAG3_LEN_RAW 7 +#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW) + +#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30 +#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2 +#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL +#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW) + +#define W_TCB_PDU_HDR_LEN 30 +#define S_TCB_PDU_HDR_LEN 29 +#define M_TCB_PDU_HDR_LEN 0xffULL +#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN) + +#define W_TCB_SLUSH1 31 +#define S_TCB_SLUSH1 5 +#define M_TCB_SLUSH1 0x7ffffULL +#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1) + +#define W_TCB_ULP_RAW 31 +#define S_TCB_ULP_RAW 24 +#define M_TCB_ULP_RAW 0xffULL +#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW) + +#define W_TCB_DDP_RDMAP_VERSION 25 +#define S_TCB_DDP_RDMAP_VERSION 30 +#define M_TCB_DDP_RDMAP_VERSION 0x1ULL +#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION) + +#define W_TCB_MARKER_ENABLE_RX 25 +#define S_TCB_MARKER_ENABLE_RX 31 +#define M_TCB_MARKER_ENABLE_RX 0x1ULL +#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX) + +#define W_TCB_MARKER_ENABLE_TX 26 +#define S_TCB_MARKER_ENABLE_TX 0 +#define M_TCB_MARKER_ENABLE_TX 0x1ULL +#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX) + +#define W_TCB_CRC_ENABLE 26 +#define S_TCB_CRC_ENABLE 1 +#define M_TCB_CRC_ENABLE 0x1ULL +#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE) + +#define W_TCB_IRS_ULP 26 +#define S_TCB_IRS_ULP 2 +#define M_TCB_IRS_ULP 0x1ffULL +#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP) + +#define W_TCB_ISS_ULP 26 +#define S_TCB_ISS_ULP 11 +#define M_TCB_ISS_ULP 0x1ffULL +#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP) + +#define W_TCB_TX_PDU_LEN 26 +#define 
S_TCB_TX_PDU_LEN 20 +#define M_TCB_TX_PDU_LEN 0x3fffULL +#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN) + +#define W_TCB_TX_PDU_OUT 27 +#define S_TCB_TX_PDU_OUT 2 +#define M_TCB_TX_PDU_OUT 0x1ULL +#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT) + +#define W_TCB_CQ_IDX_SQ 27 +#define S_TCB_CQ_IDX_SQ 3 +#define M_TCB_CQ_IDX_SQ 0xffffULL +#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ) + +#define W_TCB_CQ_IDX_RQ 27 +#define S_TCB_CQ_IDX_RQ 19 +#define M_TCB_CQ_IDX_RQ 0xffffULL +#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ) + +#define W_TCB_QP_ID 28 +#define S_TCB_QP_ID 3 +#define M_TCB_QP_ID 0xffffULL +#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID) + +#define W_TCB_PD_ID 28 +#define S_TCB_PD_ID 19 +#define M_TCB_PD_ID 0xffffULL +#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID) + +#define W_TCB_STAG 29 +#define S_TCB_STAG 3 +#define M_TCB_STAG 0xffffffffULL +#define V_TCB_STAG(x) ((x) << S_TCB_STAG) + +#define W_TCB_RQ_START 30 +#define S_TCB_RQ_START 3 +#define M_TCB_RQ_START 0x3ffffffULL +#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START) + +#define W_TCB_RQ_MSN 30 +#define S_TCB_RQ_MSN 29 +#define M_TCB_RQ_MSN 0x3ffULL +#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN) + +#define W_TCB_RQ_MAX_OFFSET 31 +#define S_TCB_RQ_MAX_OFFSET 7 +#define M_TCB_RQ_MAX_OFFSET 0xfULL +#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET) + +#define W_TCB_RQ_WRITE_PTR 31 +#define S_TCB_RQ_WRITE_PTR 11 +#define M_TCB_RQ_WRITE_PTR 0x3ffULL +#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR) + +#define W_TCB_INB_WRITE_PERM 31 +#define S_TCB_INB_WRITE_PERM 21 +#define M_TCB_INB_WRITE_PERM 0x1ULL +#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM) + +#define W_TCB_INB_READ_PERM 31 +#define S_TCB_INB_READ_PERM 22 +#define M_TCB_INB_READ_PERM 0x1ULL +#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM) + +#define W_TCB_ORD_L_BIT_VLD 31 +#define S_TCB_ORD_L_BIT_VLD 23 +#define M_TCB_ORD_L_BIT_VLD 0x1ULL +#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD) + +#define W_TCB_RDMAP_OPCODE 31 +#define S_TCB_RDMAP_OPCODE 24 +#define M_TCB_RDMAP_OPCODE 0xfULL +#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE) + +#define W_TCB_TX_FLUSH 31 +#define S_TCB_TX_FLUSH 28 +#define M_TCB_TX_FLUSH 0x1ULL +#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH) + +#define W_TCB_TX_OOS_RXMT 31 +#define S_TCB_TX_OOS_RXMT 29 +#define M_TCB_TX_OOS_RXMT 0x1ULL +#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT) + +#define W_TCB_TX_OOS_TXMT 31 +#define S_TCB_TX_OOS_TXMT 30 +#define M_TCB_TX_OOS_TXMT 0x1ULL +#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT) + +#define W_TCB_SLUSH_AUX2 31 +#define S_TCB_SLUSH_AUX2 31 +#define M_TCB_SLUSH_AUX2 0x1ULL +#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2) + +#define W_TCB_RX_FRAG1_PTR_RAW2 25 +#define S_TCB_RX_FRAG1_PTR_RAW2 30 +#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL +#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2) + +#define W_TCB_RX_DDP_FLAGS 26 +#define S_TCB_RX_DDP_FLAGS 15 +#define M_TCB_RX_DDP_FLAGS 0x3ffULL +#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS) + +#define W_TCB_SLUSH_AUX3 26 +#define S_TCB_SLUSH_AUX3 31 +#define M_TCB_SLUSH_AUX3 0x1ffULL +#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3) + +#define W_TCB_RX_DDP_BUF0_OFFSET 27 +#define S_TCB_RX_DDP_BUF0_OFFSET 8 +#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET) + +#define W_TCB_RX_DDP_BUF0_LEN 27 +#define S_TCB_RX_DDP_BUF0_LEN 30 +#define M_TCB_RX_DDP_BUF0_LEN 
0x3fffffULL +#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN) + +#define W_TCB_RX_DDP_BUF1_OFFSET 28 +#define S_TCB_RX_DDP_BUF1_OFFSET 20 +#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET) + +#define W_TCB_RX_DDP_BUF1_LEN 29 +#define S_TCB_RX_DDP_BUF1_LEN 10 +#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL +#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN) + +#define W_TCB_RX_DDP_BUF0_TAG 30 +#define S_TCB_RX_DDP_BUF0_TAG 0 +#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG) + +#define W_TCB_RX_DDP_BUF1_TAG 31 +#define S_TCB_RX_DDP_BUF1_TAG 0 +#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL +#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG) + +#define S_TF_DACK 10 +#define V_TF_DACK(x) ((x) << S_TF_DACK) + +#define S_TF_NAGLE 11 +#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE) + +#define S_TF_RECV_SCALE 12 +#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE) + +#define S_TF_RECV_TSTMP 13 +#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP) + +#define S_TF_RECV_SACK 14 +#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK) + +#define S_TF_TURBO 15 +#define V_TF_TURBO(x) ((x) << S_TF_TURBO) + +#define S_TF_KEEPALIVE 16 +#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE) + +#define S_TF_TCAM_BYPASS 17 +#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS) + +#define S_TF_CORE_FIN 18 +#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN) + +#define S_TF_CORE_MORE 19 +#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE) + +#define S_TF_MIGRATING 20 +#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING) + +#define S_TF_ACTIVE_OPEN 21 +#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN) + +#define S_TF_ASK_MODE 22 +#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE) + +#define S_TF_NON_OFFLOAD 23 +#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD) + +#define S_TF_MOD_SCHD 24 +#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD) + +#define S_TF_MOD_SCHD_REASON0 25 +#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0) + +#define S_TF_MOD_SCHD_REASON1 26 +#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1) + +#define S_TF_MOD_SCHD_RX 27 +#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX) + +#define S_TF_CORE_PUSH 28 +#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH) + +#define S_TF_RCV_COALESCE_ENABLE 29 +#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE) + +#define S_TF_RCV_COALESCE_PUSH 30 +#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH) + +#define S_TF_RCV_COALESCE_LAST_PSH 31 +#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH) + +#define S_TF_RCV_COALESCE_HEARTBEAT 32 +#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT) + +#define S_TF_HALF_CLOSE 33 +#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE) + +#define S_TF_DACK_MSS 34 +#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS) + +#define S_TF_CCTRL_SEL0 35 +#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0) + +#define S_TF_CCTRL_SEL1 36 +#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1) + +#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37 +#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY) + +#define S_TF_TX_PACE_AUTO 38 +#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO) + +#define S_TF_PEER_FIN_HELD 39 +#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD) + +#define S_TF_CORE_URG 40 +#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG) + +#define S_TF_RDMA_ERROR 41 +#define 
V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR) + +#define S_TF_SSWS_DISABLED 42 +#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED) + +#define S_TF_DUPACK_COUNT_ODD 43 +#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD) + +#define S_TF_TX_CHANNEL 44 +#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL) + +#define S_TF_RX_CHANNEL 45 +#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL) + +#define S_TF_TX_PACE_FIXED 46 +#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED) + +#define S_TF_RDMA_FLM_ERROR 47 +#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR) + +#define S_TF_RX_FLOW_CONTROL_DISABLE 48 +#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE) + +#endif /* _TCB_DEFS_H */ -- cgit v1.2.3-70-g09d2 From c7d204e8fdf02f88d91707213f473805bcfb977b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 10 Feb 2007 23:17:26 +0200 Subject: IB/mthca: Fix reserved MTTs calculation on mem-free HCAs The reserved_mtts field has different meaning in Tavor and Arbel, so we are wasting mtt entries on memfree. Fix the Arbel case to match Tavor semantics. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_cmd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 968d1519761..71314460b11 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1051,7 +1051,11 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev, MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET); dev_lim->max_eqs = 1 << (field & 0x7); MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET); - dev_lim->reserved_mtts = 1 << (field >> 4); + if (mthca_is_memfree(dev)) + dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64), + MTHCA_MTT_SEG_SIZE) / MTHCA_MTT_SEG_SIZE; + else + dev_lim->reserved_mtts = 1 << (field >> 4); MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET); dev_lim->max_mrw_sz = 1 << field; MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET); -- cgit v1.2.3-70-g09d2 From 1d1f19cfce7687b557cebdc41bf8a5eeba8a9882 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 10 Feb 2007 23:17:26 +0200 Subject: IB/mthca: Give reserved MTTs a separate cache line MTTs are allocated in non-cache-coherent memory, so we must give reserved MTTs their own cache line, to prevent both device and CPU from writing into the same cache line at the same time. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 44bc6cc734a..9a9dd32885a 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -464,6 +464,10 @@ static int mthca_init_icm(struct mthca_dev *mdev, goto err_unmap_aux; } + /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */ + mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * MTHCA_MTT_SEG_SIZE, + dma_get_cache_alignment()) / MTHCA_MTT_SEG_SIZE; + mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, MTHCA_MTT_SEG_SIZE, mdev->limits.num_mtt_segs, -- cgit v1.2.3-70-g09d2 From 391e4dea7189eef32b0c2d121e7e047110c1b83c Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Sat, 10 Feb 2007 23:15:08 +0200 Subject: IB/mthca: Fix access to MTT and MPT tables on non-cache-coherent CPUs We allocate the MTT table with alloc_pages() and then do pci_map_sg(), so we must call pci_dma_sync_sg() after the CPU writes to the MTT table. This works since the device will never write MTTs on mem-free HCAs, once we get rid of the use of the WRITE_MTT firmware command. This change is needed to make that work, and is an improvement for now, since it gives FMRs a chance at working. For MPTs, both the device and CPU might write there, so we must allocate DMA coherent memory for these. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_main.c | 36 ++++---- drivers/infiniband/hw/mthca/mthca_memfree.c | 127 +++++++++++++++++++++------ drivers/infiniband/hw/mthca/mthca_memfree.h | 9 +- drivers/infiniband/hw/mthca/mthca_mr.c | 8 +- drivers/infiniband/hw/mthca/mthca_provider.h | 1 + 5 files changed, 131 insertions(+), 50 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 9a9dd32885a..0d9b7d06bbc 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -379,7 +379,7 @@ static int mthca_load_fw(struct mthca_dev *mdev) mdev->fw.arbel.fw_icm = mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.fw_icm) { mthca_err(mdev, "Couldn't allocate FW area, aborting.\n"); return -ENOMEM; @@ -412,7 +412,7 @@ err_unmap_fa: mthca_UNMAP_FA(mdev, &status); err_free: - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); return err; } @@ -441,7 +441,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, (unsigned long long) aux_pages << 2); mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages, - GFP_HIGHUSER | __GFP_NOWARN); + GFP_HIGHUSER | __GFP_NOWARN, 0); if (!mdev->fw.arbel.aux_icm) { mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n"); return -ENOMEM; @@ -471,7 +471,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base, MTHCA_MTT_SEG_SIZE, mdev->limits.num_mtt_segs, - mdev->limits.reserved_mtts, 1); + mdev->limits.reserved_mtts, + 1, 0); if (!mdev->mr_table.mtt_table) { mthca_err(mdev, "Failed to map MTT context memory, aborting.\n"); err = -ENOMEM; @@ -481,7 +482,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base, dev_lim->mpt_entry_sz, mdev->limits.num_mpts, - mdev->limits.reserved_mrws, 1); + mdev->limits.reserved_mrws, + 1, 1); if (!mdev->mr_table.mpt_table) { mthca_err(mdev, "Failed to map MPT context memory, aborting.\n"); err = -ENOMEM; @@ -491,7 +493,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base, dev_lim->qpc_entry_sz, mdev->limits.num_qps, - mdev->limits.reserved_qps, 0); + mdev->limits.reserved_qps, + 0, 0); if (!mdev->qp_table.qp_table) { mthca_err(mdev, "Failed to map QP context memory, aborting.\n"); err = -ENOMEM; @@ -501,7 +504,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base, dev_lim->eqpc_entry_sz, mdev->limits.num_qps, - mdev->limits.reserved_qps, 0); + mdev->limits.reserved_qps, + 0, 0); if (!mdev->qp_table.eqp_table) { 
mthca_err(mdev, "Failed to map EQP context memory, aborting.\n"); err = -ENOMEM; @@ -511,7 +515,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base, MTHCA_RDB_ENTRY_SIZE, mdev->limits.num_qps << - mdev->qp_table.rdb_shift, + mdev->qp_table.rdb_shift, 0, 0, 0); if (!mdev->qp_table.rdb_table) { mthca_err(mdev, "Failed to map RDB context memory, aborting\n"); @@ -522,7 +526,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base, dev_lim->cqc_entry_sz, mdev->limits.num_cqs, - mdev->limits.reserved_cqs, 0); + mdev->limits.reserved_cqs, + 0, 0); if (!mdev->cq_table.table) { mthca_err(mdev, "Failed to map CQ context memory, aborting.\n"); err = -ENOMEM; @@ -534,7 +539,8 @@ static int mthca_init_icm(struct mthca_dev *mdev, mthca_alloc_icm_table(mdev, init_hca->srqc_base, dev_lim->srq_entry_sz, mdev->limits.num_srqs, - mdev->limits.reserved_srqs, 0); + mdev->limits.reserved_srqs, + 0, 0); if (!mdev->srq_table.table) { mthca_err(mdev, "Failed to map SRQ context memory, " "aborting.\n"); @@ -554,7 +560,7 @@ static int mthca_init_icm(struct mthca_dev *mdev, mdev->limits.num_amgms, mdev->limits.num_mgms + mdev->limits.num_amgms, - 0); + 0, 0); if (!mdev->mcg_table.table) { mthca_err(mdev, "Failed to map MCG context memory, aborting.\n"); err = -ENOMEM; @@ -592,7 +598,7 @@ err_unmap_aux: mthca_UNMAP_ICM_AUX(mdev, &status); err_free_aux: - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); return err; } @@ -613,7 +619,7 @@ static void mthca_free_icms(struct mthca_dev *mdev) mthca_unmap_eq_icm(mdev); mthca_UNMAP_ICM_AUX(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.aux_icm); + mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0); } static int mthca_init_arbel(struct mthca_dev *mdev) @@ -697,7 +703,7 @@ err_free_icm: err_stop_fw: mthca_UNMAP_FA(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); err_disable: if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) @@ -716,7 +722,7 @@ static void mthca_close_hca(struct mthca_dev *mdev) mthca_free_icms(mdev); mthca_UNMAP_FA(mdev, &status); - mthca_free_icm(mdev, mdev->fw.arbel.fw_icm); + mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0); if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM)) mthca_DISABLE_LAM(mdev, &status); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 6b19645d946..0b9d053a599 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -35,6 +35,9 @@ */ #include +#include + +#include #include "mthca_memfree.h" #include "mthca_dev.h" @@ -58,22 +61,42 @@ struct mthca_user_db_table { } page[0]; }; -void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) +static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) +{ + int i; + + if (chunk->nsg > 0) + pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + PCI_DMA_BIDIRECTIONAL); + + for (i = 0; i < chunk->npages; ++i) + __free_pages(chunk->mem[i].page, + get_order(chunk->mem[i].length)); +} + +static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk) { - struct mthca_icm_chunk *chunk, *tmp; int i; + for (i = 0; i < chunk->npages; ++i) { + dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + lowmem_page_address(chunk->mem[i].page), + sg_dma_address(&chunk->mem[i])); + } +} + +void 
mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent) +{ + struct mthca_icm_chunk *chunk, *tmp; + if (!icm) return; list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { - if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, - PCI_DMA_BIDIRECTIONAL); - - for (i = 0; i < chunk->npages; ++i) - __free_pages(chunk->mem[i].page, - get_order(chunk->mem[i].length)); + if (coherent) + mthca_free_icm_coherent(dev, chunk); + else + mthca_free_icm_pages(dev, chunk); kfree(chunk); } @@ -81,12 +104,41 @@ void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm) kfree(icm); } +static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask) +{ + mem->page = alloc_pages(gfp_mask, order); + if (!mem->page) + return -ENOMEM; + + mem->length = PAGE_SIZE << order; + mem->offset = 0; + return 0; +} + +static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem, + int order, gfp_t gfp_mask) +{ + void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem), + gfp_mask); + if (!buf) + return -ENOMEM; + + sg_set_buf(mem, buf, PAGE_SIZE << order); + BUG_ON(mem->offset); + sg_dma_len(mem) = PAGE_SIZE << order; + return 0; +} + struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - gfp_t gfp_mask) + gfp_t gfp_mask, int coherent) { struct mthca_icm *icm; struct mthca_icm_chunk *chunk = NULL; int cur_order; + int ret; + + /* We use sg_set_buf for coherent allocs, which assumes low memory */ + BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!icm) @@ -112,21 +164,28 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, while (1 << cur_order > npages) --cur_order; - chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order); - if (chunk->mem[chunk->npages].page) { - chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order; - chunk->mem[chunk->npages].offset = 0; + if (coherent) + ret = mthca_alloc_icm_coherent(&dev->pdev->dev, + &chunk->mem[chunk->npages], + cur_order, gfp_mask); + else + ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages], + cur_order, gfp_mask); - if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) { + if (!ret) { + ++chunk->npages; + + if (!coherent && chunk->npages == MTHCA_ICM_CHUNK_LEN) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); if (chunk->nsg <= 0) goto fail; + } + if (chunk->npages == MTHCA_ICM_CHUNK_LEN) chunk = NULL; - } npages -= 1 << cur_order; } else { @@ -136,7 +195,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, } } - if (chunk) { + if (!coherent && chunk) { chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -148,7 +207,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, return icm; fail: - mthca_free_icm(dev, icm); + mthca_free_icm(dev, icm, coherent); return NULL; } @@ -167,7 +226,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT, (table->lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, table->coherent); if (!table->icm[i]) { ret = -ENOMEM; goto out; @@ -175,7 +234,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE, &status) || status) { - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; ret = -ENOMEM; goto out; @@ -204,16 +263,16 @@ void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int o mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; } mutex_unlock(&table->mutex); } -void *mthca_table_find(struct mthca_icm_table *table, int obj) +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle) { - int idx, offset, i; + int idx, offset, dma_offset, i; struct mthca_icm_chunk *chunk; struct mthca_icm *icm; struct page *page = NULL; @@ -225,13 +284,22 @@ void *mthca_table_find(struct mthca_icm_table *table, int obj) idx = (obj & (table->num_obj - 1)) * table->obj_size; icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE]; - offset = idx % MTHCA_TABLE_CHUNK_SIZE; + dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE; if (!icm) goto out; list_for_each_entry(chunk, &icm->chunk_list, list) { for (i = 0; i < chunk->npages; ++i) { + if (dma_handle && dma_offset >= 0) { + if (sg_dma_len(&chunk->mem[i]) > dma_offset) + *dma_handle = sg_dma_address(&chunk->mem[i]) + + dma_offset; + dma_offset -= sg_dma_len(&chunk->mem[i]); + } + /* DMA mapping can merge pages but not split them, + * so if we found the page, dma_handle has already + * been assigned to. */ if (chunk->mem[i].length > offset) { page = chunk->mem[i].page; goto out; @@ -283,7 +351,7 @@ void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem) + int use_lowmem, int use_coherent) { struct mthca_icm_table *table; int num_icm; @@ -302,6 +370,7 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, table->num_obj = nobj; table->obj_size = obj_size; table->lowmem = use_lowmem; + table->coherent = use_coherent; mutex_init(&table->mutex); for (i = 0; i < num_icm; ++i) @@ -314,12 +383,12 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT, (use_lowmem ? 
GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN); + __GFP_NOWARN, use_coherent); if (!table->icm[i]) goto err; if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE, &status) || status) { - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); table->icm[i] = NULL; goto err; } @@ -339,7 +408,7 @@ err: mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); } kfree(table); @@ -357,7 +426,7 @@ void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table) mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE, MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE, &status); - mthca_free_icm(dev, table->icm[i]); + mthca_free_icm(dev, table->icm[i], table->coherent); } kfree(table); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h index 6d42947e1dc..594144145f4 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.h +++ b/drivers/infiniband/hw/mthca/mthca_memfree.h @@ -69,6 +69,7 @@ struct mthca_icm_table { int num_obj; int obj_size; int lowmem; + int coherent; struct mutex mutex; struct mthca_icm *icm[0]; }; @@ -82,17 +83,17 @@ struct mthca_icm_iter { struct mthca_dev; struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages, - gfp_t gfp_mask); -void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm); + gfp_t gfp_mask, int coherent); +void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent); struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev, u64 virt, int obj_size, int nobj, int reserved, - int use_lowmem); + int use_lowmem, int use_coherent); void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table); int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj); -void *mthca_table_find(struct mthca_icm_table *table, int obj); +void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle); int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table, int start, int end); void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table, diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index f71ffa88db3..7d08f2038af 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -524,7 +524,7 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, if (err) goto err_out_mpt_free; - mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key); + mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); BUG_ON(!mr->mem.arbel.mpt); } else mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + @@ -538,7 +538,8 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, if (mthca_is_memfree(dev)) { mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, - mr->mtt->first_seg); + mr->mtt->first_seg, + &mr->mem.arbel.dma_handle); BUG_ON(!mr->mem.arbel.mtts); } else mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; @@ -712,6 +713,9 @@ int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | MTHCA_MTT_FLAG_PRESENT); + dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle, + list_len * sizeof(u64), DMA_TO_DEVICE); + 
fmr->mem.arbel.mpt->key = cpu_to_be32(key); fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 9a5bece3fa5..1d266ac2e09 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -89,6 +89,7 @@ struct mthca_fmr { struct { struct mthca_mpt_entry *mpt; __be64 *mtts; + dma_addr_t dma_handle; } arbel; } mem; }; -- cgit v1.2.3-70-g09d2 From c20e20ab0f3af9a44842ea11287c9ecd034a5d33 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 10 Feb 2007 23:13:12 +0200 Subject: IB/mthca: Merge MR and FMR space on 64-bit systems For Tavor, we currently reserve separate MPT and MTT space for FMRs to avoid abusing the vmalloc space on 32 bit kernels. No such problem exists on 64 bit kernels so let's not do it there. This way we have a shared pool for MR and FMR resources, used on demand. This will also make it possible to write MTTs for regular regions directly from driver. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_mr.c | 20 +++++++++++++++----- drivers/infiniband/hw/mthca/mthca_profile.c | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 7d08f2038af..958c6d5b6bc 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -765,7 +765,7 @@ void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) int mthca_init_mr_table(struct mthca_dev *dev) { unsigned long addr; - int err, i; + int mpts, mtts, err, i; err = mthca_alloc_init(&dev->mr_table.mpt_alloc, dev->limits.num_mpts, @@ -799,13 +799,21 @@ int mthca_init_mr_table(struct mthca_dev *dev) err = -EINVAL; goto err_fmr_mpt; } + mpts = mtts = 1 << i; + } else { + mpts = dev->limits.num_mtt_segs; + mtts = dev->limits.num_mpts; + } + + if (!mthca_is_memfree(dev) && + (dev->mthca_flags & MTHCA_FLAG_FMR)) { addr = pci_resource_start(dev->pdev, 4) + ((pci_resource_len(dev->pdev, 4) - 1) & dev->mr_table.mpt_base); dev->mr_table.tavor_fmr.mpt_base = - ioremap(addr, (1 << i) * sizeof(struct mthca_mpt_entry)); + ioremap(addr, mpts * sizeof(struct mthca_mpt_entry)); if (!dev->mr_table.tavor_fmr.mpt_base) { mthca_warn(dev, "MPT ioremap for FMR failed.\n"); @@ -818,19 +826,21 @@ int mthca_init_mr_table(struct mthca_dev *dev) dev->mr_table.mtt_base); dev->mr_table.tavor_fmr.mtt_base = - ioremap(addr, (1 << i) * MTHCA_MTT_SEG_SIZE); + ioremap(addr, mtts * MTHCA_MTT_SEG_SIZE); if (!dev->mr_table.tavor_fmr.mtt_base) { mthca_warn(dev, "MTT ioremap for FMR failed.\n"); err = -ENOMEM; goto err_fmr_mtt; } + } - err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, i); + if (dev->limits.fmr_reserved_mtts) { + err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1)); if (err) goto err_fmr_mtt_buddy; /* Prevent regular MRs from using FMR keys */ - err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, i); + err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1)); if (err) goto err_reserve_fmr; diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c index 58d44aa3c30..26bf86d1cfc 100644 --- a/drivers/infiniband/hw/mthca/mthca_profile.c +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -277,7 +277,7 @@ u64 
mthca_make_profile(struct mthca_dev *dev, * out of the MR pool. They don't use additional memory, but * we assign them as part of the HCA profile anyway. */ - if (mthca_is_memfree(dev)) + if (mthca_is_memfree(dev) || BITS_PER_LONG == 64) dev->limits.fmr_reserved_mtts = 0; else dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts; -- cgit v1.2.3-70-g09d2 From b2875d4c39759a732203db32f245cc6d8bbdd7cf Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 10 Feb 2007 23:14:25 +0200 Subject: IB/mthca: Always fill MTTs from CPU Speed up memory registration by filling in MTTs directly when the CPU can write directly to the whole table (all mem-free cards, and to Tavor mode on 64-bit systems with the patch I posted earlier). This reduces the number of FW commands needed to register an MR by at least a factor of 2 and speeds up memory registration significantly. Signed-off-by: Michael S. Tsirkin Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_dev.h | 2 + drivers/infiniband/hw/mthca/mthca_mr.c | 82 +++++++++++++++++++++++++++- drivers/infiniband/hw/mthca/mthca_provider.c | 14 ++--- 3 files changed, 89 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index fe5cecf70fe..b7e42efaf43 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -464,6 +464,8 @@ void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd); void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd); +int mthca_write_mtt_size(struct mthca_dev *dev); + struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size); void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt); int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 958c6d5b6bc..6037dd3f87d 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -243,8 +243,8 @@ void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt) kfree(mtt); } -int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, - int start_index, u64 *buffer_list, int list_len) +static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) { struct mthca_mailbox *mailbox; __be64 *mtt_entry; @@ -295,6 +295,84 @@ out: return err; } +int mthca_write_mtt_size(struct mthca_dev *dev) +{ + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + /* + * Be friendly to WRITE_MTT command + * and leave two empty slots for the + * index and reserved fields of the + * mailbox. + */ + return PAGE_SIZE / sizeof (u64) - 2; + + /* For Arbel, all MTTs must fit in the same page. */ + return mthca_is_memfree(dev) ? 
(PAGE_SIZE / sizeof (u64)) : 0x7ffffff; +} + +void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + u64 __iomem *mtts; + int i; + + mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * MTHCA_MTT_SEG_SIZE + + start_index * sizeof (u64); + for (i = 0; i < list_len; ++i) + mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT), + mtts + i); +} + +void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + __be64 *mtts; + dma_addr_t dma_handle; + int i; + int s = start_index * sizeof (u64); + + /* For Arbel, all MTTs must fit in the same page. */ + BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE); + /* Require full segments */ + BUG_ON(s % MTHCA_MTT_SEG_SIZE); + + mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg + + s / MTHCA_MTT_SEG_SIZE, &dma_handle); + + BUG_ON(!mtts); + + for (i = 0; i < list_len; ++i) + mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT); + + dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof (u64), DMA_TO_DEVICE); +} + +int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt, + int start_index, u64 *buffer_list, int list_len) +{ + int size = mthca_write_mtt_size(dev); + int chunk; + + if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy) + return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len); + + while (list_len > 0) { + chunk = min(size, list_len); + if (mthca_is_memfree(dev)) + mthca_arbel_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + else + mthca_tavor_write_mtt_seg(dev, mtt, start_index, + buffer_list, chunk); + + list_len -= chunk; + start_index += chunk; + buffer_list += chunk; + } + + return 0; +} + static inline u32 tavor_hw_index_to_key(u32 ind) { return ind; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 7b96751695e..0725ad7ad9b 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1015,6 +1015,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, int shift, n, len; int i, j, k; int err = 0; + int write_mtt_size; shift = ffs(region->page_size) - 1; @@ -1040,6 +1041,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, i = n = 0; + write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages)); + list_for_each_entry(chunk, ®ion->chunk_list, list) for (j = 0; j < chunk->nmap; ++j) { len = sg_dma_len(&chunk->page_list[j]) >> shift; @@ -1047,14 +1050,11 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, pages[i++] = sg_dma_address(&chunk->page_list[j]) + region->page_size * k; /* - * Be friendly to WRITE_MTT command - * and leave two empty slots for the - * index and reserved fields of the - * mailbox. + * Be friendly to write_mtt and pass it chunks + * of appropriate size. */ - if (i == PAGE_SIZE / sizeof (u64) - 2) { - err = mthca_write_mtt(dev, mr->mtt, - n, pages, i); + if (i == write_mtt_size) { + err = mthca_write_mtt(dev, mr->mtt, n, pages, i); if (err) goto mtt_done; n += i; -- cgit v1.2.3-70-g09d2
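
Editor's note: the last patch above ("IB/mthca: Always fill MTTs from CPU") batches MTT writes in chunks bounded by mthca_write_mtt_size() instead of issuing a WRITE_MTT firmware command per mailbox. The standalone sketch below is an illustration only, not part of any commit; every demo_* name is hypothetical, and only the chunking arithmetic mirrors the loop added to mthca_write_mtt().

/*
 * Illustrative sketch of the MTT write chunking: split a long list of
 * page addresses into segments no larger than the per-call limit.
 * demo_write_mtt_size() stands in for mthca_write_mtt_size(), and
 * demo_write_seg() stands in for the Tavor/Arbel per-segment writers.
 */
#include <stdio.h>

/* Stand-in limit: assume one 4K page of 8-byte MTT entries per write. */
static int demo_write_mtt_size(void)
{
	return 4096 / sizeof(unsigned long long);	/* 512 entries */
}

/* Stand-in writer: just report which index range would be written. */
static void demo_write_seg(int start_index, int list_len)
{
	printf("write MTTs [%d, %d)\n", start_index, start_index + list_len);
}

/* Mirrors the while loop the patch adds to mthca_write_mtt(). */
static void demo_write_mtt(int start_index, int list_len)
{
	int size = demo_write_mtt_size();

	while (list_len > 0) {
		int chunk = list_len < size ? list_len : size;

		demo_write_seg(start_index, chunk);
		list_len    -= chunk;
		start_index += chunk;
	}
}

int main(void)
{
	demo_write_mtt(0, 1300);	/* 1300 entries -> 512 + 512 + 276 */
	return 0;
}

Built with any C compiler, demo_write_mtt(0, 1300) reports three segments of 512, 512 and 276 entries, which is how the driver would batch its calls to the per-segment writers under the assumed 4K page size.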