diff options
Diffstat (limited to 'drivers/infiniband')
56 files changed, 5108 insertions, 2094 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index ba2d6505e9a..69a53d476b5 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -41,4 +41,6 @@ source "drivers/infiniband/ulp/ipoib/Kconfig" source "drivers/infiniband/ulp/srp/Kconfig" +source "drivers/infiniband/ulp/iser/Kconfig" + endmenu diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index eea27322a22..c7ff58c1d0e 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += hw/mthca/ obj-$(CONFIG_IPATH_CORE) += hw/ipath/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ +obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 450adfe0a4f..3f6705f3083 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3152,6 +3152,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, } if (cm_id_priv->alt_av.ah_attr.dlid) { *qp_attr_mask |= IB_QP_ALT_PATH; + qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr; } ret = 0; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a76834edf60..863f64befc7 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -476,7 +476,7 @@ static inline int cma_zero_addr(struct sockaddr *addr) else { ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | - ip6->s6_addr32[3] | ip6->s6_addr32[4]) == 0; + ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; } } diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b38e02a5db3..5ed4dab52a6 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1775,11 +1775,9 @@ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv, void ib_mark_mad_done(struct ib_mad_send_wr_private 
*mad_send_wr) { mad_send_wr->timeout = 0; - if (mad_send_wr->refcount == 1) { - list_del(&mad_send_wr->agent_list); - list_add_tail(&mad_send_wr->agent_list, + if (mad_send_wr->refcount == 1) + list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->done_list); - } } static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, @@ -2098,8 +2096,7 @@ retry: queued_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); - list_del(&mad_list->list); - list_add_tail(&mad_list->list, &send_queue->list); + list_move_tail(&mad_list->list, &send_queue->list); } spin_unlock_irqrestore(&send_queue->lock, flags); diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index d4704e054e3..ebcd5b18177 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -665,8 +665,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, goto out; mad_send_wr->refcount++; - list_del(&mad_send_wr->agent_list); - list_add_tail(&mad_send_wr->agent_list, + list_move_tail(&mad_send_wr->agent_list, &mad_send_wr->mad_agent_priv->send_list); } out: diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 76bf61e9b55..bdf5d509819 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1530,7 +1530,6 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, out_put: put_qp_read(qp); -out: while (wr) { if (is_ud && wr->wr.ud.ah) put_ah_read(wr->wr.ud.ah); @@ -1539,6 +1538,7 @@ out: wr = next; } +out: kfree(user_wr); return ret ? 
ret : in_len; @@ -1963,7 +1963,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, if (!obj) return -ENOMEM; - init_uobj(&obj->uobject, 0, file->ucontext); + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext); down_write(&obj->uobject.mutex); pd = idr_read_pd(cmd.pd_handle, file->ucontext); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5ec2d49e9bb..e725cccc7cd 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -188,7 +188,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_ah_idr, uobj); ib_destroy_ah(ah); - list_del(&uobj->list); kfree(uobj); } @@ -200,7 +199,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_qp_idr, uobj); ib_uverbs_detach_umcast(qp, uqp); ib_destroy_qp(qp); - list_del(&uobj->list); ib_uverbs_release_uevent(file, &uqp->uevent); kfree(uqp); } @@ -213,7 +211,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_cq_idr, uobj); ib_destroy_cq(cq); - list_del(&uobj->list); ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } @@ -225,7 +222,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_srq_idr, uobj); ib_destroy_srq(srq); - list_del(&uobj->list); ib_uverbs_release_uevent(file, uevent); kfree(uevent); } @@ -243,7 +239,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, memobj = container_of(uobj, struct ib_umem_object, uobject); ib_umem_release_on_close(mrdev, &memobj->umem); - list_del(&uobj->list); kfree(memobj); } @@ -252,7 +247,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, idr_remove_uobj(&ib_uverbs_pd_idr, uobj); ib_dealloc_pd(pd); - list_del(&uobj->list); kfree(uobj); } @@ -821,11 +815,12 @@ static void ib_uverbs_remove_one(struct ib_device *device) kref_put(&uverbs_dev->ref, 
ib_uverbs_release_dev); } -static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) { return get_sb_pseudo(fs_type, "infinibandevent:", NULL, - INFINIBANDEVENTFS_MAGIC); + INFINIBANDEVENTFS_MAGIC, mnt); } static struct file_system_type uverbs_event_fs = { diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig index 9ea67c409b6..1db9489f1e8 100644 --- a/drivers/infiniband/hw/ipath/Kconfig +++ b/drivers/infiniband/hw/ipath/Kconfig @@ -1,16 +1,16 @@ config IPATH_CORE - tristate "PathScale InfiniPath Driver" + tristate "QLogic InfiniPath Driver" depends on 64BIT && PCI_MSI && NET ---help--- - This is a low-level driver for PathScale InfiniPath host channel + This is a low-level driver for QLogic InfiniPath host channel adapters (HCAs) based on the HT-400 and PE-800 chips. config INFINIBAND_IPATH - tristate "PathScale InfiniPath Verbs Driver" + tristate "QLogic InfiniPath Verbs Driver" depends on IPATH_CORE && INFINIBAND ---help--- This is a driver that provides InfiniBand verbs support for - PathScale InfiniPath host channel adapters (HCAs). This + QLogic InfiniPath host channel adapters (HCAs). This allows these devices to be used with both kernel upper level protocols such as IP-over-InfiniBand as well as with userspace applications (in conjunction with InfiniBand userspace access). 
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile index b4d084abfd2..b0bf7286413 100644 --- a/drivers/infiniband/hw/ipath/Makefile +++ b/drivers/infiniband/hw/ipath/Makefile @@ -1,4 +1,4 @@ -EXTRA_CFLAGS += -DIPATH_IDSTR='"PathScale kernel.org driver"' \ +EXTRA_CFLAGS += -DIPATH_IDSTR='"QLogic kernel.org driver"' \ -DIPATH_KERN_TYPE=0 obj-$(CONFIG_IPATH_CORE) += ipath_core.o diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h index 48a55247b83..062bd392e7e 100644 --- a/drivers/infiniband/hw/ipath/ipath_common.h +++ b/drivers/infiniband/hw/ipath/ipath_common.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -38,7 +39,8 @@ * to communicate between kernel and user code. */ -/* This is the IEEE-assigned OUI for PathScale, Inc. */ + +/* This is the IEEE-assigned OUI for QLogic Inc. 
InfiniPath */ #define IPATH_SRC_OUI_1 0x00 #define IPATH_SRC_OUI_2 0x11 #define IPATH_SRC_OUI_3 0x75 @@ -96,8 +98,8 @@ struct infinipath_stats { __u64 sps_hwerrs; /* number of times IB link changed state unexpectedly */ __u64 sps_iblink; - /* no longer used; left for compatibility */ - __u64 sps_unused3; + /* kernel receive interrupts that didn't read intstat */ + __u64 sps_fastrcvint; /* number of kernel (port0) packets received */ __u64 sps_port0pkts; /* number of "ethernet" packets sent by driver */ @@ -121,8 +123,7 @@ struct infinipath_stats { __u64 sps_ports; /* list of pkeys (other than default) accepted (0 means not set) */ __u16 sps_pkeys[4]; - /* lids for up to 4 infinipaths, indexed by infinipath # */ - __u16 sps_lid[4]; + __u16 sps_unused16[4]; /* available; maintaining compatible layout */ /* number of user ports per chip (not IB ports) */ __u32 sps_nports; /* not our interrupt, or already handled */ @@ -140,10 +141,8 @@ struct infinipath_stats { * packets if ipath not configured, sma/mad, etc.) */ __u64 sps_krdrops; - /* mlids for up to 4 infinipaths, indexed by infinipath # */ - __u16 sps_mlid[4]; /* pad for future growth */ - __u64 __sps_pad[45]; + __u64 __sps_pad[46]; }; /* @@ -310,6 +309,9 @@ struct ipath_base_info { __u32 spi_rcv_egrchunksize; /* total size of mmap to cover full rcvegrbuffers */ __u32 spi_rcv_egrbuftotlen; + __u32 spi_filler_for_align; + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; } __attribute__ ((aligned(8))); @@ -342,9 +344,9 @@ struct ipath_base_info { /* * Similarly, this is the kernel version going back to the user. It's * slightly different, in that we want to tell if the driver was built as - * part of a PathScale release, or from the driver from OpenIB, kernel.org, - * or a standard distribution, for support reasons. The high bit is 0 for - * non-PathScale, and 1 for PathScale-built/supplied. 
+ * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. * * It's returned by the driver to the user code during initialization in the * spi_sw_version field of ipath_base_info, so the user code can in turn @@ -379,13 +381,7 @@ struct ipath_user_info { */ __u32 spu_rcvhdrsize; - /* - * cache line aligned (64 byte) user address to - * which the rcvhdrtail register will be written by infinipath - * whenever it changes, so that no chip registers are read in - * the performance path. - */ - __u64 spu_rcvhdraddr; + __u64 spu_unused; /* kept for compatible layout */ /* * address of struct base_info to write to @@ -481,7 +477,7 @@ struct ipath_sma_pkt * Data layout in I2C flash (for GUID, etc.) * All fields are little-endian binary unless otherwise stated */ -#define IPATH_FLASH_VERSION 1 +#define IPATH_FLASH_VERSION 2 struct ipath_flash { /* flash layout version (IPATH_FLASH_VERSION) */ __u8 if_fversion; @@ -489,14 +485,14 @@ struct ipath_flash { __u8 if_csum; /* * valid length (in use, protected by if_csum), including - * if_fversion and if_sum themselves) + * if_fversion and if_csum themselves) */ __u8 if_length; /* the GUID, in network order */ __u8 if_guid[8]; /* number of GUIDs to use, starting from if_guid */ __u8 if_numguid; - /* the board serial number, in ASCII */ + /* the (last 10 characters of) board serial number, in ASCII */ char if_serial[12]; /* board mfg date (YYYYMMDD ASCII) */ char if_mfgdate[8]; @@ -508,8 +504,10 @@ struct ipath_flash { __u8 if_powerhour[2]; /* ASCII free-form comment field */ char if_comment[32]; - /* 78 bytes used, min flash size is 128 bytes */ - __u8 if_future[50]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; }; /* @@ -603,14 +601,118 @@ struct infinipath_counters { 
#define INFINIPATH_KPF_INTR 0x1 /* SendPIO per-buffer control */ -#define INFINIPATH_SP_LENGTHP1_MASK 0x3FF -#define INFINIPATH_SP_LENGTHP1_SHIFT 0 -#define INFINIPATH_SP_INTR 0x80000000 -#define INFINIPATH_SP_TEST 0x40000000 -#define INFINIPATH_SP_TESTEBP 0x20000000 +#define INFINIPATH_SP_TEST 0x40 +#define INFINIPATH_SP_TESTEBP 0x20 /* SendPIOAvail bits */ #define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 #define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 3, TID 11, offset 14. + */ + __le32 ver_port_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + + +/* IB - LRH header consts */ +#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. 
*/ +#define SIZE_OF_CRC 1 + +#define IPATH_DEFAULT_P_KEY 0xFFFF +#define IPATH_PERMISSIVE_LID 0xFFFF +#define IPATH_AETH_CREDIT_SHIFT 24 +#define IPATH_AETH_CREDIT_MASK 0x1F +#define IPATH_AETH_CREDIT_INVAL 0x1F +#define IPATH_PSN_MASK 0xFFFFFF +#define IPATH_MSN_MASK 0xFFFFFF +#define IPATH_QPN_MASK 0xFFFFFF +#define IPATH_MULTICAST_LID_BASE 0xC000 +#define IPATH_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from infinipath) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + + +/* sub OpCodes - ith4x */ +#define IPATH_ITH4X_OPCODE_ENCAP 0x81 +#define IPATH_ITH4X_OPCODE_LID_ARP 0x82 + +#define IPATH_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]); +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + #endif /* _IPATH_COMMON_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c index 7ece1135ddf..3efee341c9b 100644 --- a/drivers/infiniband/hw/ipath/ipath_cq.c +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -157,10 +158,21 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, struct ib_ucontext *context, struct ib_udata *udata) { + struct ipath_ibdev *dev = to_idev(ibdev); struct ipath_cq *cq; struct ib_wc *wc; struct ib_cq *ret; + if (entries > ib_ipath_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (dev->n_cqs_allocated == ib_ipath_max_cqs) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + /* * Need to use vmalloc() if we want to support large #s of * entries. @@ -196,6 +208,8 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, ret = &cq->ibcq; + dev->n_cqs_allocated++; + bail: return ret; } @@ -210,9 +224,11 @@ bail: */ int ipath_destroy_cq(struct ib_cq *ibcq) { + struct ipath_ibdev *dev = to_idev(ibcq->device); struct ipath_cq *cq = to_icq(ibcq); tasklet_kill(&cq->comptask); + dev->n_cqs_allocated--; vfree(cq->queue); kfree(cq); diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h index 46762387f5f..f415beda0d3 100644 --- a/drivers/infiniband/hw/ipath/ipath_debug.h +++ b/drivers/infiniband/hw/ipath/ipath_debug.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c index 28ddceb260e..147dd89e21c 100644 --- a/drivers/infiniband/hw/ipath/ipath_diag.c +++ b/drivers/infiniband/hw/ipath/ipath_diag.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -43,10 +44,9 @@ #include <linux/pci.h> #include <asm/uaccess.h> -#include "ipath_common.h" #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" int ipath_diag_inuse; static int diag_set_link; @@ -66,18 +66,20 @@ static struct file_operations diag_file_ops = { .release = ipath_diag_release }; -static struct cdev *diag_cdev; -static struct class_device *diag_class_dev; - -int ipath_diag_init(void) +int ipath_diag_add(struct ipath_devdata *dd) { - return ipath_cdev_init(IPATH_DIAG_MINOR, "ipath_diag", - &diag_file_ops, &diag_cdev, &diag_class_dev); + char name[16]; + + snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit); + + return ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_class_dev); } -void ipath_diag_cleanup(void) +void ipath_diag_remove(struct ipath_devdata *dd) { - ipath_cdev_cleanup(&diag_cdev, &diag_class_dev); + ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_class_dev); } /** @@ -101,8 +103,7 @@ static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, int ret; /* not very efficient, but it works for now */ - if (reg_addr < dd->ipath_kregbase || - reg_end > dd->ipath_kregend) { + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { ret = -EINVAL; goto bail; } @@ -113,7 +114,7 @@ static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, goto bail; } reg_addr++; - uaddr++; + uaddr += sizeof(u64); } ret = 0; bail: @@ -139,8 +140,7 @@ static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, int ret; /* not very efficient, but it works for now */ - if (reg_addr < dd->ipath_kregbase || - reg_end > dd->ipath_kregend) { + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { ret = -EINVAL; goto bail; } @@ -153,7 +153,7 @@ static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, 
writeq(data, reg_addr); reg_addr++; - uaddr++; + uaddr += sizeof(u64); } ret = 0; bail: @@ -191,7 +191,8 @@ static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr, } reg_addr++; - uaddr++; + uaddr += sizeof(u32); + } ret = 0; bail: @@ -230,7 +231,7 @@ static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr, writel(data, reg_addr); reg_addr++; - uaddr++; + uaddr += sizeof(u32); } ret = 0; bail: @@ -239,59 +240,45 @@ bail: static int ipath_diag_open(struct inode *in, struct file *fp) { + int unit = iminor(in) - IPATH_DIAG_MINOR_BASE; struct ipath_devdata *dd; - int unit = 0; /* XXX this is bogus */ - unsigned long flags; int ret; - dd = ipath_lookup(unit); - mutex_lock(&ipath_mutex); - spin_lock_irqsave(&ipath_devs_lock, flags); if (ipath_diag_inuse) { ret = -EBUSY; goto bail; } - list_for_each_entry(dd, &ipath_dev_list, ipath_list) { - /* - * we need at least one infinipath device to be present - * (don't use INITTED, because we want to be able to open - * even if device is in freeze mode, which cleared INITTED). - * There is a small amount of risk to this, which is why we - * also verify kregbase is set. - */ - - if (!(dd->ipath_flags & IPATH_PRESENT) || - !dd->ipath_kregbase) - continue; - - ipath_diag_inuse = 1; - diag_set_link = 0; - ret = 0; + dd = ipath_lookup(unit); + + if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ret = -ENODEV; goto bail; } - ret = -ENODEV; - -bail: - spin_unlock_irqrestore(&ipath_devs_lock, flags); + fp->private_data = dd; + ipath_diag_inuse = 1; + diag_set_link = 0; + ret = 0; /* Only expose a way to reset the device if we make it into diag mode. 
*/ - if (ret == 0) - ipath_expose_reset(&dd->pcidev->dev); + ipath_expose_reset(&dd->pcidev->dev); +bail: mutex_unlock(&ipath_mutex); return ret; } -static int ipath_diag_release(struct inode *i, struct file *f) +static int ipath_diag_release(struct inode *in, struct file *fp) { mutex_lock(&ipath_mutex); ipath_diag_inuse = 0; + fp->private_data = NULL; mutex_unlock(&ipath_mutex); return 0; } @@ -299,17 +286,10 @@ static int ipath_diag_release(struct inode *i, struct file *f) static ssize_t ipath_diag_read(struct file *fp, char __user *data, size_t count, loff_t *off) { - int unit = 0; /* XXX provide for reads on other units some day */ - struct ipath_devdata *dd; + struct ipath_devdata *dd = fp->private_data; void __iomem *kreg_base; ssize_t ret; - dd = ipath_lookup(unit); - if (!dd) { - ret = -ENODEV; - goto bail; - } - kreg_base = dd->ipath_kregbase; if (count == 0) @@ -328,23 +308,16 @@ static ssize_t ipath_diag_read(struct file *fp, char __user *data, ret = count; } -bail: return ret; } static ssize_t ipath_diag_write(struct file *fp, const char __user *data, size_t count, loff_t *off) { - int unit = 0; /* XXX this is bogus */ - struct ipath_devdata *dd; + struct ipath_devdata *dd = fp->private_data; void __iomem *kreg_base; ssize_t ret; - dd = ipath_lookup(unit); - if (!dd) { - ret = -ENODEV; - goto bail; - } kreg_base = dd->ipath_kregbase; if (count == 0) @@ -363,6 +336,5 @@ static ssize_t ipath_diag_write(struct file *fp, const char __user *data, ret = count; } -bail: return ret; } diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index dddcdae736a..823131d58b3 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -38,8 +39,8 @@ #include <linux/vmalloc.h> #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" static void ipath_update_pio_bufs(struct ipath_devdata *); @@ -52,7 +53,7 @@ const char *ipath_get_unit_name(int unit) EXPORT_SYMBOL_GPL(ipath_get_unit_name); -#define DRIVER_LOAD_MSG "PathScale " IPATH_DRV_NAME " loaded: " +#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: " #define PFX IPATH_DRV_NAME ": " /* @@ -74,8 +75,8 @@ MODULE_PARM_DESC(debug, "mask for debug prints"); EXPORT_SYMBOL_GPL(ipath_debug); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("PathScale <support@pathscale.com>"); -MODULE_DESCRIPTION("Pathscale InfiniPath driver"); +MODULE_AUTHOR("QLogic <support@pathscale.com>"); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); const char *ipath_ibcstatus_str[] = { "Disabled", @@ -130,14 +131,6 @@ static struct pci_driver ipath_driver = { .id_table = ipath_pci_tbl, }; -/* - * This is where port 0's rcvhdrtail register is written back; we also - * want nothing else sharing the cache line, so make it a cache line - * in size. Used for all units. 
- */ -volatile __le64 *ipath_port0_rcvhdrtail; -dma_addr_t ipath_port0_rcvhdrtail_dma; -static int port0_rcvhdrtail_refs; static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, u32 *bar0, u32 *bar1) @@ -170,14 +163,13 @@ static void ipath_free_devdata(struct pci_dev *pdev, list_del(&dd->ipath_list); spin_unlock_irqrestore(&ipath_devs_lock, flags); } - dma_free_coherent(&pdev->dev, sizeof(*dd), dd, dd->ipath_dma_addr); + vfree(dd); } static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) { unsigned long flags; struct ipath_devdata *dd; - dma_addr_t dma_addr; int ret; if (!idr_pre_get(&unit_table, GFP_KERNEL)) { @@ -185,15 +177,12 @@ static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) goto bail; } - dd = dma_alloc_coherent(&pdev->dev, sizeof(*dd), &dma_addr, - GFP_KERNEL); - + dd = vmalloc(sizeof(*dd)); if (!dd) { dd = ERR_PTR(-ENOMEM); goto bail; } - - dd->ipath_dma_addr = dma_addr; + memset(dd, 0, sizeof(*dd)); dd->ipath_unit = -1; spin_lock_irqsave(&ipath_devs_lock, flags); @@ -271,47 +260,6 @@ int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp) return nunits; } -static int init_port0_rcvhdrtail(struct pci_dev *pdev) -{ - int ret; - - mutex_lock(&ipath_mutex); - - if (!ipath_port0_rcvhdrtail) { - ipath_port0_rcvhdrtail = - dma_alloc_coherent(&pdev->dev, - IPATH_PORT0_RCVHDRTAIL_SIZE, - &ipath_port0_rcvhdrtail_dma, - GFP_KERNEL); - - if (!ipath_port0_rcvhdrtail) { - ret = -ENOMEM; - goto bail; - } - } - port0_rcvhdrtail_refs++; - ret = 0; - -bail: - mutex_unlock(&ipath_mutex); - - return ret; -} - -static void cleanup_port0_rcvhdrtail(struct pci_dev *pdev) -{ - mutex_lock(&ipath_mutex); - - if (!--port0_rcvhdrtail_refs) { - dma_free_coherent(&pdev->dev, IPATH_PORT0_RCVHDRTAIL_SIZE, - (void *) ipath_port0_rcvhdrtail, - ipath_port0_rcvhdrtail_dma); - ipath_port0_rcvhdrtail = NULL; - } - - mutex_unlock(&ipath_mutex); -} - /* * These next two routines are placeholders in case we don't have per-arch * 
code for controlling write combining. If explicit control of write @@ -336,20 +284,12 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, u32 bar0 = 0, bar1 = 0; u8 rev; - ret = init_port0_rcvhdrtail(pdev); - if (ret < 0) { - printk(KERN_ERR IPATH_DRV_NAME - ": Could not allocate port0_rcvhdrtail: error %d\n", - -ret); - goto bail; - } - dd = ipath_alloc_devdata(pdev); if (IS_ERR(dd)) { ret = PTR_ERR(dd); printk(KERN_ERR IPATH_DRV_NAME ": Could not allocate devdata: error %d\n", -ret); - goto bail_rcvhdrtail; + goto bail; } ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); @@ -424,12 +364,29 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, */ ret = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (ret) { - dev_info(&pdev->dev, "pci_set_dma_mask unit %u " - "fails: %d\n", dd->ipath_unit, ret); + dev_info(&pdev->dev, + "Unable to set DMA mask for unit %u: %d\n", + dd->ipath_unit, ret); goto bail_regions; } - else + else { ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); + ret = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + + } + } + else { + ret = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); } pci_set_master(pdev); @@ -452,7 +409,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_init_pe800_funcs(dd); break; default: - ipath_dev_err(dd, "Found unknown PathScale deviceid 0x%x, " + ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " "failing\n", ent->device); return -ENODEV; } @@ -460,10 +417,10 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, for (j = 0; j < 6; j++) { if (!pdev->resource[j].start) continue; - ipath_cdbg(VERBOSE, "BAR %d start %lx, end %lx, len %lx\n", - j, pdev->resource[j].start, - pdev->resource[j].end, - pci_resource_len(pdev, j)); + 
ipath_cdbg(VERBOSE, "BAR %d start %llx, end %llx, len %llx\n", + j, (unsigned long long)pdev->resource[j].start, + (unsigned long long)pdev->resource[j].end, + (unsigned long long)pci_resource_len(pdev, j)); } if (!addr) { @@ -495,23 +452,23 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ((void __iomem *)dd->ipath_kregbase + len); dd->ipath_physaddr = addr; /* used for io_remap, etc. */ /* for user mmap */ - dd->ipath_kregvirt = (u64 __iomem *) phys_to_virt(addr); - ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p " - "kregvirt %p\n", addr, dd->ipath_kregbase, - dd->ipath_kregvirt); + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", + addr, dd->ipath_kregbase); /* * clear ipath_flags here instead of in ipath_init_chip as it is set * by ipath_setup_htconfig. */ dd->ipath_flags = 0; + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors = 0; if (dd->ipath_f_bus(dd, pdev)) ipath_dev_err(dd, "Failed to setup config space; " "continuing anyway\n"); /* - * set up our interrupt handler; SA_SHIRQ probably not needed, + * set up our interrupt handler; IRQF_SHARED probably not needed, * since MSI interrupts shouldn't be shared but won't hurt for now. * check 0 irq after we return from chip-specific bus setup, since * that can affect this due to setup @@ -520,7 +477,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_dev_err(dd, "irq is 0, BIOS error? 
Interrupts won't " "work\n"); else { - ret = request_irq(pdev->irq, ipath_intr, SA_SHIRQ, + ret = request_irq(pdev->irq, ipath_intr, IRQF_SHARED, IPATH_DRV_NAME, dd); if (ret) { ipath_dev_err(dd, "Couldn't setup irq handler, " @@ -545,6 +502,7 @@ static int __devinit ipath_init_one(struct pci_dev *pdev, ipath_device_create_group(&pdev->dev, dd); ipathfs_add_device(dd); ipath_user_add(dd); + ipath_diag_add(dd); ipath_layer_add(dd); goto bail; @@ -561,9 +519,6 @@ bail_disable: bail_devdata: ipath_free_devdata(pdev, dd); -bail_rcvhdrtail: - cleanup_port0_rcvhdrtail(pdev); - bail: return ret; } @@ -577,8 +532,9 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev) return; dd = pci_get_drvdata(pdev); - ipath_layer_del(dd); - ipath_user_del(dd); + ipath_layer_remove(dd); + ipath_diag_remove(dd); + ipath_user_remove(dd); ipathfs_remove_device(dd); ipath_device_remove_group(&pdev->dev, dd); ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " @@ -594,7 +550,6 @@ static void __devexit ipath_remove_one(struct pci_dev *pdev) pci_disable_device(pdev); ipath_free_devdata(pdev, dd); - cleanup_port0_rcvhdrtail(pdev); } /* general driver use */ @@ -868,7 +823,8 @@ static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, u8 pad, *bthbytes; struct sk_buff *skb, *nskb; - if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) { + if (dd->ipath_port0_skbs && + hdr->sub_opcode == IPATH_ITH4X_OPCODE_ENCAP) { /* * Allocate a new sk_buff to replace the one we give * to the network stack. 
@@ -899,7 +855,7 @@ static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail, /* another ether packet received */ ipath_stats.sps_ether_rpkts++; } - else if (hdr->sub_opcode == OPCODE_LID_ARP) + else if (hdr->sub_opcode == IPATH_ITH4X_OPCODE_LID_ARP) __ipath_layer_rcv_lid(dd, hdr); } @@ -916,8 +872,8 @@ void ipath_kreceive(struct ipath_devdata *dd) const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ u32 etail = -1, l, hdrqtail; - struct ips_message_header *hdr; - u32 eflags, i, etype, tlen, pkttot = 0; + struct ipath_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0, updegr=0, reloop=0; static u64 totcalls; /* stats, may eventually remove */ char emsg[128]; @@ -931,24 +887,18 @@ void ipath_kreceive(struct ipath_devdata *dd) if (test_and_set_bit(0, &dd->ipath_rcv_pending)) goto bail; - if (dd->ipath_port0head == - (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) + l = dd->ipath_port0head; + hdrqtail = (u32) le64_to_cpu(*dd->ipath_hdrqtailptr); + if (l == hdrqtail) goto done; -gotmore: - /* - * read only once at start. If in flood situation, this helps - * performance slightly. If more arrive while we are processing, - * we'll come back here and do them - */ - hdrqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); - - for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) { +reloop: + for (i = 0; l != hdrqtail; i++) { u32 qp; u8 *bthbytes; rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2)); - hdr = (struct ips_message_header *)&rc[1]; + hdr = (struct ipath_message_header *)&rc[1]; /* * could make a network order version of IPATH_KD_QP, and * do the obvious shift before masking to speed this up. 
@@ -956,10 +906,10 @@ gotmore: qp = ntohl(hdr->bth[1]) & 0xffffff; bthbytes = (u8 *) hdr->bth; - eflags = ips_get_hdr_err_flags((__le32 *) rc); - etype = ips_get_rcv_type((__le32 *) rc); + eflags = ipath_hdrget_err_flags((__le32 *) rc); + etype = ipath_hdrget_rcv_type((__le32 *) rc); /* total length */ - tlen = ips_get_length_in_bytes((__le32 *) rc); + tlen = ipath_hdrget_length_in_bytes((__le32 *) rc); ebuf = NULL; if (etype != RCVHQ_RCV_TYPE_EXPECTED) { /* @@ -969,7 +919,7 @@ gotmore: * set ebuf (so we try to copy data) unless the * length requires it. */ - etail = ips_get_index((__le32 *) rc); + etail = ipath_hdrget_index((__le32 *) rc); if (tlen > sizeof(*hdr) || etype == RCVHQ_RCV_TYPE_NON_KD) ebuf = ipath_get_egrbuf(dd, etail, 0); @@ -981,7 +931,7 @@ gotmore: */ if (etype != RCVHQ_RCV_TYPE_NON_KD && etype != - RCVHQ_RCV_TYPE_ERROR && ips_get_ipath_ver( + RCVHQ_RCV_TYPE_ERROR && ipath_hdrget_ipath_ver( hdr->iph.ver_port_tid_offset) != IPS_PROTO_VERSION) { ipath_cdbg(PKT, "Bad InfiniPath protocol version " @@ -994,7 +944,19 @@ gotmore: ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " "tlen=%x opcode=%x egridx=%x: %s\n", eflags, l, etype, tlen, bthbytes[0], - ips_get_index((__le32 *) rc), emsg); + ipath_hdrget_index((__le32 *) rc), emsg); + /* Count local link integrity errors. 
*/ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | + INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } } else if (etype == RCVHQ_RCV_TYPE_NON_KD) { int ret = __ipath_verbs_rcv(dd, rc + 1, ebuf, tlen); @@ -1002,6 +964,9 @@ gotmore: ipath_cdbg(VERBOSE, "received IB packet, " "not SMA (QP=%x)\n", qp); + if (dd->ipath_lli_counter) + dd->ipath_lli_counter--; + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { if (qp == IPATH_KD_QP && bthbytes[0] == ipath_layer_rcv_opcode && @@ -1054,25 +1019,49 @@ gotmore: l += rsize; if (l >= maxcnt) l = 0; + if (etype != RCVHQ_RCV_TYPE_EXPECTED) + updegr = 1; /* - * update for each packet, to help prevent overflows if we - * have lots of packets. + * update head regs on last packet, and every 16 packets. + * Reduce bus traffic, while still trying to prevent + * rcvhdrq overflows, for when the queue is nearly full */ - (void)ipath_write_ureg(dd, ur_rcvhdrhead, - dd->ipath_rhdrhead_intr_off | l, 0); - if (etype != RCVHQ_RCV_TYPE_EXPECTED) - (void)ipath_write_ureg(dd, ur_rcvegrindexhead, - etail, 0); + if (l == hdrqtail || (i && !(i&0xf))) { + u64 lval; + if (l == hdrqtail) /* PE-800 interrupt only on last */ + lval = dd->ipath_rhdrhead_intr_off | l; + else + lval = l; + (void)ipath_write_ureg(dd, ur_rcvhdrhead, lval, 0); + if (updegr) { + (void)ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, 0); + updegr = 0; + } + } + } + + if (!dd->ipath_rhdrhead_intr_off && !reloop) { + /* HT-400 workaround; we can have a race clearing chip + * interrupt with another interrupt about to be delivered, + * and can clear it before it is delivered on the GPIO + * workaround. By doing the extra check here for the + * in-memory tail register updating while we were doing + * earlier packets, we "almost" guarantee we have covered + * that case. 
+ */ + u32 hqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); + if (hqtail != hdrqtail) { + hdrqtail = hqtail; + reloop = 1; /* loop 1 extra time at most */ + goto reloop; + } } pkttot += i; dd->ipath_port0head = l; - if (hdrqtail != (u32)le64_to_cpu(*dd->ipath_hdrqtailptr)) - /* more arrived while we handled first batch */ - goto gotmore; - if (pkttot > ipath_stats.sps_maxpkts_call) ipath_stats.sps_maxpkts_call = pkttot; ipath_stats.sps_port0pkts += pkttot; @@ -1369,26 +1358,20 @@ bail: * @dd: the infinipath device * @pd: the port data * - * this *must* be physically contiguous memory, and for now, - * that limits it to what kmalloc can do. + * this must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). */ int ipath_create_rcvhdrq(struct ipath_devdata *dd, struct ipath_portdata *pd) { - int ret = 0, amt; + int ret = 0; - amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * - sizeof(u32), PAGE_SIZE); if (!pd->port_rcvhdrq) { - /* - * not using REPEAT isn't viable; at 128KB, we can easily - * fail this. The problem with REPEAT is we can block here - * "forever". There isn't an inbetween, unfortunately. We - * could reduce the risk by never freeing the rcvhdrq except - * at unload, but even then, the first time a port is used, - * we could delay for some time... 
- */ + dma_addr_t phys_hdrqtail; gfp_t gfp_flags = GFP_USER | __GFP_COMP; + int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); pd->port_rcvhdrq = dma_alloc_coherent( &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, @@ -1401,6 +1384,16 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, ret = -ENOMEM; goto bail; } + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr failed\n", + pd->port_port); + ret = -ENOMEM; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; pd->port_rcvhdrq_size = amt; @@ -1410,20 +1403,28 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd, (unsigned long) pd->port_rcvhdrq_phys, (unsigned long) pd->port_rcvhdrq_size, pd->port_port); - } else { - /* - * clear for security, sanity, and/or debugging, each - * time we reuse - */ - memset(pd->port_rcvhdrq, 0, amt); + + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx physical\n", + pd->port_port, + (unsigned long long) phys_hdrqtail); } + else + ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + pd->port_port, pd->port_rcvhdrq, + pd->port_rcvhdrq_phys, pd->port_rcvhdrtail_kvaddr, + (unsigned long long)pd->port_rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); + memset((void *)pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); /* * tell chip each time we init it, even if we are re-using previous - * memory (we zero it at process close) + * memory (we zero the register at process close) */ - ipath_cdbg(VERBOSE, "writing port %d rcvhdraddr as %lx\n", - pd->port_port, (unsigned long) pd->port_rcvhdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, pd->port_rcvhdrqtailaddr_phys); ipath_write_kreg_port(dd, 
dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, pd->port_rcvhdrq_phys); @@ -1511,15 +1512,27 @@ void ipath_set_ib_lstate(struct ipath_devdata *dd, int which) [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" }; + int linkcmd = (which >> INFINIPATH_IBCC_LINKCMD_SHIFT) & + INFINIPATH_IBCC_LINKCMD_MASK; + ipath_cdbg(SMA, "Trying to move unit %u to %s, current ltstate " "is %s\n", dd->ipath_unit, - what[(which >> INFINIPATH_IBCC_LINKCMD_SHIFT) & - INFINIPATH_IBCC_LINKCMD_MASK], + what[linkcmd], ipath_ibcstatus_str[ (ipath_read_kreg64 (dd, dd->ipath_kregs->kr_ibcstatus) >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]); + /* flush all queued sends when going to DOWN or INIT, to be sure that + * they don't block SMA and other MAD packets */ + if (!linkcmd || linkcmd == INFINIPATH_IBCC_LINKCMD_INIT) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_ABORT); + ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf, + (unsigned)(dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k) - + dd->ipath_lastport_piobuf); + } ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, dd->ipath_ibcctrl | which); @@ -1638,7 +1651,7 @@ void ipath_shutdown_device(struct ipath_devdata *dd) /* disable IBC */ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; ipath_write_kreg(dd, dd->ipath_kregs->kr_control, - dd->ipath_control); + dd->ipath_control | INFINIPATH_C_FREEZEMODE); /* * clear SerdesEnable and turn the leds off; do this here because @@ -1667,60 +1680,54 @@ void ipath_shutdown_device(struct ipath_devdata *dd) /** * ipath_free_pddata - free a port's allocated data * @dd: the infinipath device - * @port: the port - * @freehdrq: free the port data structure if true + * @pd: the portdata structure * - * when closing, free up any allocated data for a port, if the - * reference count goes to zero - * Note: this also optionally frees the portdata itself! 
- * Any changes here have to be matched up with the reinit case - * of ipath_init_chip(), which calls this routine on reinit after reset. + * free up any allocated data for a port + * This should not touch anything that would affect a simultaneous + * re-allocation of port data, because it is called after ipath_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + * (The only exception to global state is freeing the port0 port0_skbs.) */ -void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq) +void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) { - struct ipath_portdata *pd = dd->ipath_pd[port]; - if (!pd) return; - if (freehdrq) - /* - * only clear and free portdata if we are going to also - * release the hdrq, otherwise we leak the hdrq on each - * open/close cycle - */ - dd->ipath_pd[port] = NULL; - if (freehdrq && pd->port_rcvhdrq) { + + if (pd->port_rcvhdrq) { ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, (unsigned long) pd->port_rcvhdrq_size); dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, pd->port_rcvhdrq, pd->port_rcvhdrq_phys); pd->port_rcvhdrq = NULL; + if (pd->port_rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrqtailaddr_phys); + pd->port_rcvhdrtail_kvaddr = NULL; + } } - if (port && pd->port_rcvegrbuf) { - /* always free this */ - if (pd->port_rcvegrbuf) { - unsigned e; - - for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { - void *base = pd->port_rcvegrbuf[e]; - size_t size = pd->port_rcvegrbuf_size; - - ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " - "chunk %u/%u\n", base, - (unsigned long) size, - e, pd->port_rcvegrbuf_chunks); - dma_free_coherent( - &dd->pcidev->dev, size, base, - pd->port_rcvegrbuf_phys[e]); - } - vfree(pd->port_rcvegrbuf); - pd->port_rcvegrbuf = NULL; - 
vfree(pd->port_rcvegrbuf_phys); - pd->port_rcvegrbuf_phys = NULL; + if (pd->port_port && pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, pd->port_rcvegrbuf_phys[e]); } + vfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + vfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; pd->port_rcvegrbuf_chunks = 0; - } else if (port == 0 && dd->ipath_port0_skbs) { + } else if (pd->port_port == 0 && dd->ipath_port0_skbs) { unsigned e; struct sk_buff **skbs = dd->ipath_port0_skbs; @@ -1732,10 +1739,8 @@ void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq) dev_kfree_skb(skbs[e]); vfree(skbs); } - if (freehdrq) { - kfree(pd->port_tid_pg_list); - kfree(pd); - } + kfree(pd->port_tid_pg_list); + kfree(pd); } static int __init infinipath_init(void) @@ -1806,7 +1811,6 @@ static void cleanup_device(struct ipath_devdata *dd) * re-init */ dd->ipath_kregbase = NULL; - dd->ipath_kregvirt = NULL; dd->ipath_uregbase = 0; dd->ipath_sregbase = 0; dd->ipath_cregbase = 0; @@ -1821,6 +1825,12 @@ static void cleanup_device(struct ipath_devdata *dd) dd->ipath_pioavailregs_phys); dd->ipath_pioavailregs_dma = NULL; } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } if (dd->ipath_pageshadow) { struct page **tmpp = dd->ipath_pageshadow; @@ -1861,10 +1871,14 @@ static void cleanup_device(struct ipath_devdata *dd) /* * free any resources still in use (usually just kernel ports) - * at unload + * at unload; we do for portcnt, not cfgports, because cfgports + * could have changed while we were loaded. 
*/ - for (port = 0; port < dd->ipath_cfgports; port++) - ipath_free_pddata(dd, port, 1); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = dd->ipath_pd[port]; + dd->ipath_pd[port] = NULL; + ipath_free_pddata(dd, pd); + } kfree(dd->ipath_pd); /* * debuggability, in case some cleanup path tries to use it diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c index a2f1ceafcca..3313356ab93 100644 --- a/drivers/infiniband/hw/ipath/ipath_eeprom.c +++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -600,8 +601,31 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd) guid = *(__be64 *) ifp->if_guid; dd->ipath_guid = guid; dd->ipath_nguid = ifp->if_numguid; - memcpy(dd->ipath_serial, ifp->if_serial, - sizeof(ifp->if_serial)); + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] + && ((u8 *)ifp->if_sprefix)[0] != 0xFF) { + /* This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. 
+ */ + char *snp = dd->ipath_serial; + int len; + memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); + snp[sizeof ifp->if_sprefix] = '\0'; + len = strlen(snp); + snp += len; + len = (sizeof dd->ipath_serial) - len; + if (len > sizeof ifp->if_serial) { + len = sizeof ifp->if_serial; + } + memcpy(snp, ifp->if_serial, len); + } else + memcpy(dd->ipath_serial, ifp->if_serial, + sizeof ifp->if_serial); + ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", (unsigned long long) be64_to_cpu(dd->ipath_guid)); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index ada267e41f6..bbaa70e57db 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -38,8 +39,8 @@ #include <asm/pgtable.h> #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" static int ipath_open(struct inode *, struct file *); static int ipath_close(struct inode *, struct file *); @@ -122,6 +123,7 @@ static int ipath_get_base_info(struct ipath_portdata *pd, * on to yet another method of dealing with this */ kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64)pd->port_rcvhdrqtailaddr_phys; kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + @@ -456,7 +458,7 @@ static int ipath_set_part_key(struct ipath_portdata *pd, u16 key) u16 lkey = key & 0x7FFF; int ret; - if (lkey == (IPS_DEFAULT_P_KEY & 0x7FFF)) { + if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) { /* nothing to do; this key always valid */ ret = 0; goto bail; @@ -704,6 +706,15 @@ static int ipath_create_user_egr(struct 
ipath_portdata *pd) unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff; size_t size; int ret; + gfp_t gfp_flags; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; egrcnt = dd->ipath_rcvegrcnt; /* TID number offset for this port */ @@ -720,10 +731,8 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) * memory pressure (creating large files and then copying them over * NFS while doing lots of MPI jobs), we hit some allocation * failures, even though we can sleep... (2.6.10) Still get - * failures at 64K. 32K is the lowest we can go without waiting - * more memory again. It seems likely that the coalescing in - * free_pages, etc. still has issues (as it has had previously - * during 2.6.x development). + * failures at 64K. 32K is the lowest we can go without wasting + * additional memory. 
*/ size = 0x8000; alloced = ALIGN(egrsize * egrcnt, size); @@ -744,12 +753,6 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) goto bail_rcvegrbuf; } for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { - /* - * GFP_USER, but without GFP_FS, so buffer cache can be - * coalesced (we hope); otherwise, even at order 4, - * heavy filesystem activity makes these fail - */ - gfp_t gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; pd->port_rcvegrbuf[e] = dma_alloc_coherent( &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e], @@ -783,11 +786,12 @@ static int ipath_create_user_egr(struct ipath_portdata *pd) bail_rcvegrbuf_phys: for (e = 0; e < pd->port_rcvegrbuf_chunks && - pd->port_rcvegrbuf[e]; e++) + pd->port_rcvegrbuf[e]; e++) { dma_free_coherent(&dd->pcidev->dev, size, pd->port_rcvegrbuf[e], pd->port_rcvegrbuf_phys[e]); + } vfree(pd->port_rcvegrbuf_phys); pd->port_rcvegrbuf_phys = NULL; bail_rcvegrbuf: @@ -802,10 +806,7 @@ static int ipath_do_user_init(struct ipath_portdata *pd, { int ret = 0; struct ipath_devdata *dd = pd->port_dd; - u64 physaddr, uaddr, off, atmp; - struct page *pagep; u32 head32; - u64 head; /* for now, if major version is different, bail */ if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) { @@ -830,54 +831,6 @@ static int ipath_do_user_init(struct ipath_portdata *pd, /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ - /* set up for the rcvhdr Q tail register writeback to user memory */ - if (!uinfo->spu_rcvhdraddr || - !access_ok(VERIFY_WRITE, (u64 __user *) (unsigned long) - uinfo->spu_rcvhdraddr, sizeof(u64))) { - ipath_dbg("Port %d rcvhdrtail addr %llx not valid\n", - pd->port_port, - (unsigned long long) uinfo->spu_rcvhdraddr); - ret = -EINVAL; - goto done; - } - - off = offset_in_page(uinfo->spu_rcvhdraddr); - uaddr = PAGE_MASK & (unsigned long) uinfo->spu_rcvhdraddr; - ret = ipath_get_user_pages_nocopy(uaddr, &pagep); - if (ret) { - dev_info(&dd->pcidev->dev, "Failed to lookup and lock " - "address %llx for 
rcvhdrtail: errno %d\n", - (unsigned long long) uinfo->spu_rcvhdraddr, -ret); - goto done; - } - ipath_stats.sps_pagelocks++; - pd->port_rcvhdrtail_uaddr = uaddr; - pd->port_rcvhdrtail_pagep = pagep; - pd->port_rcvhdrtail_kvaddr = - page_address(pagep); - pd->port_rcvhdrtail_kvaddr += off; - physaddr = page_to_phys(pagep) + off; - ipath_cdbg(VERBOSE, "port %d user addr %llx hdrtailaddr, %llx " - "physical (off=%llx)\n", - pd->port_port, - (unsigned long long) uinfo->spu_rcvhdraddr, - (unsigned long long) physaddr, (unsigned long long) off); - ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, - pd->port_port, physaddr); - atmp = ipath_read_kreg64_port(dd, - dd->ipath_kregs->kr_rcvhdrtailaddr, - pd->port_port); - if (physaddr != atmp) { - ipath_dev_err(dd, - "Catastrophic software error, " - "RcvHdrTailAddr%u written as %llx, " - "read back as %llx\n", pd->port_port, - (unsigned long long) physaddr, - (unsigned long long) atmp); - ret = -EINVAL; - goto done; - } - /* for right now, kernel piobufs are at end, so port 1 is at 0 */ pd->port_piobufs = dd->ipath_piobufbase + dd->ipath_pbufsport * (pd->port_port - @@ -896,26 +849,18 @@ static int ipath_do_user_init(struct ipath_portdata *pd, ret = ipath_create_user_egr(pd); if (ret) goto done; - /* enable receives now */ - /* atomically set enable bit for this port */ - set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, - &dd->ipath_rcvctrl); /* - * set the head registers for this port to the current values + * set the eager head register for this port to the current values * of the tail pointers, since we don't know if they were * updated on last use of the port. 
*/ - head32 = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); - head = (u64) head32; - ipath_write_ureg(dd, ur_rcvhdrhead, head, pd->port_port); head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); dd->ipath_lastegrheads[pd->port_port] = -1; dd->ipath_lastrcvhdrqtails[pd->port_port] = -1; - ipath_cdbg(VERBOSE, "Wrote port%d head %llx, egrhead %x from " - "tail regs\n", pd->port_port, - (unsigned long long) head, head32); + ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", + pd->port_port, head32); pd->port_tidcursor = 0; /* start at beginning after open */ /* * now enable the port; the tail registers will be written to memory @@ -924,24 +869,76 @@ static int ipath_do_user_init(struct ipath_portdata *pd, * transition from 0 to 1, so clear it first, then set it as part of * enabling the port. This will (very briefly) affect any other * open ports, but it shouldn't be long enough to be an issue. + * We explicitly set the in-memory copy to 0 beforehand, so we don't + * have to wait to be sure the DMA update has happened. 
*/ + *pd->port_rcvhdrtail_kvaddr = 0ULL; + set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port, + &dd->ipath_rcvctrl); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD); ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl); - done: return ret; } + +/* common code for the mappings on dma_alloc_coherent mem */ +static int ipath_mmap_mem(struct vm_area_struct *vma, + struct ipath_portdata *pd, unsigned len, + int write_ok, dma_addr_t addr, char *what) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned pfn = (unsigned long)addr >> PAGE_SHIFT; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + dev_info(&dd->pcidev->dev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + dev_info(&dd->pcidev->dev, + "%s port%u mmap of %lx, %x bytes r%c failed: %d\n", + what, pd->port_port, (unsigned long)addr, len, + write_ok?'w':'o', ret); + else + ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes r%c\n", + what, pd->port_port, (unsigned long)addr, len, + write_ok?'w':'o'); +bail: + return ret; +} + static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, u64 ureg) { unsigned long phys; int ret; - /* it's the real hardware, so io_remap works */ - + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their port + * in the chip. 
+ */ if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen " "%lx > PAGE\n", vma->vm_end - vma->vm_start); @@ -967,10 +964,11 @@ static int mmap_piobufs(struct vm_area_struct *vma, int ret; /* - * When we map the PIO buffers, we want to map them as writeonly, no - * read possible. + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible. This prevents access to previous + * process data, and catches users who might try to read the i/o + * space due to a bug. */ - if ((vma->vm_end - vma->vm_start) > (dd->ipath_pbufsport * dd->ipath_palign)) { dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " @@ -981,11 +979,10 @@ static int mmap_piobufs(struct vm_area_struct *vma, } phys = dd->ipath_physaddr + pd->port_piobufs; + /* - * Do *NOT* mark this as non-cached (PWT bit), or we don't get the + * Don't mark this as non-cached, or we don't get the * write combining behavior we want on the PIO buffers! - * vma->vm_page_prot = - * pgprot_noncached(vma->vm_page_prot); */ if (vma->vm_flags & VM_READ) { @@ -997,8 +994,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, } /* don't allow them to later change to readable with mprotect */ - - vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags &= ~VM_MAYREAD; vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, @@ -1017,11 +1013,6 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, dma_addr_t *phys; int ret; - if (!pd->port_rcvegrbuf) { - ret = -EFAULT; - goto bail; - } - size = pd->port_rcvegrbuf_size; total_size = pd->port_rcvegrbuf_chunks * size; if ((vma->vm_end - vma->vm_start) > total_size) { @@ -1039,13 +1030,12 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, ret = -EPERM; goto bail; } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; start = vma->vm_start; phys = pd->port_rcvegrbuf_phys; - /* don't allow them to 
later change to writeable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; - for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT, size, vma->vm_page_prot); @@ -1058,78 +1048,6 @@ bail: return ret; } -static int mmap_rcvhdrq(struct vm_area_struct *vma, - struct ipath_portdata *pd) -{ - struct ipath_devdata *dd = pd->port_dd; - size_t total_size; - int ret; - - /* - * kmalloc'ed memory, physically contiguous; this is from - * spi_rcvhdr_base; we allow user to map read-write so they can - * write hdrq entries to allow protocol code to directly poll - * whether a hdrq entry has been written. - */ - total_size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * - sizeof(u32), PAGE_SIZE); - if ((vma->vm_end - vma->vm_start) > total_size) { - dev_info(&dd->pcidev->dev, - "FAIL on rcvhdrq: reqlen %lx > actual %lx\n", - vma->vm_end - vma->vm_start, - (unsigned long) total_size); - ret = -EFAULT; - goto bail; - } - - ret = remap_pfn_range(vma, vma->vm_start, - pd->port_rcvhdrq_phys >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); -bail: - return ret; -} - -static int mmap_pioavailregs(struct vm_area_struct *vma, - struct ipath_portdata *pd) -{ - struct ipath_devdata *dd = pd->port_dd; - int ret; - - /* - * when we map the PIO bufferavail registers, we want to map them as - * readonly, no write possible. 
- * - * kmalloc'ed memory, physically contiguous, one page only, readonly - */ - - if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { - dev_info(&dd->pcidev->dev, "FAIL on pioavailregs_dma: " - "reqlen %lx > actual %lx\n", - vma->vm_end - vma->vm_start, - (unsigned long) PAGE_SIZE); - ret = -EFAULT; - goto bail; - } - - if (vma->vm_flags & VM_WRITE) { - dev_info(&dd->pcidev->dev, - "Can't map pioavailregs as writable (flags=%lx)\n", - vma->vm_flags); - ret = -EPERM; - goto bail; - } - - /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; - - ret = remap_pfn_range(vma, vma->vm_start, - dd->ipath_pioavailregs_phys >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot); -bail: - return ret; -} - /** * ipath_mmap - mmap various structures into user space * @fp: the file pointer @@ -1149,6 +1067,7 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) pd = port_fp(fp); dd = pd->port_dd; + /* * This is the ipath_do_user_init() code, mapping the shared buffers * into the user process. The address referred to by vm_pgoff is the @@ -1158,28 +1077,59 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) pgaddr = vma->vm_pgoff << PAGE_SHIFT; /* - * note that ureg does *NOT* have the kregvirt as part of it, to be - * sure that for 32 bit programs, we don't end up trying to map a > - * 44 address. Has to match ipath_get_base_info() code that sets - * __spi_uregbase + * Must fit in 40 bits for our hardware; some checked elsewhere, + * but we'll be paranoid. Check for 0 is mostly in case one of the + * allocations failed, but user called mmap anyway. We want to catch + * that before it can match. 
*/ + if (!pgaddr || pgaddr >= (1ULL<<40)) { + ipath_dev_err(dd, "Bad phys addr %llx, start %lx, end %lx\n", + (unsigned long long)pgaddr, vma->vm_start, vma->vm_end); + return -EINVAL; + } + /* just the offset of the port user registers, not physical addr */ ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port; ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n", (unsigned long long) pgaddr, vma->vm_start, vma->vm_end - vma->vm_start); - if (pgaddr == ureg) + if (vma->vm_start & (PAGE_SIZE-1)) { + ipath_dev_err(dd, + "vm_start not aligned: %lx, end=%lx phys %lx\n", + vma->vm_start, vma->vm_end, (unsigned long)pgaddr); + ret = -EINVAL; + } + else if (pgaddr == ureg) ret = mmap_ureg(vma, dd, ureg); else if (pgaddr == pd->port_piobufs) ret = mmap_piobufs(vma, dd, pd); else if (pgaddr == (u64) pd->port_rcvegr_phys) ret = mmap_rcvegrbufs(vma, pd); - else if (pgaddr == (u64) pd->port_rcvhdrq_phys) - ret = mmap_rcvhdrq(vma, pd); + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) { + /* + * The rcvhdrq itself; readonly except on HT-400 (so have + * to allow writable mapping), multiple pages, contiguous + * from an i/o perspective. 
+ */ + unsigned total_size = + ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize + * sizeof(u32), PAGE_SIZE); + ret = ipath_mmap_mem(vma, pd, total_size, 1, + pd->port_rcvhdrq_phys, + "rcvhdrq"); + } + else if (pgaddr == (u64)pd->port_rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + pd->port_rcvhdrqtailaddr_phys, + "rcvhdrq tail"); else if (pgaddr == dd->ipath_pioavailregs_phys) - ret = mmap_pioavailregs(vma, pd); + /* in-memory copy of pioavail registers */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + dd->ipath_pioavailregs_phys, + "pioavail registers"); else ret = -EINVAL; @@ -1442,16 +1392,16 @@ done: static int ipath_open(struct inode *in, struct file *fp) { - int ret, minor; + int ret, user_minor; mutex_lock(&ipath_mutex); - minor = iminor(in); + user_minor = iminor(in) - IPATH_USER_MINOR_BASE; ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", - (long)in->i_rdev, minor); + (long)in->i_rdev, user_minor); - if (minor) - ret = find_free_port(minor - 1, fp); + if (user_minor) + ret = find_free_port(user_minor - 1, fp); else ret = find_best_unit(fp); @@ -1536,53 +1486,54 @@ static int ipath_close(struct inode *in, struct file *fp) } if (dd->ipath_kregbase) { - if (pd->port_rcvhdrtail_uaddr) { - pd->port_rcvhdrtail_uaddr = 0; - pd->port_rcvhdrtail_kvaddr = NULL; - ipath_release_user_pages_on_close( - &pd->port_rcvhdrtail_pagep, 1); - pd->port_rcvhdrtail_pagep = NULL; - ipath_stats.sps_pageunlocks++; - } - ipath_write_kreg_port( - dd, dd->ipath_kregs->kr_rcvhdrtailaddr, - port, 0ULL); - ipath_write_kreg_port( - dd, dd->ipath_kregs->kr_rcvhdraddr, - pd->port_port, 0); + int i; + /* atomically clear receive enable port. 
*/ + clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port, + &dd->ipath_rcvctrl); + ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* and read back from chip to be sure that nothing + * else is in flight when we do the rest */ + (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); /* clean up the pkeys for this port user */ ipath_clean_part_key(pd, dd); - if (port < dd->ipath_cfgports) { - int i = dd->ipath_pbufsport * (port - 1); - ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); - /* atomically clear receive enable port. */ - clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port, - &dd->ipath_rcvctrl); - ipath_write_kreg( - dd, - dd->ipath_kregs->kr_rcvctrl, - dd->ipath_rcvctrl); - - if (dd->ipath_pageshadow) - unlock_expected_tids(pd); - ipath_stats.sps_ports--; - ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", - pd->port_comm, pd->port_pid, - dd->ipath_unit, port); - } + /* + * be paranoid, and never write 0's to these, just use an + * unused part of the port 0 tail page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the port, it should stop "soon", + * even if a packet or two is already in flight after we + * disabled the port. 
+ */ + ipath_write_kreg_port(dd, + dd->ipath_kregs->kr_rcvhdrtailaddr, port, + dd->ipath_dummy_hdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, dd->ipath_dummy_hdrq_phys); + + i = dd->ipath_pbufsport * (port - 1); + ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); + + if (dd->ipath_pageshadow) + unlock_expected_tids(pd); + ipath_stats.sps_ports--; + ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", + pd->port_comm, pd->port_pid, + dd->ipath_unit, port); + + dd->ipath_f_clear_tids(dd, pd->port_port); } pd->port_cnt = 0; pd->port_pid = 0; - dd->ipath_f_clear_tids(dd, pd->port_port); - - ipath_free_pddata(dd, pd->port_port, 0); - + dd->ipath_pd[pd->port_port] = NULL; /* before releasing mutex */ mutex_unlock(&ipath_mutex); + ipath_free_pddata(dd, pd); /* after releasing the mutex */ return ret; } @@ -1859,19 +1810,12 @@ int ipath_user_add(struct ipath_devdata *dd) "error %d\n", -ret); goto bail; } - ret = ipath_diag_init(); - if (ret < 0) { - ipath_dev_err(dd, "Unable to set up diag support: " - "error %d\n", -ret); - goto bail_sma; - } - ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, &wildcard_class_dev); if (ret < 0) { ipath_dev_err(dd, "Could not create wildcard " "minor: error %d\n", -ret); - goto bail_diag; + goto bail_sma; } atomic_set(&user_setup, 1); @@ -1880,31 +1824,28 @@ int ipath_user_add(struct ipath_devdata *dd) snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, - &dd->cdev, &dd->class_dev); + &dd->user_cdev, &dd->user_class_dev); if (ret < 0) ipath_dev_err(dd, "Could not create user minor %d, %s\n", dd->ipath_unit + 1, name); goto bail; -bail_diag: - ipath_diag_cleanup(); bail_sma: user_cleanup(); bail: return ret; } -void ipath_user_del(struct ipath_devdata *dd) +void ipath_user_remove(struct ipath_devdata *dd) { - cleanup_cdev(&dd->cdev, &dd->class_dev); + cleanup_cdev(&dd->user_cdev, &dd->user_class_dev); if 
(atomic_dec_return(&user_count) == 0) { if (atomic_read(&user_setup) == 0) goto bail; cleanup_cdev(&wildcard_cdev, &wildcard_class_dev); - ipath_diag_cleanup(); user_cleanup(); atomic_set(&user_setup, 0); @@ -1912,3 +1853,4 @@ void ipath_user_del(struct ipath_devdata *dd) bail: return; } + diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index e274120567e..0936d8e8d70 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,7 +32,6 @@ */ #include <linux/version.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/mount.h> @@ -542,13 +542,14 @@ bail: return ret; } -static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int ipathfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { - ipath_super = get_sb_single(fs_type, flags, data, - ipathfs_fill_super); - return ipath_super; + int ret = get_sb_single(fs_type, flags, data, + ipathfs_fill_super, mnt); + if (ret >= 0) + ipath_super = mnt->mnt_sb; + return ret; } static void ipathfs_kill_super(struct super_block *s) diff --git a/drivers/infiniband/hw/ipath/ipath_ht400.c b/drivers/infiniband/hw/ipath/ipath_ht400.c index fac0a2b74de..3db015da6e7 100644 --- a/drivers/infiniband/hw/ipath/ipath_ht400.c +++ b/drivers/infiniband/hw/ipath/ipath_ht400.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -1572,7 +1573,6 @@ void ipath_init_ht400_funcs(struct ipath_devdata *dd) dd->ipath_f_reset = ipath_setup_ht_reset; dd->ipath_f_get_boardname = ipath_ht_boardname; dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; - dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; dd->ipath_f_early_init = ipath_ht_early_init; dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors; dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes; diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index dc83250d26a..414cdd1d80a 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -35,7 +36,7 @@ #include <linux/vmalloc.h> #include "ipath_kernel.h" -#include "ips_common.h" +#include "ipath_common.h" /* * min buffers we want to have per port, after driver @@ -114,6 +115,7 @@ static int create_port0_egr(struct ipath_devdata *dd) "eager TID %u\n", e); while (e != 0) dev_kfree_skb(skbs[--e]); + vfree(skbs); ret = -ENOMEM; goto bail; } @@ -275,7 +277,7 @@ static int init_chip_first(struct ipath_devdata *dd, pd->port_port = 0; pd->port_cnt = 1; /* The port 0 pkey table is used by the layer interface. 
*/ - pd->port_pkeys[0] = IPS_DEFAULT_P_KEY; + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; dd->ipath_rcvtidcnt = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); dd->ipath_rcvtidbase = @@ -409,17 +411,8 @@ static int init_pioavailregs(struct ipath_devdata *dd) /* and its length */ dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]); - if (dd->ipath_unit * 64 > (IPATH_PORT0_RCVHDRTAIL_SIZE - 64)) { - ipath_dev_err(dd, "unit %u too large for port 0 " - "rcvhdrtail buffer size\n", dd->ipath_unit); - ret = -ENODEV; - } - else - ret = 0; + ret = 0; - /* so we can get current tail in ipath_kreceive(), per chip */ - dd->ipath_hdrqtailptr = &ipath_port0_rcvhdrtail[ - dd->ipath_unit * (64 / sizeof(*ipath_port0_rcvhdrtail))]; done: return ret; } @@ -652,8 +645,9 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) { int ret = 0, i; u32 val32, kpiobufs; - u64 val, atmp; + u64 val; struct ipath_portdata *pd = NULL; /* keep gcc4 happy */ + gfp_t gfp_flags = GFP_USER | __GFP_COMP; ret = init_housekeeping(dd, &pd, reinit); if (ret) @@ -775,24 +769,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) goto done; } - val = ipath_port0_rcvhdrtail_dma + dd->ipath_unit * 64; - - /* verify that the alignment requirement was met */ - ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, - 0, val); - atmp = ipath_read_kreg64_port( - dd, dd->ipath_kregs->kr_rcvhdrtailaddr, 0); - if (val != atmp) { - ipath_dev_err(dd, "Catastrophic software error, " - "RcvHdrTailAddr0 written as %llx, " - "read back as %llx from %x\n", - (unsigned long long) val, - (unsigned long long) atmp, - dd->ipath_kregs->kr_rcvhdrtailaddr); - ret = -EINVAL; - goto done; - } - ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP); /* @@ -836,25 +812,45 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) /* clear any interrups up to this point (ints still not enabled) */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); - 
ipath_stats.sps_lid[dd->ipath_unit] = dd->ipath_lid; - /* * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing * re-init, the simplest way to handle this is to free * existing, and re-allocate. */ - if (reinit) - ipath_free_pddata(dd, 0, 0); + if (reinit) { + struct ipath_portdata *pd = dd->ipath_pd[0]; + dd->ipath_pd[0] = NULL; + ipath_free_pddata(dd, pd); + } dd->ipath_f_tidtemplate(dd); ret = ipath_create_rcvhdrq(dd, pd); - if (!ret) + if (!ret) { + dd->ipath_hdrqtailptr = + (volatile __le64 *)pd->port_rcvhdrtail_kvaddr; ret = create_port0_egr(dd); + } if (ret) ipath_dev_err(dd, "failed to allocate port 0 (kernel) " "rcvhdrq and/or egr bufs\n"); else enable_chip(dd, pd, reinit); + + if (!ret && !reinit) { + /* used when we close a port, for DMA already in flight at close */ + dd->ipath_dummy_hdrq = dma_alloc_coherent( + &dd->pcidev->dev, pd->port_rcvhdrq_size, + &dd->ipath_dummy_hdrq_phys, + gfp_flags); + if (!dd->ipath_dummy_hdrq ) { + dev_info(&dd->pcidev->dev, + "Couldn't allocate 0x%lx bytes for dummy hdrq\n", + pd->port_rcvhdrq_size); + /* fallback to just 0'ing */ + dd->ipath_dummy_hdrq_phys = 0UL; + } + } + /* * cause retrigger of pending interrupts ignored during init, * even if we had errors diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 3e72a1fe3d7..280e732660a 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -33,9 +34,10 @@ #include <linux/pci.h> #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" +/* These are all rcv-related errors which we want to count for stats */ #define E_SUM_PKTERRS \ (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \ @@ -44,6 +46,7 @@ INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \ INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP) +/* These are all send-related errors which we want to count for stats */ #define E_SUM_ERRS \ (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \ INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ @@ -51,6 +54,18 @@ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ INFINIPATH_E_INVALIDADDR) +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RUNEXPCHAR) + static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { unsigned long sbuf[4]; @@ -100,9 +115,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) if (ipath_debug & __IPATH_PKTDBG) printk("\n"); } - if ((errs & (INFINIPATH_E_SDROPPEDDATAPKT | - INFINIPATH_E_SDROPPEDSMPPKT | - INFINIPATH_E_SMINPKTLEN)) && + if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* * This can happen when SMA is trying to bring the link @@ -111,11 +124,9 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) * valid. 
We don't want to confuse people, so we just * don't print them, except at debug */ - ipath_dbg("Ignoring pktsend errors %llx, because not " - "yet active\n", (unsigned long long) errs); - ignore_this_time = INFINIPATH_E_SDROPPEDDATAPKT | - INFINIPATH_E_SDROPPEDSMPPKT | - INFINIPATH_E_SMINPKTLEN; + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; } return ignore_this_time; @@ -156,7 +167,29 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd, */ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); lstate = val & IPATH_IBSTATE_MASK; - if (lstate == IPATH_IBSTATE_INIT || lstate == IPATH_IBSTATE_ARM || + + /* + * this is confusing enough when it happens that I want to always put it + * on the console and in the logs. If it was a requested state change, + * we'll have already cleared the flags, so we won't print this warning + */ + if ((lstate != IPATH_IBSTATE_ARM && lstate != IPATH_IBSTATE_ACTIVE) + && (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { + dev_info(&dd->pcidev->dev, "Link state changed from %s to %s\n", + (dd->ipath_flags & IPATH_LINKARMED) ? 
"ARM" : "ACTIVE", + ib_linkstate(lstate)); + /* + * Flush all queued sends when link went to DOWN or INIT, + * to be sure that they don't block SMA and other MAD packets + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + INFINIPATH_S_ABORT); + ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf, + (unsigned)(dd->ipath_piobcnt2k + + dd->ipath_piobcnt4k) - + dd->ipath_lastport_piobuf); + } + else if (lstate == IPATH_IBSTATE_INIT || lstate == IPATH_IBSTATE_ARM || lstate == IPATH_IBSTATE_ACTIVE) { /* * only print at SMA if there is a change, debug if not @@ -229,6 +262,7 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd, | IPATH_LINKACTIVE | IPATH_LINKARMED); *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + dd->ipath_lli_counter = 0; if (!noprint) { if (((dd->ipath_lastibcstat >> INFINIPATH_IBCS_LINKSTATE_SHIFT) & @@ -350,7 +384,7 @@ static unsigned handle_frequent_errors(struct ipath_devdata *dd, return supp_msgs; } -static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) +static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) { char msg[512]; u64 ignore_this_time = 0; @@ -379,6 +413,19 @@ static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) if (errs & E_SUM_ERRS) ignore_this_time = handle_e_sum_errs(dd, errs); + else if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. 
We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } if (supp_msgs == 250000) { /* @@ -397,7 +444,7 @@ static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) & ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL)) ipath_dev_err(dd, "Disabling error(s) %llx because " - "occuring too frequently (%s)\n", + "occurring too frequently (%s)\n", (unsigned long long) (dd->ipath_maskederrs & ~dd->ipath_ignorederrs), msg); @@ -434,7 +481,7 @@ static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) INFINIPATH_E_IBSTATUSCHANGED); } if (!errs) - return; + return 0; if (!noprint) /* @@ -493,10 +540,10 @@ static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) continue; if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) { - dd->ipath_lastrcvhdrqtails[i] = tl; - pd->port_hdrqfull++; if (i == 0) chkerrpkts = 1; + dd->ipath_lastrcvhdrqtails[i] = tl; + pd->port_hdrqfull++; } } } @@ -558,9 +605,7 @@ static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs) wake_up_interruptible(&ipath_sma_state_wait); } - if (chkerrpkts) - /* process possible error packets in hdrq */ - ipath_kreceive(dd); + return chkerrpkts; } /* this is separate to allow for better optimization of ipath_intr() */ @@ -678,7 +723,12 @@ set: dd->ipath_sendctrl); } -static void handle_rcv(struct ipath_devdata *dd, u32 istat) +/* + * Handle receive interrupts for user ports; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll + */ +static void handle_urcv(struct ipath_devdata *dd, u32 istat) { u64 portr; int i; @@ -688,22 +738,17 @@ static void handle_rcv(struct ipath_devdata *dd, u32 istat) infinipath_i_rcvavail_mask) | ((istat >> INFINIPATH_I_RCVURG_SHIFT) & 
infinipath_i_rcvurg_mask); - for (i = 0; i < dd->ipath_cfgports; i++) { + for (i = 1; i < dd->ipath_cfgports; i++) { struct ipath_portdata *pd = dd->ipath_pd[i]; - if (portr & (1 << i) && pd && - pd->port_cnt) { - if (i == 0) - ipath_kreceive(dd); - else if (test_bit(IPATH_PORT_WAITING_RCV, - &pd->port_flag)) { - int rcbit; - clear_bit(IPATH_PORT_WAITING_RCV, - &pd->port_flag); - rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT; - clear_bit(1UL << rcbit, &dd->ipath_rcvctrl); - wake_up_interruptible(&pd->port_wait); - rcvdint = 1; - } + if (portr & (1 << i) && pd && pd->port_cnt && + test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) { + int rcbit; + clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag); + rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT; + clear_bit(1UL << rcbit, &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; } } if (rcvdint) { @@ -719,16 +764,19 @@ static void handle_rcv(struct ipath_devdata *dd, u32 istat) irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) { struct ipath_devdata *dd = data; - u32 istat; + u32 istat, chk0rcv = 0; ipath_err_t estat = 0; - static unsigned unexpected = 0; irqreturn_t ret; + u32 oldhead, curtail; + static unsigned unexpected = 0; + static const u32 port0rbits = (1U<<INFINIPATH_I_RCVAVAIL_SHIFT) | + (1U<<INFINIPATH_I_RCVURG_SHIFT); + + ipath_stats.sps_ints++; - if(!(dd->ipath_flags & IPATH_PRESENT)) { - /* this is mostly so we don't try to touch the chip while - * it is being reset */ + if (!(dd->ipath_flags & IPATH_PRESENT)) { /* - * This return value is perhaps odd, but we do not want the + * This return value is not great, but we do not want the * interrupt core code to remove our interrupt handler * because we don't appear to be handling an interrupt * during a chip reset. 
@@ -736,7 +784,51 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) return IRQ_HANDLED; } + /* + * this needs to be flags&initted, not statusp, so we keep + * taking interrupts even after link goes down, etc. + * Also, we *must* clear the interrupt at some point, or we won't + * take it again, which can be real bad for errors, etc... + */ + + if (!(dd->ipath_flags & IPATH_INITTED)) { + ipath_bad_intr(dd, &unexpected); + ret = IRQ_NONE; + goto bail; + } + + /* + * We try to avoid reading the interrupt status register, since + * that's a PIO read, and stalls the processor for up to about + * ~0.25 usec. The idea is that if we processed a port0 packet, + * we blindly clear the port 0 receive interrupt bits, and nothing + * else, then return. If other interrupts are pending, the chip + * will re-interrupt us as soon as we write the intclear register. + * We then won't process any more kernel packets (if not the 2nd + * time, then the 3rd or 4th) and we'll then handle the other + * interrupts. We clear the interrupts first so that we don't + * lose intr for later packets that arrive while we are processing. 
+ */ + oldhead = dd->ipath_port0head; + curtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr); + if (oldhead != curtail) { + if (dd->ipath_flags & IPATH_GPIO_INTR) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) (1 << 2)); + istat = port0rbits | INFINIPATH_I_GPIO; + } + else + istat = port0rbits; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + ipath_kreceive(dd); + if (oldhead != dd->ipath_port0head) { + ipath_stats.sps_fastrcvint++; + goto done; + } + } + istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus); + if (unlikely(!istat)) { ipath_stats.sps_nullintr++; ret = IRQ_NONE; /* not our interrupt, or already handled */ @@ -749,31 +841,17 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) goto bail; } - ipath_stats.sps_ints++; - - /* - * this needs to be flags&initted, not statusp, so we keep - * taking interrupts even after link goes down, etc. - * Also, we *must* clear the interrupt at some point, or we won't - * take it again, which can be real bad for errors, etc... 
- */ - - if (!(dd->ipath_flags & IPATH_INITTED)) { - ipath_bad_intr(dd, &unexpected); - ret = IRQ_NONE; - goto bail; - } if (unexpected) unexpected = 0; - ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat); - - if (istat & ~infinipath_i_bitsextant) + if (unlikely(istat & ~infinipath_i_bitsextant)) ipath_dev_err(dd, "interrupt with unknown interrupts %x set\n", istat & (u32) ~ infinipath_i_bitsextant); + else + ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat); - if (istat & INFINIPATH_I_ERROR) { + if (unlikely(istat & INFINIPATH_I_ERROR)) { ipath_stats.sps_errints++; estat = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errorstatus); @@ -788,10 +866,18 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) ipath_dev_err(dd, "Read of error status failed " "(all bits set); ignoring\n"); else - handle_errors(dd, estat); + if (handle_errors(dd, estat)) + /* force calling ipath_kreceive() */ + chk0rcv = 1; } if (istat & INFINIPATH_I_GPIO) { + /* + * Packets are available in the port 0 rcv queue. + * Eventually this needs to be generalized to check + * IPATH_GPIO_INTR, and the specific GPIO bit, if + * GPIO interrupts are used for anything else. + */ if (unlikely(!(dd->ipath_flags & IPATH_GPIO_INTR))) { u32 gpiostatus; gpiostatus = ipath_read_kreg32( @@ -804,27 +890,39 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) else { /* Clear GPIO status bit 2 */ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, - (u64) (1 << 2)); - - /* - * Packets are available in the port 0 rcv queue. - * Eventually this needs to be generalized to check - * IPATH_GPIO_INTR, and the specific GPIO bit, if - * GPIO interrupts are used for anything else. - */ - ipath_kreceive(dd); + (u64) (1 << 2)); + chk0rcv = 1; } } + chk0rcv |= istat & port0rbits; /* - * clear the ones we will deal with on this round - * We clear it early, mostly for receive interrupts, so we - * know the chip will have seen this by the time we process - * the queue, and will re-interrupt if necessary. 
The processor - * itself won't take the interrupt again until we return. + * Clear the interrupt bits we found set, unless they are receive + * related, in which case we already cleared them above, and don't + * want to clear them again, because we might lose an interrupt. + * Clear it early, so we "know" know the chip will have seen this by + * the time we process the queue, and will re-interrupt if necessary. + * The processor itself won't take the interrupt again until we return. */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + /* + * handle port0 receive before checking for pio buffers available, + * since receives can overflow; piobuf waiters can afford a few + * extra cycles, since they were waiting anyway, and user's waiting + * for receive are at the bottom. + */ + if (chk0rcv) { + ipath_kreceive(dd); + istat &= ~port0rbits; + } + + if (istat & ((infinipath_i_rcvavail_mask << + INFINIPATH_I_RCVAVAIL_SHIFT) + | (infinipath_i_rcvurg_mask << + INFINIPATH_I_RCVURG_SHIFT))) + handle_urcv(dd, istat); + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { clear_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl); ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, @@ -836,17 +934,7 @@ irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs) handle_layer_pioavail(dd); } - /* - * we check for both transition from empty to non-empty, and urgent - * packets (those with the interrupt bit set in the header) - */ - - if (istat & ((infinipath_i_rcvavail_mask << - INFINIPATH_I_RCVAVAIL_SHIFT) - | (infinipath_i_rcvurg_mask << - INFINIPATH_I_RCVURG_SHIFT))) - handle_rcv(dd, istat); - +done: ret = IRQ_HANDLED; bail: diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 5d92d57b6f5..e9f374fb641 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -1,6 +1,7 @@ #ifndef _IPATH_KERNEL_H #define _IPATH_KERNEL_H /* + * Copyright (c) 2006 QLogic, Inc. 
All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -61,9 +62,7 @@ struct ipath_portdata { /* rcvhdrq base, needs mmap before useful */ void *port_rcvhdrq; /* kernel virtual address where hdrqtail is updated */ - u64 *port_rcvhdrtail_kvaddr; - /* page * used for uaddr */ - struct page *port_rcvhdrtail_pagep; + volatile __le64 *port_rcvhdrtail_kvaddr; /* * temp buffer for expected send setup, allocated at open, instead * of each setup call @@ -78,11 +77,7 @@ struct ipath_portdata { dma_addr_t port_rcvegr_phys; /* mmap of hdrq, must fit in 44 bits */ dma_addr_t port_rcvhdrq_phys; - /* - * the actual user address that we ipath_mlock'ed, so we can - * ipath_munlock it at close - */ - unsigned long port_rcvhdrtail_uaddr; + dma_addr_t port_rcvhdrqtailaddr_phys; /* * number of opens on this instance (0 or 1; ignoring forks, dup, * etc. for now) @@ -158,16 +153,10 @@ struct ipath_devdata { /* base of memory alloced for ipath_kregbase, for free */ u64 *ipath_kregalloc; /* - * version of kregbase that doesn't have high bits set (for 32 bit - * programs, so mmap64 44 bit works) - */ - u64 __iomem *ipath_kregvirt; - /* * virtual address where port0 rcvhdrqtail updated for this unit. * only written to by the chip, not the driver. */ volatile __le64 *ipath_hdrqtailptr; - dma_addr_t ipath_dma_addr; /* ipath_cfgports pointers */ struct ipath_portdata **ipath_pd; /* sk_buffs used by port 0 eager receive queue */ @@ -354,13 +343,17 @@ struct ipath_devdata { char *ipath_freezemsg; /* pci access data structure */ struct pci_dev *pcidev; - struct cdev *cdev; - struct class_device *class_dev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct class_device *user_class_dev; + struct class_device *diag_class_dev; /* timer used to prevent stats overflow, error throttling, etc. 
*/ struct timer_list ipath_stats_timer; /* check for stale messages in rcv queue */ /* only allow one intr at a time. */ unsigned long ipath_rcv_pending; + void *ipath_dummy_hdrq; /* used after port close */ + dma_addr_t ipath_dummy_hdrq_phys; /* * Shadow copies of registers; size indicates read access size. @@ -500,8 +493,11 @@ struct ipath_devdata { u16 ipath_lid; /* list of pkeys programmed; 0 if not set */ u16 ipath_pkeys[4]; - /* ASCII serial number, from flash */ - u8 ipath_serial[12]; + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 ipath_serial[16]; /* human readable board version */ u8 ipath_boardversion[80]; /* chip major rev, from ipath_revision */ @@ -516,12 +512,12 @@ struct ipath_devdata { u8 ipath_pci_cacheline; /* LID mask control */ u8 ipath_lmc; -}; - -extern volatile __le64 *ipath_port0_rcvhdrtail; -extern dma_addr_t ipath_port0_rcvhdrtail_dma; -#define IPATH_PORT0_RCVHDRTAIL_SIZE PAGE_SIZE + /* local link integrity counter */ + u32 ipath_lli_counter; + /* local link integrity errors */ + u32 ipath_lli_errors; +}; extern struct list_head ipath_dev_list; extern spinlock_t ipath_devs_lock; @@ -537,7 +533,7 @@ extern int __ipath_verbs_piobufavail(struct ipath_devdata *); extern int __ipath_verbs_rcv(struct ipath_devdata *, void *, void *, u32); void ipath_layer_add(struct ipath_devdata *); -void ipath_layer_del(struct ipath_devdata *); +void ipath_layer_remove(struct ipath_devdata *); int ipath_init_chip(struct ipath_devdata *, int); int ipath_enable_wc(struct ipath_devdata *dd); @@ -551,14 +547,14 @@ int ipath_cdev_init(int minor, char *name, struct file_operations *fops, void ipath_cdev_cleanup(struct cdev **cdevp, struct class_device **class_devp); -int ipath_diag_init(void); -void ipath_diag_cleanup(void); +int ipath_diag_add(struct ipath_devdata *); +void ipath_diag_remove(struct ipath_devdata *); void ipath_diag_bringup_link(struct ipath_devdata *); 
extern wait_queue_head_t ipath_sma_state_wait; int ipath_user_add(struct ipath_devdata *dd); -void ipath_user_del(struct ipath_devdata *dd); +void ipath_user_remove(struct ipath_devdata *dd); struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t); @@ -582,7 +578,7 @@ void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first, unsigned cnt); int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *); -void ipath_free_pddata(struct ipath_devdata *, u32, int); +void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *); int ipath_parse_ushort(const char *str, unsigned short *valp); @@ -720,13 +716,8 @@ u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg, * @port: port number * * Return the contents of a register that is virtualized to be per port. - * Prints a debug message and returns -1 on errors (not distinguishable from - * valid contents at runtime; we may add a separate error variable at some - * point). - * - * This is normally not used by the kernel, but may be for debugging, and - * has a different implementation than user mode, which is why it's not in - * _common.h. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). */ static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, ipath_ureg regno, int port) @@ -842,9 +833,10 @@ extern struct mutex ipath_mutex; #define IPATH_DRV_NAME "ipath_core" #define IPATH_MAJOR 233 +#define IPATH_USER_MINOR_BASE 0 #define IPATH_SMA_MINOR 128 -#define IPATH_DIAG_MINOR 129 -#define IPATH_NMINORS 130 +#define IPATH_DIAG_MINOR_BASE 129 +#define IPATH_NMINORS 255 #define ipath_dev_err(dd,fmt,...) 
\ do { \ diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c index 5ae8761f9dd..46773c673a1 100644 --- a/drivers/infiniband/hw/ipath/ipath_keys.c +++ b/drivers/infiniband/hw/ipath/ipath_keys.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -120,6 +121,7 @@ int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, struct ib_sge *sge, int acc) { struct ipath_mregion *mr; + unsigned n, m; size_t off; int ret; @@ -151,20 +153,22 @@ int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge, } off += mr->offset; - isge->mr = mr; - isge->m = 0; - isge->n = 0; - while (off >= mr->map[isge->m]->segs[isge->n].length) { - off -= mr->map[isge->m]->segs[isge->n].length; - isge->n++; - if (isge->n >= IPATH_SEGSZ) { - isge->m++; - isge->n = 0; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; } } - isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off; - isge->length = mr->map[isge->m]->segs[isge->n].length - off; + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; isge->sge_length = sge->length; + isge->m = m; + isge->n = n; ret = 1; @@ -189,6 +193,7 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, struct ipath_lkey_table *rkt = &dev->lk_table; struct ipath_sge *sge = &ss->sge; struct ipath_mregion *mr; + unsigned n, m; size_t off; int ret; @@ -206,20 +211,22 @@ int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, } off += mr->offset; - sge->mr = mr; - sge->m = 0; - sge->n = 0; - while (off >= mr->map[sge->m]->segs[sge->n].length) { - off -= mr->map[sge->m]->segs[sge->n].length; - sge->n++; - if (sge->n >= IPATH_SEGSZ) { - sge->m++; - 
sge->n = 0; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; } } - sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off; - sge->length = mr->map[sge->m]->segs[sge->n].length - off; + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; sge->sge_length = len; + sge->m = m; + sge->n = n; ss->sg_list = NULL; ss->num_sge = 1; diff --git a/drivers/infiniband/hw/ipath/ipath_layer.c b/drivers/infiniband/hw/ipath/ipath_layer.c index 9ec4ac77b87..b28c6f81c73 100644 --- a/drivers/infiniband/hw/ipath/ipath_layer.c +++ b/drivers/infiniband/hw/ipath/ipath_layer.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -40,8 +41,8 @@ #include <asm/byteorder.h> #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" /* Acquire before ipath_devs_lock. 
*/ static DEFINE_MUTEX(ipath_layer_mutex); @@ -299,9 +300,8 @@ bail: EXPORT_SYMBOL_GPL(ipath_layer_set_mtu); -int ipath_set_sps_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) +int ipath_set_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) { - ipath_stats.sps_lid[dd->ipath_unit] = arg; dd->ipath_lid = arg; dd->ipath_lmc = lmc; @@ -315,7 +315,7 @@ int ipath_set_sps_lid(struct ipath_devdata *dd, u32 arg, u8 lmc) return 0; } -EXPORT_SYMBOL_GPL(ipath_set_sps_lid); +EXPORT_SYMBOL_GPL(ipath_set_lid); int ipath_layer_set_guid(struct ipath_devdata *dd, __be64 guid) { @@ -340,18 +340,26 @@ u32 ipath_layer_get_nguid(struct ipath_devdata *dd) EXPORT_SYMBOL_GPL(ipath_layer_get_nguid); -int ipath_layer_query_device(struct ipath_devdata *dd, u32 * vendor, - u32 * boardrev, u32 * majrev, u32 * minrev) +u32 ipath_layer_get_majrev(struct ipath_devdata *dd) { - *vendor = dd->ipath_vendorid; - *boardrev = dd->ipath_boardrev; - *majrev = dd->ipath_majrev; - *minrev = dd->ipath_minrev; + return dd->ipath_majrev; +} - return 0; +EXPORT_SYMBOL_GPL(ipath_layer_get_majrev); + +u32 ipath_layer_get_minrev(struct ipath_devdata *dd) +{ + return dd->ipath_minrev; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_minrev); + +u32 ipath_layer_get_pcirev(struct ipath_devdata *dd) +{ + return dd->ipath_pcirev; } -EXPORT_SYMBOL_GPL(ipath_layer_query_device); +EXPORT_SYMBOL_GPL(ipath_layer_get_pcirev); u32 ipath_layer_get_flags(struct ipath_devdata *dd) { @@ -374,6 +382,13 @@ u16 ipath_layer_get_deviceid(struct ipath_devdata *dd) EXPORT_SYMBOL_GPL(ipath_layer_get_deviceid); +u32 ipath_layer_get_vendorid(struct ipath_devdata *dd) +{ + return dd->ipath_vendorid; +} + +EXPORT_SYMBOL_GPL(ipath_layer_get_vendorid); + u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd) { return dd->ipath_lastibcstat; @@ -403,7 +418,7 @@ void ipath_layer_add(struct ipath_devdata *dd) mutex_unlock(&ipath_layer_mutex); } -void ipath_layer_del(struct ipath_devdata *dd) +void ipath_layer_remove(struct ipath_devdata *dd) { 
mutex_lock(&ipath_layer_mutex); @@ -607,7 +622,7 @@ int ipath_layer_open(struct ipath_devdata *dd, u32 * pktmax) goto bail; } - ret = ipath_setrcvhdrsize(dd, NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE); + ret = ipath_setrcvhdrsize(dd, IPATH_HEADER_QUEUE_WORDS); if (ret < 0) goto bail; @@ -616,9 +631,9 @@ int ipath_layer_open(struct ipath_devdata *dd, u32 * pktmax) if (*dd->ipath_statusp & IPATH_STATUS_IB_READY) intval |= IPATH_LAYER_INT_IF_UP; - if (ipath_stats.sps_lid[dd->ipath_unit]) + if (dd->ipath_lid) intval |= IPATH_LAYER_INT_LID; - if (ipath_stats.sps_mlid[dd->ipath_unit]) + if (dd->ipath_mlid) intval |= IPATH_LAYER_INT_BCAST; /* * do this on open, in case low level is already up and @@ -884,7 +899,7 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, /** * ipath_verbs_send - send a packet from the verbs layer * @dd: the infinipath device - * @hdrwords: the number of works in the header + * @hdrwords: the number of words in the header * @hdr: the packet header * @len: the length of the packet in bytes * @ss: the SGE to send @@ -1016,19 +1031,22 @@ int ipath_layer_get_counters(struct ipath_devdata *dd, ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt); cntrs->link_error_recovery_counter = ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. 
+ */ cntrs->link_downed_counter = ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt); cntrs->port_rcv_errors = ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errrcvflowctrlcnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) + - ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlinkcnt) + ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt); cntrs->port_rcv_remphys_errors = ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt); @@ -1042,6 +1060,8 @@ int ipath_layer_get_counters(struct ipath_devdata *dd, ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); cntrs->port_rcv_packets = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + cntrs->local_link_integrity_errors = dd->ipath_lli_errors; + cntrs->excessive_buffer_overrun_errors = 0; /* XXX */ ret = 0; @@ -1086,10 +1106,10 @@ int ipath_layer_send_hdr(struct ipath_devdata *dd, struct ether_header *hdr) } vlsllnh = *((__be16 *) hdr); - if (vlsllnh != htons(IPS_LRH_BTH)) { + if (vlsllnh != htons(IPATH_LRH_BTH)) { ipath_dbg("Warning: lrh[0] wrong (%x, not %x); " "not sending\n", be16_to_cpu(vlsllnh), - IPS_LRH_BTH); + IPATH_LRH_BTH); ret = -EINVAL; } if (ret) diff --git a/drivers/infiniband/hw/ipath/ipath_layer.h b/drivers/infiniband/hw/ipath/ipath_layer.h index 6fefd15bd2d..71485096fca 100644 --- a/drivers/infiniband/hw/ipath/ipath_layer.h +++ b/drivers/infiniband/hw/ipath/ipath_layer.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -54,6 +55,8 @@ struct ipath_layer_counters { u64 port_rcv_data; u64 port_xmit_packets; u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; }; /* @@ -126,7 +129,7 @@ u16 ipath_layer_get_bcast(struct ipath_devdata *dd); u32 ipath_layer_get_cr_errpkey(struct ipath_devdata *dd); int ipath_layer_set_linkstate(struct ipath_devdata *dd, u8 state); int ipath_layer_set_mtu(struct ipath_devdata *, u16); -int ipath_set_sps_lid(struct ipath_devdata *, u32, u8); +int ipath_set_lid(struct ipath_devdata *, u32, u8); int ipath_layer_send_hdr(struct ipath_devdata *dd, struct ether_header *hdr); int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords, @@ -143,11 +146,13 @@ int ipath_layer_want_buffer(struct ipath_devdata *dd); int ipath_layer_set_guid(struct ipath_devdata *, __be64 guid); __be64 ipath_layer_get_guid(struct ipath_devdata *); u32 ipath_layer_get_nguid(struct ipath_devdata *); -int ipath_layer_query_device(struct ipath_devdata *, u32 * vendor, - u32 * boardrev, u32 * majrev, u32 * minrev); +u32 ipath_layer_get_majrev(struct ipath_devdata *); +u32 ipath_layer_get_minrev(struct ipath_devdata *); +u32 ipath_layer_get_pcirev(struct ipath_devdata *); u32 ipath_layer_get_flags(struct ipath_devdata *dd); struct device *ipath_layer_get_device(struct ipath_devdata *dd); u16 ipath_layer_get_deviceid(struct ipath_devdata *dd); +u32 ipath_layer_get_vendorid(struct ipath_devdata *); u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd); u32 ipath_layer_get_ibmtu(struct ipath_devdata *dd); int ipath_layer_enable_timer(struct ipath_devdata *dd); diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 1a9d0a2c33c..d3402341b7d 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. 
* Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -34,7 +35,7 @@ #include "ipath_kernel.h" #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" #define IB_SMP_UNSUP_VERSION __constant_htons(0x0004) #define IB_SMP_UNSUP_METHOD __constant_htons(0x0008) @@ -84,7 +85,7 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp, { struct nodeinfo *nip = (struct nodeinfo *)&smp->data; struct ipath_devdata *dd = to_idev(ibdev)->dd; - u32 vendor, boardid, majrev, minrev; + u32 vendor, majrev, minrev; if (smp->attr_mod) smp->status |= IB_SMP_INVALID_FIELD; @@ -104,9 +105,11 @@ static int recv_subn_get_nodeinfo(struct ib_smp *smp, nip->port_guid = nip->sys_guid; nip->partition_cap = cpu_to_be16(ipath_layer_get_npkeys(dd)); nip->device_id = cpu_to_be16(ipath_layer_get_deviceid(dd)); - ipath_layer_query_device(dd, &vendor, &boardid, &majrev, &minrev); + majrev = ipath_layer_get_majrev(dd); + minrev = ipath_layer_get_minrev(dd); nip->revision = cpu_to_be32((majrev << 16) | minrev); nip->local_port_num = port; + vendor = ipath_layer_get_vendorid(dd); nip->vendor_id[0] = 0; nip->vendor_id[1] = vendor >> 8; nip->vendor_id[2] = vendor; @@ -215,7 +218,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp, /* P_KeyViolations are counted by hardware. */ pip->pkey_violations = cpu_to_be16((ipath_layer_get_cr_errpkey(dev->dd) - - dev->n_pkey_violations) & 0xFFFF); + dev->z_pkey_violations) & 0xFFFF); pip->qkey_violations = cpu_to_be16(dev->qkey_violations); /* Only the hardware GUID is supported for now */ pip->guid_cap = 1; @@ -303,9 +306,9 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, lid = be16_to_cpu(pip->lid); if (lid != ipath_layer_get_lid(dev->dd)) { /* Must be a valid unicast LID address. 
*/ - if (lid == 0 || lid >= IPS_MULTICAST_LID_BASE) + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) goto err; - ipath_set_sps_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7); + ipath_set_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); } @@ -313,7 +316,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, smlid = be16_to_cpu(pip->sm_lid); if (smlid != dev->sm_lid) { /* Must be a valid unicast LID address. */ - if (smlid == 0 || smlid >= IPS_MULTICAST_LID_BASE) + if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE) goto err; dev->sm_lid = smlid; event.event = IB_EVENT_SM_CHANGE; @@ -389,7 +392,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp, * later. */ if (pip->pkey_violations == 0) - dev->n_pkey_violations = + dev->z_pkey_violations = ipath_layer_get_cr_errpkey(dev->dd); if (pip->qkey_violations == 0) @@ -610,6 +613,9 @@ struct ib_pma_portcounters { #define IB_PMA_SEL_PORT_RCV_ERRORS __constant_htons(0x0008) #define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS __constant_htons(0x0010) #define IB_PMA_SEL_PORT_XMIT_DISCARDS __constant_htons(0x0040) +#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS __constant_htons(0x0200) +#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS __constant_htons(0x0400) +#define IB_PMA_SEL_PORT_VL15_DROPPED __constant_htons(0x0800) #define IB_PMA_SEL_PORT_XMIT_DATA __constant_htons(0x1000) #define IB_PMA_SEL_PORT_RCV_DATA __constant_htons(0x2000) #define IB_PMA_SEL_PORT_XMIT_PACKETS __constant_htons(0x4000) @@ -844,18 +850,22 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, ipath_layer_get_counters(dev->dd, &cntrs); /* Adjust counters for any resets done. 
*/ - cntrs.symbol_error_counter -= dev->n_symbol_error_counter; + cntrs.symbol_error_counter -= dev->z_symbol_error_counter; cntrs.link_error_recovery_counter -= - dev->n_link_error_recovery_counter; - cntrs.link_downed_counter -= dev->n_link_downed_counter; + dev->z_link_error_recovery_counter; + cntrs.link_downed_counter -= dev->z_link_downed_counter; cntrs.port_rcv_errors += dev->rcv_errors; - cntrs.port_rcv_errors -= dev->n_port_rcv_errors; - cntrs.port_rcv_remphys_errors -= dev->n_port_rcv_remphys_errors; - cntrs.port_xmit_discards -= dev->n_port_xmit_discards; - cntrs.port_xmit_data -= dev->n_port_xmit_data; - cntrs.port_rcv_data -= dev->n_port_rcv_data; - cntrs.port_xmit_packets -= dev->n_port_xmit_packets; - cntrs.port_rcv_packets -= dev->n_port_rcv_packets; + cntrs.port_rcv_errors -= dev->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= dev->z_port_xmit_discards; + cntrs.port_xmit_data -= dev->z_port_xmit_data; + cntrs.port_rcv_data -= dev->z_port_rcv_data; + cntrs.port_xmit_packets -= dev->z_port_xmit_packets; + cntrs.port_rcv_packets -= dev->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + dev->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + dev->z_excessive_buffer_overrun_errors; memset(pmp->data, 0, sizeof(pmp->data)); @@ -893,6 +903,16 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp, else p->port_xmit_discards = cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->lli_ebor_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (dev->n_vl15_dropped > 0xFFFFUL) + p->vl15_dropped = __constant_cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)dev->n_vl15_dropped); if (cntrs.port_xmit_data > 
0xFFFFFFFFUL) p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF); else @@ -928,10 +948,10 @@ static int recv_pma_get_portcounters_ext(struct ib_perf *pmp, &rpkts, &xwait); /* Adjust counters for any resets done. */ - swords -= dev->n_port_xmit_data; - rwords -= dev->n_port_rcv_data; - spkts -= dev->n_port_xmit_packets; - rpkts -= dev->n_port_rcv_packets; + swords -= dev->z_port_xmit_data; + rwords -= dev->z_port_rcv_data; + spkts -= dev->z_port_xmit_packets; + rpkts -= dev->z_port_rcv_packets; memset(pmp->data, 0, sizeof(pmp->data)); @@ -967,37 +987,48 @@ static int recv_pma_set_portcounters(struct ib_perf *pmp, ipath_layer_get_counters(dev->dd, &cntrs); if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) - dev->n_symbol_error_counter = cntrs.symbol_error_counter; + dev->z_symbol_error_counter = cntrs.symbol_error_counter; if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) - dev->n_link_error_recovery_counter = + dev->z_link_error_recovery_counter = cntrs.link_error_recovery_counter; if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) - dev->n_link_downed_counter = cntrs.link_downed_counter; + dev->z_link_downed_counter = cntrs.link_downed_counter; if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) - dev->n_port_rcv_errors = + dev->z_port_rcv_errors = cntrs.port_rcv_errors + dev->rcv_errors; if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) - dev->n_port_rcv_remphys_errors = + dev->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors; if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) - dev->n_port_xmit_discards = cntrs.port_xmit_discards; + dev->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + dev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + dev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) + 
dev->n_vl15_dropped = 0; if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) - dev->n_port_xmit_data = cntrs.port_xmit_data; + dev->z_port_xmit_data = cntrs.port_xmit_data; if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) - dev->n_port_rcv_data = cntrs.port_rcv_data; + dev->z_port_rcv_data = cntrs.port_rcv_data; if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) - dev->n_port_xmit_packets = cntrs.port_xmit_packets; + dev->z_port_xmit_packets = cntrs.port_xmit_packets; if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) - dev->n_port_rcv_packets = cntrs.port_rcv_packets; + dev->z_port_rcv_packets = cntrs.port_rcv_packets; return recv_pma_get_portcounters(pmp, ibdev, port); } @@ -1014,16 +1045,16 @@ static int recv_pma_set_portcounters_ext(struct ib_perf *pmp, &rpkts, &xwait); if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) - dev->n_port_xmit_data = swords; + dev->z_port_xmit_data = swords; if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) - dev->n_port_rcv_data = rwords; + dev->z_port_rcv_data = rwords; if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) - dev->n_port_xmit_packets = spkts; + dev->z_port_xmit_packets = spkts; if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) - dev->n_port_rcv_packets = rpkts; + dev->z_port_rcv_packets = rpkts; if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) dev->n_unicast_xmit = 0; @@ -1272,32 +1303,8 @@ int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad) { - struct ipath_ibdev *dev = to_idev(ibdev); int ret; - /* - * Snapshot current HW counters to "clear" them. - * This should be done when the driver is loaded except that for - * some reason we get a zillion errors when brining up the link. 
- */ - if (dev->rcv_errors == 0) { - struct ipath_layer_counters cntrs; - - ipath_layer_get_counters(to_idev(ibdev)->dd, &cntrs); - dev->rcv_errors++; - dev->n_symbol_error_counter = cntrs.symbol_error_counter; - dev->n_link_error_recovery_counter = - cntrs.link_error_recovery_counter; - dev->n_link_downed_counter = cntrs.link_downed_counter; - dev->n_port_rcv_errors = cntrs.port_rcv_errors + 1; - dev->n_port_rcv_remphys_errors = - cntrs.port_rcv_remphys_errors; - dev->n_port_xmit_discards = cntrs.port_xmit_discards; - dev->n_port_xmit_data = cntrs.port_xmit_data; - dev->n_port_rcv_data = cntrs.port_rcv_data; - dev->n_port_xmit_packets = cntrs.port_xmit_packets; - dev->n_port_rcv_packets = cntrs.port_rcv_packets; - } switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: case IB_MGMT_CLASS_SUBN_LID_ROUTED: diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 69ffec66d45..4ac31a5da33 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -169,6 +170,11 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region, int n, m, i; struct ib_mr *ret; + if (region->length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + n = 0; list_for_each_entry(chunk, ®ion->chunk_list, list) n += chunk->nents; diff --git a/drivers/infiniband/hw/ipath/ipath_pe800.c b/drivers/infiniband/hw/ipath/ipath_pe800.c index 02e8c75b24f..b83f66d8262 100644 --- a/drivers/infiniband/hw/ipath/ipath_pe800.c +++ b/drivers/infiniband/hw/ipath/ipath_pe800.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -44,7 +45,7 @@ /* * This file contains all the chip-specific register information and - * access functions for the PathScale PE800, the PCI-Express chip. + * access functions for the QLogic InfiniPath PE800, the PCI-Express chip. * * This lists the InfiniPath PE800 registers, in the actual chip layout. * This structure should never be directly accessed. @@ -532,7 +533,7 @@ static int ipath_pe_boardname(struct ipath_devdata *dd, char *name, if (n) snprintf(name, namelen, "%s", n); - if (dd->ipath_majrev != 4 || dd->ipath_minrev != 1) { + if (dd->ipath_majrev != 4 || !dd->ipath_minrev || dd->ipath_minrev>2) { ipath_dev_err(dd, "Unsupported PE-800 revision %u.%u!\n", dd->ipath_majrev, dd->ipath_minrev); ret = 1; diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 9f8855d970c..83e557be591 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -34,7 +35,7 @@ #include <linux/vmalloc.h> #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" #define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) @@ -332,10 +333,11 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->remote_qpn = 0; qp->qkey = 0; qp->qp_access_flags = 0; + clear_bit(IPATH_S_BUSY, &qp->s_flags); qp->s_hdrwords = 0; qp->s_psn = 0; qp->r_psn = 0; - atomic_set(&qp->msn, 0); + qp->r_msn = 0; if (qp->ibqp.qp_type == IB_QPT_RC) { qp->s_state = IB_OPCODE_RC_SEND_LAST; qp->r_state = IB_OPCODE_RC_SEND_LAST; @@ -344,7 +346,8 @@ static void ipath_reset_qp(struct ipath_qp *qp) qp->r_state = IB_OPCODE_UC_SEND_LAST; } qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; - qp->s_nak_state = 0; + qp->r_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; qp->s_rnr_timeout = 0; qp->s_head = 0; qp->s_tail = 0; @@ -362,10 +365,10 @@ static void ipath_reset_qp(struct ipath_qp *qp) * @qp: the QP to put into an error state * * Flushes both send and receive work queues. - * QP r_rq.lock and s_lock should be held. + * QP s_lock should be held and interrupts disabled. 
*/ -static void ipath_error_qp(struct ipath_qp *qp) +void ipath_error_qp(struct ipath_qp *qp) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ib_wc wc; @@ -408,12 +411,14 @@ static void ipath_error_qp(struct ipath_qp *qp) qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; wc.opcode = IB_WC_RECV; + spin_lock(&qp->r_rq.lock); while (qp->r_rq.tail != qp->r_rq.head) { wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id; if (++qp->r_rq.tail >= qp->r_rq.size) qp->r_rq.tail = 0; ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); } + spin_unlock(&qp->r_rq.lock); } /** @@ -433,8 +438,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, unsigned long flags; int ret; - spin_lock_irqsave(&qp->r_rq.lock, flags); - spin_lock(&qp->s_lock); + spin_lock_irqsave(&qp->s_lock, flags); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; @@ -446,7 +450,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_AV) if (attr->ah_attr.dlid == 0 || - attr->ah_attr.dlid >= IPS_MULTICAST_LID_BASE) + attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE) goto inval; if (attr_mask & IB_QP_PKEY_INDEX) @@ -505,34 +509,19 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } if (attr_mask & IB_QP_MIN_RNR_TIMER) - qp->s_min_rnr_timer = attr->min_rnr_timer; + qp->r_min_rnr_timer = attr->min_rnr_timer; if (attr_mask & IB_QP_QKEY) qp->qkey = attr->qkey; - if (attr_mask & IB_QP_PKEY_INDEX) - qp->s_pkey_index = attr->pkey_index; - qp->state = new_state; - spin_unlock(&qp->s_lock); - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - - /* - * If QP1 changed to the RTS state, try to move to the link to INIT - * even if it was ACTIVE so the SM will reinitialize the SMA's - * state. 
- */ - if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) { - struct ipath_ibdev *dev = to_idev(ibqp->device); + spin_unlock_irqrestore(&qp->s_lock, flags); - ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); - } ret = 0; goto bail; inval: - spin_unlock(&qp->s_lock); - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EINVAL; bail: @@ -566,7 +555,7 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->sq_draining = 0; attr->max_rd_atomic = 1; attr->max_dest_rd_atomic = 1; - attr->min_rnr_timer = qp->s_min_rnr_timer; + attr->min_rnr_timer = qp->r_min_rnr_timer; attr->port_num = 1; attr->timeout = 0; attr->retry_cnt = qp->s_retry_cnt; @@ -593,21 +582,17 @@ int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * @qp: the queue pair to compute the AETH for * * Returns the AETH. - * - * The QP s_lock should be held. */ __be32 ipath_compute_aeth(struct ipath_qp *qp) { - u32 aeth = atomic_read(&qp->msn) & IPS_MSN_MASK; + u32 aeth = qp->r_msn & IPATH_MSN_MASK; - if (qp->s_nak_state) { - aeth |= qp->s_nak_state << IPS_AETH_CREDIT_SHIFT; - } else if (qp->ibqp.srq) { + if (qp->ibqp.srq) { /* * Shared receive queues don't generate credits. * Set the credit field to the invalid value. 
*/ - aeth |= IPS_AETH_CREDIT_INVAL << IPS_AETH_CREDIT_SHIFT; + aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT; } else { u32 min, max, x; u32 credits; @@ -637,7 +622,7 @@ __be32 ipath_compute_aeth(struct ipath_qp *qp) else min = x; } - aeth |= x << IPS_AETH_CREDIT_SHIFT; + aeth |= x << IPATH_AETH_CREDIT_SHIFT; } return cpu_to_be32(aeth); } @@ -663,12 +648,22 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, size_t sz; struct ib_qp *ret; - if (init_attr->cap.max_send_sge > 255 || - init_attr->cap.max_recv_sge > 255) { + if (init_attr->cap.max_send_sge > ib_ipath_max_sges || + init_attr->cap.max_recv_sge > ib_ipath_max_sges || + init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs || + init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { ret = ERR_PTR(-ENOMEM); goto bail; } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_recv_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + switch (init_attr->qp_type) { case IB_QPT_UC: case IB_QPT_RC: @@ -686,18 +681,26 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, case IB_QPT_GSI: qp = kmalloc(sizeof(*qp), GFP_KERNEL); if (!qp) { + vfree(swq); ret = ERR_PTR(-ENOMEM); goto bail; } - qp->r_rq.size = init_attr->cap.max_recv_wr + 1; - sz = sizeof(struct ipath_sge) * - init_attr->cap.max_recv_sge + - sizeof(struct ipath_rwqe); - qp->r_rq.wq = vmalloc(qp->r_rq.size * sz); - if (!qp->r_rq.wq) { - kfree(qp); - ret = ERR_PTR(-ENOMEM); - goto bail; + if (init_attr->srq) { + qp->r_rq.size = 0; + qp->r_rq.max_sge = 0; + qp->r_rq.wq = NULL; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ipath_sge) * qp->r_rq.max_sge) + + sizeof(struct ipath_rwqe); + qp->r_rq.wq = vmalloc(qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + kfree(qp); + vfree(swq); + ret = ERR_PTR(-ENOMEM); + goto bail; + } } /* @@ -708,9 +711,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, 
spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); init_waitqueue_head(&qp->wait); - tasklet_init(&qp->s_task, - init_attr->qp_type == IB_QPT_RC ? - ipath_do_rc_send : ipath_do_uc_send, + tasklet_init(&qp->s_task, ipath_do_ruc_send, (unsigned long)qp); INIT_LIST_HEAD(&qp->piowait); INIT_LIST_HEAD(&qp->timerwait); @@ -718,7 +719,6 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, qp->s_wq = swq; qp->s_size = init_attr->cap.max_send_wr + 1; qp->s_max_sge = init_attr->cap.max_send_sge; - qp->r_rq.max_sge = init_attr->cap.max_recv_sge; qp->s_flags = init_attr->sq_sig_type == IB_SIGNAL_REQ_WR ? 1 << IPATH_S_SIGNAL_REQ_WR : 0; dev = to_idev(ibpd->device); @@ -888,18 +888,18 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc) */ void ipath_get_credit(struct ipath_qp *qp, u32 aeth) { - u32 credit = (aeth >> IPS_AETH_CREDIT_SHIFT) & IPS_AETH_CREDIT_MASK; + u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK; /* * If the credit is invalid, we can send * as many packets as we like. Otherwise, we have to * honor the credit field. */ - if (credit == IPS_AETH_CREDIT_INVAL) { + if (credit == IPATH_AETH_CREDIT_INVAL) qp->s_lsn = (u32) -1; - } else if (qp->s_lsn != (u32) -1) { + else if (qp->s_lsn != (u32) -1) { /* Compute new LSN (i.e., MSN + credit) */ - credit = (aeth + credit_table[credit]) & IPS_MSN_MASK; + credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK; if (ipath_cmp24(credit, qp->s_lsn) > 0) qp->s_lsn = credit; } diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 493b1821a93..774d1615ce2 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -31,7 +32,7 @@ */ #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_RC_##x @@ -41,14 +42,14 @@ * @qp: the QP who's SGE we're restarting * @wqe: the work queue to initialize the QP's SGE from * - * The QP s_lock should be held. + * The QP s_lock should be held and interrupts disabled. */ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) { struct ipath_ibdev *dev; u32 len; - len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * + len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * ib_mtu_enum_to_int(qp->path_mtu); qp->s_sge.sge = wqe->sg_list[0]; qp->s_sge.sg_list = wqe->sg_list + 1; @@ -72,11 +73,10 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) * Return bth0 if constructed; otherwise, return 0. * Note the QP s_lock must be held. */ -static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu) +u32 ipath_make_rc_ack(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu) { - struct ipath_sge_state *ss; u32 hwords; u32 len; u32 bth0; @@ -90,13 +90,12 @@ static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, */ switch (qp->s_ack_state) { case OP(RDMA_READ_REQUEST): - ss = &qp->s_rdma_sge; + qp->s_cur_sge = &qp->s_rdma_sge; len = qp->s_rdma_len; if (len > pmtu) { len = pmtu; qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); - } - else + } else qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); qp->s_rdma_len -= len; bth0 = qp->s_ack_state << 24; @@ -108,7 +107,7 @@ static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); /* FALLTHROUGH */ case OP(RDMA_READ_RESPONSE_MIDDLE): - ss = &qp->s_rdma_sge; + qp->s_cur_sge = &qp->s_rdma_sge; len = qp->s_rdma_len; if (len > pmtu) len = pmtu; @@ -127,41 +126,50 @@ static inline u32 ipath_make_rc_ack(struct ipath_qp 
*qp, * We have to prevent new requests from changing * the r_sge state while a ipath_verbs_send() * is in progress. - * Changing r_state allows the receiver - * to continue processing new packets. - * We do it here now instead of above so - * that we are sure the packet was sent before - * changing the state. */ - qp->r_state = OP(RDMA_READ_RESPONSE_LAST); qp->s_ack_state = OP(ACKNOWLEDGE); - return 0; + bth0 = 0; + goto bail; case OP(COMPARE_SWAP): case OP(FETCH_ADD): - ss = NULL; + qp->s_cur_sge = NULL; len = 0; - qp->r_state = OP(SEND_LAST); - qp->s_ack_state = OP(ACKNOWLEDGE); - bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; + /* + * Set the s_ack_state so the receive interrupt handler + * won't try to send an ACK (out of order) until this one + * is actually sent. + */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + bth0 = OP(ATOMIC_ACKNOWLEDGE) << 24; ohdr->u.at.aeth = ipath_compute_aeth(qp); - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); hwords += sizeof(ohdr->u.at) / 4; break; default: /* Send a regular ACK. */ - ss = NULL; + qp->s_cur_sge = NULL; len = 0; - qp->s_ack_state = OP(ACKNOWLEDGE); - bth0 = qp->s_ack_state << 24; - ohdr->u.aeth = ipath_compute_aeth(qp); + /* + * Set the s_ack_state so the receive interrupt handler + * won't try to send an ACK (out of order) until this one + * is actually sent. + */ + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + bth0 = OP(ACKNOWLEDGE) << 24; + if (qp->s_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); hwords++; } qp->s_hdrwords = hwords; - qp->s_cur_sge = ss; qp->s_cur_size = len; +bail: return bth0; } @@ -174,11 +182,11 @@ static inline u32 ipath_make_rc_ack(struct ipath_qp *qp, * @bth2p: pointer to the BTH PSN word * * Return 1 if constructed; otherwise, return 0. - * Note the QP s_lock must be held. 
+ * Note the QP s_lock must be held and interrupts disabled. */ -static inline int ipath_make_rc_req(struct ipath_qp *qp, - struct ipath_other_headers *ohdr, - u32 pmtu, u32 *bth0p, u32 *bth2p) +int ipath_make_rc_req(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p) { struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_sge_state *ss; @@ -257,7 +265,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, break; case IB_WR_RDMA_WRITE: - if (newreq) + if (newreq && qp->s_lsn != (u32) -1) qp->s_lsn++; /* FALLTHROUGH */ case IB_WR_RDMA_WRITE_WITH_IMM: @@ -283,8 +291,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, else { qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); - /* Immediate data comes - * after RETH */ + /* Immediate data comes after RETH */ ohdr->u.rc.imm_data = wqe->wr.imm_data; hwords += 1; if (wqe->wr.send_flags & IB_SEND_SOLICITED) @@ -304,7 +311,8 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, qp->s_state = OP(RDMA_READ_REQUEST); hwords += sizeof(ohdr->u.rc.reth) / 4; if (newreq) { - qp->s_lsn++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; /* * Adjust s_next_psn to count the * expected number of responses. @@ -335,7 +343,8 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, wqe->wr.wr.atomic.compare_add); hwords += sizeof(struct ib_atomic_eth) / 4; if (newreq) { - qp->s_lsn++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; wqe->lpsn = wqe->psn; } if (++qp->s_cur == qp->s_size) @@ -352,9 +361,14 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - bth2 |= qp->s_psn++ & IPS_PSN_MASK; + bth2 |= qp->s_psn++ & IPATH_PSN_MASK; if ((int)(qp->s_psn - qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; + /* + * Put the QP on the pending list so lost ACKs will cause + * a retry. More than one request can be pending so the + * QP may already be on the dev->pending list. 
+ */ spin_lock(&dev->pending_lock); if (list_empty(&qp->timerwait)) list_add_tail(&qp->timerwait, @@ -364,8 +378,8 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, case OP(RDMA_READ_RESPONSE_FIRST): /* - * This case can only happen if a send is restarted. See - * ipath_restart_rc(). + * This case can only happen if a send is restarted. + * See ipath_restart_rc(). */ ipath_init_restart(qp, wqe); /* FALLTHROUGH */ @@ -373,7 +387,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, qp->s_state = OP(SEND_MIDDLE); /* FALLTHROUGH */ case OP(SEND_MIDDLE): - bth2 = qp->s_psn++ & IPS_PSN_MASK; + bth2 = qp->s_psn++ & IPATH_PSN_MASK; if ((int)(qp->s_psn - qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; @@ -415,7 +429,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, qp->s_state = OP(RDMA_WRITE_MIDDLE); /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): - bth2 = qp->s_psn++ & IPS_PSN_MASK; + bth2 = qp->s_psn++ & IPATH_PSN_MASK; if ((int)(qp->s_psn - qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = &qp->s_sge; @@ -452,7 +466,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, * See ipath_restart_rc(). 
*/ ipath_init_restart(qp, wqe); - len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu; + len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; ohdr->u.rc.reth.vaddr = cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); ohdr->u.rc.reth.rkey = @@ -460,7 +474,7 @@ static inline int ipath_make_rc_req(struct ipath_qp *qp, ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); qp->s_state = OP(RDMA_READ_REQUEST); hwords += sizeof(ohdr->u.rc.reth) / 4; - bth2 = qp->s_psn++ & IPS_PSN_MASK; + bth2 = qp->s_psn++ & IPATH_PSN_MASK; if ((int)(qp->s_psn - qp->s_next_psn) > 0) qp->s_next_psn = qp->s_psn; ss = NULL; @@ -496,189 +510,169 @@ done: return 0; } -static inline void ipath_make_rc_grh(struct ipath_qp *qp, - struct ib_global_route *grh, - u32 nwords) -{ - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - - /* GRH header size in 32-bit words. */ - qp->s_hdrwords += 10; - qp->s_hdr.u.l.grh.version_tclass_flow = - cpu_to_be32((6 << 28) | - (grh->traffic_class << 20) | - grh->flow_label); - qp->s_hdr.u.l.grh.paylen = - cpu_to_be16(((qp->s_hdrwords - 12) + nwords + - SIZE_OF_CRC) << 2); - /* next_hdr is defined by C8-7 in ch. 8.4.1 */ - qp->s_hdr.u.l.grh.next_hdr = 0x1B; - qp->s_hdr.u.l.grh.hop_limit = grh->hop_limit; - /* The SGID is 32-bit aligned. */ - qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix; - qp->s_hdr.u.l.grh.sgid.global.interface_id = - ipath_layer_get_guid(dev->dd); - qp->s_hdr.u.l.grh.dgid = grh->dgid; -} - /** - * ipath_do_rc_send - perform a send on an RC QP - * @data: contains a pointer to the QP + * send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP * - * Process entries in the send work queue until credit or queue is - * exhausted. Only allow one CPU to send a packet per QP (tasklet). - * Otherwise, after we drop the QP s_lock, two threads could send - * packets out of order. + * This is called from ipath_rc_rcv() and only uses the receive + * side QP state. 
+ * Note that RDMA reads are handled in the send side QP state and tasklet. */ -void ipath_do_rc_send(unsigned long data) +static void send_rc_ack(struct ipath_qp *qp) { - struct ipath_qp *qp = (struct ipath_qp *)data; struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - unsigned long flags; u16 lrh0; - u32 nwords; - u32 extra_bytes; u32 bth0; - u32 bth2; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + u32 hwords; + struct ipath_ib_header hdr; struct ipath_other_headers *ohdr; - if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) - goto bail; - - if (unlikely(qp->remote_ah_attr.dlid == - ipath_layer_get_lid(dev->dd))) { - struct ib_wc wc; - - /* - * Pass in an uninitialized ib_wc to be consistent with - * other places where ipath_ruc_loopback() is called. - */ - ipath_ruc_loopback(qp, &wc); - goto clear; - } - - ohdr = &qp->s_hdr.u.oth; - if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr.u.l.oth; - -again: - /* Check for a constructed packet to be sent. */ - if (qp->s_hdrwords != 0) { - /* - * If no PIO bufs are available, return. An interrupt will - * call ipath_ib_piobufavail() when one is available. - */ - _VERBS_INFO("h %u %p\n", qp->s_hdrwords, &qp->s_hdr); - _VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp->s_cur_size, - qp->s_cur_sge->sg_list, - qp->s_cur_sge->num_sge, - qp->s_cur_sge->sge.vaddr, - qp->s_cur_sge->sge.sge_length, - qp->s_cur_sge->sge.length, - qp->s_cur_sge->sge.m, - qp->s_cur_sge->sge.n); - if (ipath_verbs_send(dev->dd, qp->s_hdrwords, - (u32 *) &qp->s_hdr, qp->s_cur_size, - qp->s_cur_sge)) { - ipath_no_bufs_available(qp, dev); - goto bail; - } - dev->n_unicast_xmit++; - /* Record that we sent the packet and s_hdr is empty. */ - qp->s_hdrwords = 0; - } - - /* - * The lock is needed to synchronize between setting - * qp->s_ack_state, resend timer, and post_send(). - */ - spin_lock_irqsave(&qp->s_lock, flags); - - /* Sending responses has higher priority over sending requests. 
*/ - if (qp->s_ack_state != OP(ACKNOWLEDGE) && - (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) - bth2 = qp->s_ack_psn++ & IPS_PSN_MASK; - else if (!ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2)) - goto done; - - spin_unlock_irqrestore(&qp->s_lock, flags); - /* Construct the header. */ - extra_bytes = (4 - qp->s_cur_size) & 3; - nwords = (qp->s_cur_size + extra_bytes) >> 2; - lrh0 = IPS_LRH_BTH; + ohdr = &hdr.u.oth; + lrh0 = IPATH_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, nwords); - lrh0 = IPS_LRH_GRH; + hwords += ipath_make_grh(dev, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, + hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = IPATH_LRH_GRH; } + /* read pkey_index w/o lock (its atomic) */ + bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->r_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + if (qp->r_ack_state >= OP(COMPARE_SWAP)) { + bth0 |= OP(ATOMIC_ACKNOWLEDGE) << 24; + ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->r_atomic_data); + hwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; + } else + bth0 |= OP(ACKNOWLEDGE) << 24; lrh0 |= qp->remote_ah_attr.sl << 4; - qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + - SIZE_OF_CRC); - qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); - bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); - bth0 |= extra_bytes << 20; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); ohdr->bth[0] = cpu_to_be32(bth0); ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[2] = cpu_to_be32(bth2); + 
ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK); - /* Check for more work to do. */ - goto again; + /* + * If we can send the ACK, clear the ACK state. + */ + if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) { + qp->r_ack_state = OP(ACKNOWLEDGE); + dev->n_unicast_xmit++; + } else { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + dev->n_rc_qacks++; + spin_lock_irq(&qp->s_lock); + /* Don't coalesce if a RDMA read or atomic is pending. */ + if (qp->s_ack_state == OP(ACKNOWLEDGE) || + qp->s_ack_state < OP(RDMA_READ_REQUEST)) { + qp->s_ack_state = qp->r_ack_state; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + qp->r_ack_state = OP(ACKNOWLEDGE); + } + spin_unlock_irq(&qp->s_lock); -done: - spin_unlock_irqrestore(&qp->s_lock, flags); -clear: - clear_bit(IPATH_S_BUSY, &qp->s_flags); -bail: - return; + /* Call ipath_do_rc_send() in another thread. */ + tasklet_hi_schedule(&qp->s_task); + } } -static void send_rc_ack(struct ipath_qp *qp) +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) { - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); - u16 lrh0; - u32 bth0; - struct ipath_other_headers *ohdr; + u32 n = qp->s_last; + struct ipath_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; - /* Construct the header. */ - ohdr = &qp->s_hdr.u.oth; - lrh0 = IPS_LRH_BTH; - /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. 
*/ - qp->s_hdrwords = 6; - if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, 0); - ohdr = &qp->s_hdr.u.l.oth; - lrh0 = IPS_LRH_GRH; + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; } - bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); - ohdr->u.aeth = ipath_compute_aeth(qp); - if (qp->s_ack_state >= OP(COMPARE_SWAP)) { - bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24; - ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic); - qp->s_hdrwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4; + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = ipath_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; } - else - bth0 |= OP(ACKNOWLEDGE) << 24; - lrh0 |= qp->remote_ah_attr.sl << 4; - qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); - qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + SIZE_OF_CRC); - qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK); /* - * If we can send the ACK, clear the ACK state. + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_do_rc_send(). 
*/ - if (ipath_verbs_send(dev->dd, qp->s_hdrwords, (u32 *) &qp->s_hdr, - 0, NULL) == 0) { - qp->s_ack_state = OP(ACKNOWLEDGE); - dev->n_rc_qacks++; - dev->n_unicast_xmit++; + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); } +done: + qp->s_psn = psn; } /** @@ -687,13 +681,12 @@ static void send_rc_ack(struct ipath_qp *qp) * @psn: packet sequence number for the request * @wc: the work completion request * - * The QP s_lock should be held. + * The QP s_lock should be held and interrupts disabled. */ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) { struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); struct ipath_ibdev *dev; - u32 n; /* * If there are no requests pending, we are done. @@ -735,62 +728,7 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc) else dev->n_rc_resends += (int)qp->s_psn - (int)psn; - /* - * If we are starting the request from the beginning, let the normal - * send code handle initialization. - */ - qp->s_cur = qp->s_last; - if (ipath_cmp24(psn, wqe->psn) <= 0) { - qp->s_state = OP(SEND_LAST); - qp->s_psn = wqe->psn; - } else { - n = qp->s_cur; - for (;;) { - if (++n == qp->s_size) - n = 0; - if (n == qp->s_tail) { - if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { - qp->s_cur = n; - wqe = get_swqe_ptr(qp, n); - } - break; - } - wqe = get_swqe_ptr(qp, n); - if (ipath_cmp24(psn, wqe->psn) < 0) - break; - qp->s_cur = n; - } - qp->s_psn = psn; - - /* - * Reset the state to restart in the middle of a request. - * Don't change the s_sge, s_cur_sge, or s_cur_size. - * See ipath_do_rc_send(). 
- */ - switch (wqe->wr.opcode) { - case IB_WR_SEND: - case IB_WR_SEND_WITH_IMM: - qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); - break; - - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - qp->s_state = OP(RDMA_READ_RESPONSE_LAST); - break; - - case IB_WR_RDMA_READ: - qp->s_state = - OP(RDMA_READ_RESPONSE_MIDDLE); - break; - - default: - /* - * This case shouldn't happen since its only - * one PSN per req. - */ - qp->s_state = OP(SEND_LAST); - } - } + reset_psn(qp, psn); done: tasklet_hi_schedule(&qp->s_task); @@ -800,76 +738,14 @@ bail: } /** - * reset_psn - reset the QP state to send starting from PSN - * @qp: the QP - * @psn: the packet sequence number to restart at - * - * This is called from ipath_rc_rcv() to process an incoming RC ACK - * for the given QP. - * Called at interrupt level with the QP s_lock held. - */ -static void reset_psn(struct ipath_qp *qp, u32 psn) -{ - struct ipath_swqe *wqe; - u32 n; - - n = qp->s_cur; - wqe = get_swqe_ptr(qp, n); - for (;;) { - if (++n == qp->s_size) - n = 0; - if (n == qp->s_tail) { - if (ipath_cmp24(psn, qp->s_next_psn) >= 0) { - qp->s_cur = n; - wqe = get_swqe_ptr(qp, n); - } - break; - } - wqe = get_swqe_ptr(qp, n); - if (ipath_cmp24(psn, wqe->psn) < 0) - break; - qp->s_cur = n; - } - qp->s_psn = psn; - - /* - * Set the state to restart in the middle of a - * request. Don't change the s_sge, s_cur_sge, or - * s_cur_size. See ipath_do_rc_send(). - */ - switch (wqe->wr.opcode) { - case IB_WR_SEND: - case IB_WR_SEND_WITH_IMM: - qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); - break; - - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - qp->s_state = OP(RDMA_READ_RESPONSE_LAST); - break; - - case IB_WR_RDMA_READ: - qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); - break; - - default: - /* - * This case shouldn't happen since its only - * one PSN per req. 
- */ - qp->s_state = OP(SEND_LAST); - } -} - -/** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on * @psn: the packet sequence number of the ACK * @opcode: the opcode of the request that resulted in the ACK * - * This is called from ipath_rc_rcv() to process an incoming RC ACK + * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK * for the given QP. - * Called at interrupt level with the QP s_lock held. + * Called at interrupt level with the QP s_lock held and interrupts disabled. * Returns 1 if OK, 0 if current operation should be aborted (NAK). */ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) @@ -1006,26 +882,16 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) if (qp->s_last == qp->s_tail) goto bail; - /* The last valid PSN seen is the previous request's. */ - qp->s_last_psn = wqe->psn - 1; + /* The last valid PSN is the previous PSN. */ + qp->s_last_psn = psn - 1; dev->n_rc_resends += (int)qp->s_psn - (int)psn; - /* - * If we are starting the request from the beginning, let - * the normal send code handle initialization. - */ - qp->s_cur = qp->s_last; - wqe = get_swqe_ptr(qp, qp->s_cur); - if (ipath_cmp24(psn, wqe->psn) <= 0) { - qp->s_state = OP(SEND_LAST); - qp->s_psn = wqe->psn; - } else - reset_psn(qp, psn); + reset_psn(qp, psn); qp->s_rnr_timeout = - ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) & - IPS_AETH_CREDIT_MASK]; + ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK]; ipath_insert_rnr_queue(qp); goto bail; @@ -1033,8 +899,8 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode) /* The last valid PSN seen is the previous request's. 
*/ if (qp->s_last != qp->s_tail) qp->s_last_psn = wqe->psn - 1; - switch ((aeth >> IPS_AETH_CREDIT_SHIFT) & - IPS_AETH_CREDIT_MASK) { + switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK) { case 0: /* PSN sequence error */ dev->n_seq_naks++; /* @@ -1182,32 +1048,33 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, goto ack_done; } rdma_read: - if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) - goto ack_done; - if (unlikely(tlen != (hdrsize + pmtu + 4))) - goto ack_done; - if (unlikely(pmtu >= qp->s_len)) - goto ack_done; - /* We got a response so update the timeout. */ - if (unlikely(qp->s_last == qp->s_tail || - get_swqe_ptr(qp, qp->s_last)->wr.opcode != - IB_WR_RDMA_READ)) - goto ack_done; - spin_lock(&dev->pending_lock); - if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) - list_move_tail(&qp->timerwait, - &dev->pending[dev->pending_index]); - spin_unlock(&dev->pending_lock); - /* - * Update the RDMA receive state but do the copy w/o holding the - * locks and blocking interrupts. XXX Yet another place that - * affects relaxed RDMA order since we don't want s_sge modified. - */ - qp->s_len -= pmtu; - qp->s_last_psn = psn; - spin_unlock_irqrestore(&qp->s_lock, flags); - ipath_copy_sge(&qp->s_sge, data, pmtu); - goto bail; + if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST))) + goto ack_done; + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_done; + if (unlikely(pmtu >= qp->s_len)) + goto ack_done; + /* We got a response so update the timeout. */ + if (unlikely(qp->s_last == qp->s_tail || + get_swqe_ptr(qp, qp->s_last)->wr.opcode != + IB_WR_RDMA_READ)) + goto ack_done; + spin_lock(&dev->pending_lock); + if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) + list_move_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. 
+ * XXX Yet another place that affects relaxed RDMA order + * since we don't want s_sge modified. + */ + qp->s_len -= pmtu; + qp->s_last_psn = psn; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_copy_sge(&qp->s_sge, data, pmtu); + goto bail; case OP(RDMA_READ_RESPONSE_LAST): /* ACKs READ req. */ @@ -1230,18 +1097,12 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, * ICRC (4). */ if (unlikely(tlen <= (hdrsize + pad + 8))) { - /* - * XXX Need to generate an error CQ - * entry. - */ + /* XXX Need to generate an error CQ entry. */ goto ack_done; } tlen -= hdrsize + pad + 8; if (unlikely(tlen != qp->s_len)) { - /* - * XXX Need to generate an error CQ - * entry. - */ + /* XXX Need to generate an error CQ entry. */ goto ack_done; } if (!header_in_data) @@ -1254,9 +1115,12 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) { /* * Change the state so we contimue - * processing new requests. + * processing new requests and wake up the + * tasklet if there are posted sends. */ qp->s_state = OP(SEND_LAST); + if (qp->s_tail != qp->s_head) + tasklet_hi_schedule(&qp->s_task); } goto ack_done; } @@ -1302,18 +1166,16 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, * Don't queue the NAK if a RDMA read, atomic, or * NAK is pending though. */ - spin_lock(&qp->s_lock); - if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) && - qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) || - qp->s_nak_state != 0) { - spin_unlock(&qp->s_lock); + if (qp->s_ack_state != OP(ACKNOWLEDGE) || + qp->r_nak_state != 0) goto done; + if (qp->r_ack_state < OP(COMPARE_SWAP)) { + qp->r_ack_state = OP(SEND_ONLY); + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; } - qp->s_ack_state = OP(SEND_ONLY); - qp->s_nak_state = IB_NAK_PSN_ERROR; - /* Use the expected PSN. 
*/ - qp->s_ack_psn = qp->r_psn; - goto resched; + goto send_ack; } /* @@ -1327,27 +1189,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, * send the earliest so that RDMA reads can be restarted at * the requester's expected PSN. */ - spin_lock(&qp->s_lock); - if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE && - ipath_cmp24(psn, qp->s_ack_psn) >= 0) { - if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) - qp->s_ack_psn = psn; - spin_unlock(&qp->s_lock); - goto done; - } - switch (opcode) { - case OP(RDMA_READ_REQUEST): - /* - * We have to be careful to not change s_rdma_sge - * while ipath_do_rc_send() is using it and not - * holding the s_lock. - */ - if (qp->s_ack_state != OP(ACKNOWLEDGE) && - qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) { - spin_unlock(&qp->s_lock); - dev->n_rdma_dup_busy++; - goto done; - } + if (opcode == OP(RDMA_READ_REQUEST)) { /* RETH comes after BTH */ if (!header_in_data) reth = &ohdr->u.rc.reth; @@ -1355,6 +1197,22 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, reth = (struct ib_reth *)data; data += sizeof(*reth); } + /* + * If we receive a duplicate RDMA request, it means the + * requester saw a sequence error and needs to restart + * from an earlier point. We can abort the current + * RDMA read send in that case. + */ + spin_lock_irq(&qp->s_lock); + if (qp->s_ack_state != OP(ACKNOWLEDGE) && + (qp->s_hdrwords || ipath_cmp24(psn, qp->s_ack_psn) >= 0)) { + /* + * We are already sending earlier requested data. + * Don't abort it to send later out of sequence data. 
+ */ + spin_unlock_irq(&qp->s_lock); + goto done; + } qp->s_rdma_len = be32_to_cpu(reth->length); if (qp->s_rdma_len != 0) { u32 rkey = be32_to_cpu(reth->rkey); @@ -1368,8 +1226,10 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, ok = ipath_rkey_ok(dev, &qp->s_rdma_sge, qp->s_rdma_len, vaddr, rkey, IB_ACCESS_REMOTE_READ); - if (unlikely(!ok)) + if (unlikely(!ok)) { + spin_unlock_irq(&qp->s_lock); goto done; + } } else { qp->s_rdma_sge.sg_list = NULL; qp->s_rdma_sge.num_sge = 0; @@ -1378,25 +1238,44 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, qp->s_rdma_sge.sge.length = 0; qp->s_rdma_sge.sge.sge_length = 0; } - break; + qp->s_ack_state = opcode; + qp->s_ack_psn = psn; + spin_unlock_irq(&qp->s_lock); + tasklet_hi_schedule(&qp->s_task); + goto send_ack; + } + + /* + * A pending RDMA read will ACK anything before it so + * ignore earlier duplicate requests. + */ + if (qp->s_ack_state != OP(ACKNOWLEDGE)) + goto done; + /* + * If an ACK is pending, don't replace the pending ACK + * with an earlier one since the later one will ACK the earlier. + * Also, if we already have a pending atomic, send it. + */ + if (qp->r_ack_state != OP(ACKNOWLEDGE) && + (ipath_cmp24(psn, qp->r_ack_psn) <= 0 || + qp->r_ack_state >= OP(COMPARE_SWAP))) + goto send_ack; + switch (opcode) { case OP(COMPARE_SWAP): case OP(FETCH_ADD): /* - * Check for the PSN of the last atomic operations + * Check for the PSN of the last atomic operation * performed and resend the result if found. 
*/ - if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) { - spin_unlock(&qp->s_lock); + if ((psn & IPATH_PSN_MASK) != qp->r_atomic_psn) goto done; - } - qp->s_ack_atomic = qp->r_atomic_data; break; } - qp->s_ack_state = opcode; - qp->s_nak_state = 0; - qp->s_ack_psn = psn; -resched: + qp->r_ack_state = opcode; + qp->r_nak_state = 0; + qp->r_ack_psn = psn; +send_ack: return 0; done: @@ -1424,7 +1303,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, u32 hdrsize; u32 psn; u32 pad; - unsigned long flags; struct ib_wc wc; u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); int diff; @@ -1453,11 +1331,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } else psn = be32_to_cpu(ohdr->bth[2]); } - /* - * The opcode is in the low byte when its in network order - * (top byte when in host order). - */ - opcode = be32_to_cpu(ohdr->bth[0]) >> 24; /* * Process responses (ACKs) before anything else. Note that the @@ -1465,22 +1338,21 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, * queue rather than the expected receive packet sequence number. * In other words, this QP is the requester. */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) { ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, hdrsize, pmtu, header_in_data); - goto bail; + goto done; } - spin_lock_irqsave(&qp->r_rq.lock, flags); - /* Compute 24 bits worth of difference. */ diff = ipath_cmp24(psn, qp->r_psn); if (unlikely(diff)) { if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, psn, diff, header_in_data)) goto done; - goto resched; + goto send_ack; } /* Check for opcode sequence errors. */ @@ -1492,22 +1364,19 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, opcode == OP(SEND_LAST_WITH_IMMEDIATE)) break; nack_inv: - /* - * A NAK will ACK earlier sends and RDMA writes. Don't queue the - * NAK if a RDMA read, atomic, or NAK is pending though. 
- */ - spin_lock(&qp->s_lock); - if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) && - qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { - spin_unlock(&qp->s_lock); - goto done; - } - /* XXX Flush WQEs */ - qp->state = IB_QPS_ERR; - qp->s_ack_state = OP(SEND_ONLY); - qp->s_nak_state = IB_NAK_INVALID_REQUEST; - qp->s_ack_psn = qp->r_psn; - goto resched; + /* + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if a RDMA read, atomic, or NAK + * is pending though. + */ + if (qp->r_ack_state >= OP(COMPARE_SWAP)) + goto send_ack; + /* XXX Flush WQEs */ + qp->state = IB_QPS_ERR; + qp->r_ack_state = OP(SEND_ONLY); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; case OP(RDMA_WRITE_FIRST): case OP(RDMA_WRITE_MIDDLE): @@ -1517,20 +1386,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, break; goto nack_inv; - case OP(RDMA_READ_REQUEST): - case OP(COMPARE_SWAP): - case OP(FETCH_ADD): - /* - * Drop all new requests until a response has been sent. A - * new request then ACKs the RDMA response we sent. Relaxed - * ordering would allow new requests to be processed but we - * would need to keep a queue of rwqe's for all that are in - * progress. Note that we can't RNR NAK this request since - * the RDMA READ or atomic response is already queued to be - * sent (unless we implement a response send queue). - */ - goto done; - default: if (opcode == OP(SEND_MIDDLE) || opcode == OP(SEND_LAST) || @@ -1539,6 +1394,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, opcode == OP(RDMA_WRITE_LAST) || opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ break; } @@ -1555,17 +1415,12 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, * Don't queue the NAK if a RDMA read or atomic * is pending though. 
*/ - spin_lock(&qp->s_lock); - if (qp->s_ack_state >= - OP(RDMA_READ_REQUEST) && - qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) { - spin_unlock(&qp->s_lock); - goto done; - } - qp->s_ack_state = OP(SEND_ONLY); - qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer; - qp->s_ack_psn = qp->r_psn; - goto resched; + if (qp->r_ack_state >= OP(COMPARE_SWAP)) + goto send_ack; + qp->r_ack_state = OP(SEND_ONLY); + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; } qp->r_rcv_len = 0; /* FALLTHROUGH */ @@ -1622,7 +1477,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; ipath_copy_sge(&qp->r_sge, data, tlen); - atomic_inc(&qp->msn); + qp->r_msn++; if (opcode == OP(RDMA_WRITE_LAST) || opcode == OP(RDMA_WRITE_ONLY)) break; @@ -1666,29 +1521,8 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, ok = ipath_rkey_ok(dev, &qp->r_sge, qp->r_len, vaddr, rkey, IB_ACCESS_REMOTE_WRITE); - if (unlikely(!ok)) { - nack_acc: - /* - * A NAK will ACK earlier sends and RDMA - * writes. Don't queue the NAK if a RDMA - * read, atomic, or NAK is pending though. 
- */ - spin_lock(&qp->s_lock); - if (qp->s_ack_state >= - OP(RDMA_READ_REQUEST) && - qp->s_ack_state != - IB_OPCODE_ACKNOWLEDGE) { - spin_unlock(&qp->s_lock); - goto done; - } - /* XXX Flush WQEs */ - qp->state = IB_QPS_ERR; - qp->s_ack_state = OP(RDMA_WRITE_ONLY); - qp->s_nak_state = - IB_NAK_REMOTE_ACCESS_ERROR; - qp->s_ack_psn = qp->r_psn; - goto resched; - } + if (unlikely(!ok)) + goto nack_acc; } else { qp->r_sge.sg_list = NULL; qp->r_sge.sge.mr = NULL; @@ -1715,12 +1549,10 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, reth = (struct ib_reth *)data; data += sizeof(*reth); } - spin_lock(&qp->s_lock); - if (qp->s_ack_state != OP(ACKNOWLEDGE) && - qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) { - spin_unlock(&qp->s_lock); - goto done; - } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_acc; + spin_lock_irq(&qp->s_lock); qp->s_rdma_len = be32_to_cpu(reth->length); if (qp->s_rdma_len != 0) { u32 rkey = be32_to_cpu(reth->rkey); @@ -1732,7 +1564,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->s_rdma_len, vaddr, rkey, IB_ACCESS_REMOTE_READ); if (unlikely(!ok)) { - spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->s_lock); goto nack_acc; } /* @@ -1749,21 +1581,25 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->s_rdma_sge.sge.length = 0; qp->s_rdma_sge.sge.sge_length = 0; } - if (unlikely(!(qp->qp_access_flags & - IB_ACCESS_REMOTE_READ))) - goto nack_acc; /* * We need to increment the MSN here instead of when we * finish sending the result since a duplicate request would * increment it more than once. */ - atomic_inc(&qp->msn); + qp->r_msn++; + qp->s_ack_state = opcode; - qp->s_nak_state = 0; qp->s_ack_psn = psn; + spin_unlock_irq(&qp->s_lock); + qp->r_psn++; qp->r_state = opcode; - goto rdmadone; + qp->r_nak_state = 0; + + /* Call ipath_do_rc_send() in another thread. 
*/ + tasklet_hi_schedule(&qp->s_task); + + goto done; case OP(COMPARE_SWAP): case OP(FETCH_ADD): { @@ -1792,7 +1628,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, goto nack_acc; /* Perform atomic OP and save result. */ sdata = be64_to_cpu(ateth->swap_data); - spin_lock(&dev->pending_lock); + spin_lock_irq(&dev->pending_lock); qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr; if (opcode == OP(FETCH_ADD)) *(u64 *) qp->r_sge.sge.vaddr = @@ -1800,9 +1636,9 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, else if (qp->r_atomic_data == be64_to_cpu(ateth->compare_data)) *(u64 *) qp->r_sge.sge.vaddr = sdata; - spin_unlock(&dev->pending_lock); - atomic_inc(&qp->msn); - qp->r_atomic_psn = psn & IPS_PSN_MASK; + spin_unlock_irq(&dev->pending_lock); + qp->r_msn++; + qp->r_atomic_psn = psn & IPATH_PSN_MASK; psn |= 1 << 31; break; } @@ -1813,44 +1649,39 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } qp->r_psn++; qp->r_state = opcode; + qp->r_nak_state = 0; /* Send an ACK if requested or required. */ if (psn & (1 << 31)) { /* * Coalesce ACKs unless there is a RDMA READ or * ATOMIC pending. */ - spin_lock(&qp->s_lock); - if (qp->s_ack_state == OP(ACKNOWLEDGE) || - qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) { - qp->s_ack_state = opcode; - qp->s_nak_state = 0; - qp->s_ack_psn = psn; - qp->s_ack_atomic = qp->r_atomic_data; - goto resched; + if (qp->r_ack_state < OP(COMPARE_SWAP)) { + qp->r_ack_state = opcode; + qp->r_ack_psn = psn; } - spin_unlock(&qp->s_lock); + goto send_ack; } -done: - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - goto bail; + goto done; -resched: +nack_acc: /* - * Try to send ACK right away but not if ipath_do_rc_send() is - * active. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if a RDMA read, atomic, or NAK + * is pending though. 
*/ - if (qp->s_hdrwords == 0 && - (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST || - qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP)) + if (qp->r_ack_state < OP(COMPARE_SWAP)) { + /* XXX Flush WQEs */ + qp->state = IB_QPS_ERR; + qp->r_ack_state = OP(RDMA_WRITE_ONLY); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; + } +send_ack: + /* Send ACK right away unless the send tasklet has a pending ACK. */ + if (qp->s_ack_state == OP(ACKNOWLEDGE)) send_rc_ack(qp); -rdmadone: - spin_unlock(&qp->s_lock); - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - - /* Call ipath_do_rc_send() in another thread. */ - tasklet_hi_schedule(&qp->s_task); - -bail: +done: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h index 402126eb79c..89df8f5ea99 100644 --- a/drivers/infiniband/hw/ipath/ipath_registers.h +++ b/drivers/infiniband/hw/ipath/ipath_registers.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index d38f4f3cfd1..772bc59fb85 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,6 +32,7 @@ */ #include "ipath_verbs.h" +#include "ipath_common.h" /* * Convert the AETH RNR timeout code into the number of milliseconds. @@ -111,20 +113,23 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp) * * Return 0 if no RWQE is available, otherwise return 1. * - * Called at interrupt level with the QP r_rq.lock held. + * Can be called from interrupt level. 
*/ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) { + unsigned long flags; struct ipath_rq *rq; struct ipath_srq *srq; struct ipath_rwqe *wqe; - int ret; + int ret = 1; if (!qp->ibqp.srq) { rq = &qp->r_rq; + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(rq->tail == rq->head)) { ret = 0; - goto bail; + goto done; } wqe = get_rwqe_ptr(rq, rq->tail); qp->r_wr_id = wqe->wr_id; @@ -136,17 +141,16 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) } if (++rq->tail >= rq->size) rq->tail = 0; - ret = 1; - goto bail; + goto done; } srq = to_isrq(qp->ibqp.srq); rq = &srq->rq; - spin_lock(&rq->lock); + spin_lock_irqsave(&rq->lock, flags); + if (unlikely(rq->tail == rq->head)) { - spin_unlock(&rq->lock); ret = 0; - goto bail; + goto done; } wqe = get_rwqe_ptr(rq, rq->tail); qp->r_wr_id = wqe->wr_id; @@ -168,18 +172,18 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) n = rq->head - rq->tail; if (n < srq->limit) { srq->limit = 0; - spin_unlock(&rq->lock); + spin_unlock_irqrestore(&rq->lock, flags); ev.device = qp->ibqp.device; ev.element.srq = qp->ibqp.srq; ev.event = IB_EVENT_SRQ_LIMIT_REACHED; srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); - } else - spin_unlock(&rq->lock); - } else - spin_unlock(&rq->lock); - ret = 1; + goto bail; + } + } +done: + spin_unlock_irqrestore(&rq->lock, flags); bail: return ret; } @@ -187,7 +191,6 @@ bail: /** * ipath_ruc_loopback - handle UC and RC lookback requests * @sqp: the loopback QP - * @wc: the work completion entry * * This is called from ipath_do_uc_send() or ipath_do_rc_send() to * forward a WQE addressed to the same HCA. @@ -196,13 +199,14 @@ bail: * receive interrupts since this is a connected protocol and all packets * will pass through here. 
*/ -void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc) +static void ipath_ruc_loopback(struct ipath_qp *sqp) { struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); struct ipath_qp *qp; struct ipath_swqe *wqe; struct ipath_sge *sge; unsigned long flags; + struct ib_wc wc; u64 sdata; qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); @@ -233,8 +237,8 @@ again: wqe = get_swqe_ptr(sqp, sqp->s_last); spin_unlock_irqrestore(&sqp->s_lock, flags); - wc->wc_flags = 0; - wc->imm_data = 0; + wc.wc_flags = 0; + wc.imm_data = 0; sqp->s_sge.sge = wqe->sg_list[0]; sqp->s_sge.sg_list = wqe->sg_list + 1; @@ -242,39 +246,34 @@ again: sqp->s_len = wqe->length; switch (wqe->wr.opcode) { case IB_WR_SEND_WITH_IMM: - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = wqe->wr.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + wc.imm_data = wqe->wr.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - spin_lock_irqsave(&qp->r_rq.lock, flags); if (!ipath_get_rwqe(qp, 0)) { rnr_nak: - spin_unlock_irqrestore(&qp->r_rq.lock, flags); /* Handle RNR NAK */ if (qp->ibqp.qp_type == IB_QPT_UC) goto send_comp; if (sqp->s_rnr_retry == 0) { - wc->status = IB_WC_RNR_RETRY_EXC_ERR; + wc.status = IB_WC_RNR_RETRY_EXC_ERR; goto err; } if (sqp->s_rnr_retry_cnt < 7) sqp->s_rnr_retry--; dev->n_rnr_naks++; sqp->s_rnr_timeout = - ib_ipath_rnr_table[sqp->s_min_rnr_timer]; + ib_ipath_rnr_table[sqp->r_min_rnr_timer]; ipath_insert_rnr_queue(sqp); goto done; } - spin_unlock_irqrestore(&qp->r_rq.lock, flags); break; case IB_WR_RDMA_WRITE_WITH_IMM: - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = wqe->wr.imm_data; - spin_lock_irqsave(&qp->r_rq.lock, flags); + wc.wc_flags = IB_WC_WITH_IMM; + wc.imm_data = wqe->wr.imm_data; if (!ipath_get_rwqe(qp, 1)) goto rnr_nak; - spin_unlock_irqrestore(&qp->r_rq.lock, flags); /* FALLTHROUGH */ case IB_WR_RDMA_WRITE: if (wqe->length == 0) @@ -284,20 +283,20 @@ again: wqe->wr.wr.rdma.rkey, IB_ACCESS_REMOTE_WRITE))) { acc_err: - wc->status = IB_WC_REM_ACCESS_ERR; + wc.status = 
IB_WC_REM_ACCESS_ERR; err: - wc->wr_id = wqe->wr.wr_id; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = 0; - wc->qp_num = sqp->ibqp.qp_num; - wc->src_qp = sqp->remote_qpn; - wc->pkey_index = 0; - wc->slid = sqp->remote_ah_attr.dlid; - wc->sl = sqp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_sqerror_qp(sqp, wc); + wc.wr_id = wqe->wr.wr_id; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.qp_num = sqp->ibqp.qp_num; + wc.src_qp = sqp->remote_qpn; + wc.pkey_index = 0; + wc.slid = sqp->remote_ah_attr.dlid; + wc.sl = sqp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_sqerror_qp(sqp, &wc); goto done; } break; @@ -373,22 +372,22 @@ again: goto send_comp; if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; else - wc->opcode = IB_WC_RECV; - wc->wr_id = qp->r_wr_id; - wc->status = IB_WC_SUCCESS; - wc->vendor_err = 0; - wc->byte_len = wqe->length; - wc->qp_num = qp->ibqp.qp_num; - wc->src_qp = qp->remote_qpn; + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.vendor_err = 0; + wc.byte_len = wqe->length; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = qp->remote_qpn; /* XXX do we know which pkey matched? Only needed for GSI. */ - wc->pkey_index = 0; - wc->slid = qp->remote_ah_attr.dlid; - wc->sl = qp->remote_ah_attr.sl; - wc->dlid_path_bits = 0; + wc.pkey_index = 0; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.dlid_path_bits = 0; /* Signal completion event if the solicited bit is set. 
*/ - ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: @@ -396,19 +395,19 @@ send_comp: if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) || (wqe->wr.send_flags & IB_SEND_SIGNALED)) { - wc->wr_id = wqe->wr.wr_id; - wc->status = IB_WC_SUCCESS; - wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; - wc->vendor_err = 0; - wc->byte_len = wqe->length; - wc->qp_num = sqp->ibqp.qp_num; - wc->src_qp = 0; - wc->pkey_index = 0; - wc->slid = 0; - wc->sl = 0; - wc->dlid_path_bits = 0; - wc->port_num = 0; - ipath_cq_enter(to_icq(sqp->ibqp.send_cq), wc, 0); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.vendor_err = 0; + wc.byte_len = wqe->length; + wc.qp_num = sqp->ibqp.qp_num; + wc.src_qp = 0; + wc.pkey_index = 0; + wc.slid = 0; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + ipath_cq_enter(to_icq(sqp->ibqp.send_cq), &wc, 0); } /* Update s_last now that we are finished with the SWQE */ @@ -454,11 +453,11 @@ void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev) } /** - * ipath_post_rc_send - post RC and UC sends + * ipath_post_ruc_send - post RC and UC sends * @qp: the QP to post on * @wr: the work request to send */ -int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr) +int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr) { struct ipath_swqe *wqe; unsigned long flags; @@ -533,13 +532,149 @@ int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr) qp->s_head = next; spin_unlock_irqrestore(&qp->s_lock, flags); - if (qp->ibqp.qp_type == IB_QPT_UC) - ipath_do_uc_send((unsigned long) qp); - else - ipath_do_rc_send((unsigned long) qp); + ipath_do_ruc_send((unsigned long) qp); ret = 0; bail: return ret; } + +/** + * ipath_make_grh - construct a GRH header + * @dev: a pointer to the ipath device + * @hdr: a pointer to the GRH header being constructed + * 
@grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = 0x1B; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = dev->gid_prefix; + hdr->sgid.global.interface_id = ipath_layer_get_guid(dev->dd); + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +/** + * ipath_do_ruc_send - perform a send on an RC or UC QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, after we drop the QP s_lock, two threads could send + * packets out of order. + */ +void ipath_do_ruc_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + u16 lrh0; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ipath_other_headers *ohdr; + + if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) + goto bail; + + if (unlikely(qp->remote_ah_attr.dlid == + ipath_layer_get_lid(dev->dd))) { + ipath_ruc_loopback(qp); + goto clear; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. 
An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + if (ipath_verbs_send(dev->dd, qp->s_hdrwords, + (u32 *) &qp->s_hdr, qp->s_cur_size, + qp->s_cur_sge)) { + ipath_no_bufs_available(qp, dev); + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + /* + * The lock is needed to synchronize between setting + * qp->s_ack_state, resend timer, and post_send(). + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if (qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE && + (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0) + bth2 = qp->s_ack_psn++ & IPATH_PSN_MASK; + else if (!((qp->ibqp.qp_type == IB_QPT_RC) ? + ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) : + ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) { + /* + * Clear the busy bit before unlocking to avoid races with + * adding new work queue items and then failing to process + * them. + */ + clear_bit(IPATH_S_BUSY, &qp->s_flags); + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + } + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Construct the header. 
*/ + extra_bytes = (4 - qp->s_cur_size) & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); + bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); + + /* Check for more work to do. */ + goto again; + +clear: + clear_bit(IPATH_S_BUSY, &qp->s_flags); +bail: + return; +} diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c index 01c4c6c5611..f760434660b 100644 --- a/drivers/infiniband/hw/ipath/ipath_srq.c +++ b/drivers/infiniband/hw/ipath/ipath_srq.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -125,11 +126,23 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata) { + struct ipath_ibdev *dev = to_idev(ibpd->device); struct ipath_srq *srq; u32 sz; struct ib_srq *ret; - if (srq_init_attr->attr.max_sge < 1) { + if (dev->n_srqs_allocated == ib_ipath_max_srqs) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + if (srq_init_attr->attr.max_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) || + (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) { ret = ERR_PTR(-EINVAL); goto bail; } @@ -164,6 +177,8 @@ struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, ret = &srq->ibsrq; + dev->n_srqs_allocated++; + bail: return ret; } @@ -181,24 +196,26 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, unsigned long flags; int ret; - if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irqsave(&srq->rq.lock, flags); - srq->limit = attr->srq_limit; - spin_unlock_irqrestore(&srq->rq.lock, flags); - } - if (attr_mask & IB_SRQ_MAX_WR) { - u32 size = attr->max_wr + 1; - struct ipath_rwqe *wq, *p; - u32 n; - u32 sz; + if (attr_mask & IB_SRQ_MAX_WR) + if ((attr->max_wr > ib_ipath_max_srq_wrs) || + (attr->max_sge > srq->rq.max_sge)) { + ret = -EINVAL; + goto bail; + } - if (attr->max_sge < srq->rq.max_sge) { + if (attr_mask & IB_SRQ_LIMIT) + if (attr->srq_limit >= srq->rq.size) { ret = -EINVAL; goto bail; } + if (attr_mask & IB_SRQ_MAX_WR) { + struct ipath_rwqe *wq, *p; + u32 sz, size, n; + sz = sizeof(struct ipath_rwqe) + attr->max_sge * sizeof(struct ipath_sge); + size = attr->max_wr + 1; wq = vmalloc(size * sz); if (!wq) { ret = -ENOMEM; @@ -242,6 +259,11 @@ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, spin_unlock_irqrestore(&srq->rq.lock, flags); } + if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irqsave(&srq->rq.lock, flags); + srq->limit = 
attr->srq_limit; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } ret = 0; bail: @@ -265,7 +287,9 @@ int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) int ipath_destroy_srq(struct ib_srq *ibsrq) { struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_ibdev *dev = to_idev(ibsrq->device); + dev->n_srqs_allocated--; vfree(srq->rq.wq); kfree(srq); diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c index fe209137ee7..70351b7e35c 100644 --- a/drivers/infiniband/hw/ipath/ipath_stats.c +++ b/drivers/infiniband/hw/ipath/ipath_stats.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -185,7 +186,6 @@ static void ipath_qcheck(struct ipath_devdata *dd) dd->ipath_port0head, (unsigned long long) ipath_stats.sps_port0pkts); - ipath_kreceive(dd); } dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; } diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c index f323791cc49..b98821d7801 100644 --- a/drivers/infiniband/hw/ipath/ipath_sysfs.c +++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -34,8 +35,8 @@ #include <linux/pci.h> #include "ipath_kernel.h" -#include "ips_common.h" #include "ipath_layer.h" +#include "ipath_common.h" /** * ipath_parse_ushort - parse an unsigned short value in an arbitrary base @@ -84,99 +85,6 @@ static ssize_t show_num_units(struct device_driver *dev, char *buf) ipath_count_units(NULL, NULL, NULL)); } -#define DRIVER_STAT(name, attr) \ - static ssize_t show_stat_##name(struct device_driver *dev, \ - char *buf) \ - { \ - return scnprintf( \ - buf, PAGE_SIZE, "%llu\n", \ - (unsigned long long) ipath_stats.sps_ ##attr); \ - } \ - static DRIVER_ATTR(name, S_IRUGO, show_stat_##name, NULL) - -DRIVER_STAT(intrs, ints); -DRIVER_STAT(err_intrs, errints); -DRIVER_STAT(errs, errs); -DRIVER_STAT(pkt_errs, pkterrs); -DRIVER_STAT(crc_errs, crcerrs); -DRIVER_STAT(hw_errs, hwerrs); -DRIVER_STAT(ib_link, iblink); -DRIVER_STAT(port0_pkts, port0pkts); -DRIVER_STAT(ether_spkts, ether_spkts); -DRIVER_STAT(ether_rpkts, ether_rpkts); -DRIVER_STAT(sma_spkts, sma_spkts); -DRIVER_STAT(sma_rpkts, sma_rpkts); -DRIVER_STAT(hdrq_full, hdrqfull); -DRIVER_STAT(etid_full, etidfull); -DRIVER_STAT(no_piobufs, nopiobufs); -DRIVER_STAT(ports, ports); -DRIVER_STAT(pkey0, pkeys[0]); -DRIVER_STAT(pkey1, pkeys[1]); -DRIVER_STAT(pkey2, pkeys[2]); -DRIVER_STAT(pkey3, pkeys[3]); -/* XXX fix the following when dynamic table of devices used */ -DRIVER_STAT(lid0, lid[0]); -DRIVER_STAT(lid1, lid[1]); -DRIVER_STAT(lid2, lid[2]); -DRIVER_STAT(lid3, lid[3]); - -DRIVER_STAT(nports, nports); -DRIVER_STAT(null_intr, nullintr); -DRIVER_STAT(max_pkts_call, maxpkts_call); -DRIVER_STAT(avg_pkts_call, avgpkts_call); -DRIVER_STAT(page_locks, pagelocks); -DRIVER_STAT(page_unlocks, pageunlocks); -DRIVER_STAT(krdrops, krdrops); -/* XXX fix the following when dynamic table of devices used */ -DRIVER_STAT(mlid0, mlid[0]); -DRIVER_STAT(mlid1, mlid[1]); -DRIVER_STAT(mlid2, mlid[2]); -DRIVER_STAT(mlid3, mlid[3]); - 
-static struct attribute *driver_stat_attributes[] = { - &driver_attr_intrs.attr, - &driver_attr_err_intrs.attr, - &driver_attr_errs.attr, - &driver_attr_pkt_errs.attr, - &driver_attr_crc_errs.attr, - &driver_attr_hw_errs.attr, - &driver_attr_ib_link.attr, - &driver_attr_port0_pkts.attr, - &driver_attr_ether_spkts.attr, - &driver_attr_ether_rpkts.attr, - &driver_attr_sma_spkts.attr, - &driver_attr_sma_rpkts.attr, - &driver_attr_hdrq_full.attr, - &driver_attr_etid_full.attr, - &driver_attr_no_piobufs.attr, - &driver_attr_ports.attr, - &driver_attr_pkey0.attr, - &driver_attr_pkey1.attr, - &driver_attr_pkey2.attr, - &driver_attr_pkey3.attr, - &driver_attr_lid0.attr, - &driver_attr_lid1.attr, - &driver_attr_lid2.attr, - &driver_attr_lid3.attr, - &driver_attr_nports.attr, - &driver_attr_null_intr.attr, - &driver_attr_max_pkts_call.attr, - &driver_attr_avg_pkts_call.attr, - &driver_attr_page_locks.attr, - &driver_attr_page_unlocks.attr, - &driver_attr_krdrops.attr, - &driver_attr_mlid0.attr, - &driver_attr_mlid1.attr, - &driver_attr_mlid2.attr, - &driver_attr_mlid3.attr, - NULL -}; - -static struct attribute_group driver_stat_attr_group = { - .name = "stats", - .attrs = driver_stat_attributes -}; - static ssize_t show_status(struct device *dev, struct device_attribute *attr, char *buf) @@ -272,23 +180,23 @@ static ssize_t store_lid(struct device *dev, size_t count) { struct ipath_devdata *dd = dev_get_drvdata(dev); - u16 lid; + u16 lid = 0; int ret; ret = ipath_parse_ushort(buf, &lid); if (ret < 0) goto invalid; - if (lid == 0 || lid >= 0xc000) { + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) { ret = -EINVAL; goto invalid; } - ipath_set_sps_lid(dd, lid, 0); + ipath_set_lid(dd, lid, 0); goto bail; invalid: - ipath_dev_err(dd, "attempt to set invalid LID\n"); + ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid); bail: return ret; } @@ -313,13 +221,12 @@ static ssize_t store_mlid(struct device *dev, int ret; ret = ipath_parse_ushort(buf, &mlid); - if (ret < 0) 
+ if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE) goto invalid; unit = dd->ipath_unit; dd->ipath_mlid = mlid; - ipath_stats.sps_mlid[unit] = mlid; ipath_layer_intr(dd, IPATH_LAYER_INT_BCAST); goto bail; @@ -734,20 +641,12 @@ int ipath_driver_create_group(struct device_driver *drv) int ret; ret = sysfs_create_group(&drv->kobj, &driver_attr_group); - if (ret) - goto bail; - ret = sysfs_create_group(&drv->kobj, &driver_stat_attr_group); - if (ret) - sysfs_remove_group(&drv->kobj, &driver_attr_group); - -bail: return ret; } void ipath_driver_remove_group(struct device_driver *drv) { - sysfs_remove_group(&drv->kobj, &driver_stat_attr_group); sysfs_remove_group(&drv->kobj, &driver_attr_group); } diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 0d6dbc0a541..c33abea2d5a 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -31,7 +32,7 @@ */ #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" /* cut down ridiculously long IB macro names */ #define OP(x) IB_OPCODE_UC_##x @@ -61,90 +62,40 @@ static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe, } /** - * ipath_do_uc_send - do a send on a UC queue - * @data: contains a pointer to the QP to send on + * ipath_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * @bth0p: pointer to the BTH opcode word + * @bth2p: pointer to the BTH PSN word * - * Process entries in the send work queue until the queue is exhausted. - * Only allow one CPU to send a packet per QP (tasklet). - * Otherwise, after we drop the QP lock, two threads could send - * packets out of order. 
- * This is similar to ipath_do_rc_send() below except we don't have - * timeouts or resends. + * Return 1 if constructed; otherwise, return 0. + * Note the QP s_lock must be held and interrupts disabled. */ -void ipath_do_uc_send(unsigned long data) +int ipath_make_uc_req(struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p) { - struct ipath_qp *qp = (struct ipath_qp *)data; - struct ipath_ibdev *dev = to_idev(qp->ibqp.device); struct ipath_swqe *wqe; - unsigned long flags; - u16 lrh0; u32 hwords; - u32 nwords; - u32 extra_bytes; u32 bth0; - u32 bth2; - u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); u32 len; - struct ipath_other_headers *ohdr; struct ib_wc wc; - if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags)) - goto bail; - - if (unlikely(qp->remote_ah_attr.dlid == - ipath_layer_get_lid(dev->dd))) { - /* Pass in an uninitialized ib_wc to save stack space. */ - ipath_ruc_loopback(qp, &wc); - clear_bit(IPATH_S_BUSY, &qp->s_flags); - goto bail; - } - - ohdr = &qp->s_hdr.u.oth; - if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) - ohdr = &qp->s_hdr.u.l.oth; - -again: - /* Check for a constructed packet to be sent. */ - if (qp->s_hdrwords != 0) { - /* - * If no PIO bufs are available, return. - * An interrupt will call ipath_ib_piobufavail() - * when one is available. - */ - if (ipath_verbs_send(dev->dd, qp->s_hdrwords, - (u32 *) &qp->s_hdr, - qp->s_cur_size, - qp->s_cur_sge)) { - ipath_no_bufs_available(qp, dev); - goto bail; - } - dev->n_unicast_xmit++; - /* Record that we sent the packet and s_hdr is empty. */ - qp->s_hdrwords = 0; - } - - lrh0 = IPS_LRH_BTH; - /* header size in 32-bit words LRH+BTH = (8+12)/4. */ - hwords = 5; - - /* - * The lock is needed to synchronize between - * setting qp->s_ack_state and post_send(). 
- */ - spin_lock_irqsave(&qp->s_lock, flags); - if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) goto done; - bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 0; - /* Send a request. */ + /* Get the next send request. */ wqe = get_swqe_ptr(qp, qp->s_last); switch (qp->s_state) { default: /* - * Signal the completion of the last send (if there is - * one). + * Signal the completion of the last send + * (if there is one). */ if (qp->s_last != qp->s_tail) complete_last_send(qp, wqe, &wc); @@ -257,61 +208,16 @@ again: } break; } - bth2 = qp->s_next_psn++ & IPS_PSN_MASK; qp->s_len -= len; - bth0 |= qp->s_state << 24; - - spin_unlock_irqrestore(&qp->s_lock, flags); - - /* Construct the header. */ - extra_bytes = (4 - len) & 3; - nwords = (len + extra_bytes) >> 2; - if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { - /* Header size in 32-bit words. */ - hwords += 10; - lrh0 = IPS_LRH_GRH; - qp->s_hdr.u.l.grh.version_tclass_flow = - cpu_to_be32((6 << 28) | - (qp->remote_ah_attr.grh.traffic_class - << 20) | - qp->remote_ah_attr.grh.flow_label); - qp->s_hdr.u.l.grh.paylen = - cpu_to_be16(((hwords - 12) + nwords + - SIZE_OF_CRC) << 2); - /* next_hdr is defined by C8-7 in ch. 8.4.1 */ - qp->s_hdr.u.l.grh.next_hdr = 0x1B; - qp->s_hdr.u.l.grh.hop_limit = - qp->remote_ah_attr.grh.hop_limit; - /* The SGID is 32-bit aligned. 
*/ - qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = - dev->gid_prefix; - qp->s_hdr.u.l.grh.sgid.global.interface_id = - ipath_layer_get_guid(dev->dd); - qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid; - } qp->s_hdrwords = hwords; qp->s_cur_sge = &qp->s_sge; qp->s_cur_size = len; - lrh0 |= qp->remote_ah_attr.sl << 4; - qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); - /* DEST LID */ - qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); - qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC); - qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd)); - bth0 |= extra_bytes << 20; - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); - ohdr->bth[2] = cpu_to_be32(bth2); - - /* Check for more work to do. */ - goto again; + *bth0p = bth0 | (qp->s_state << 24); + *bth2p = qp->s_next_psn++ & IPATH_PSN_MASK; + return 1; done: - spin_unlock_irqrestore(&qp->s_lock, flags); - clear_bit(IPATH_S_BUSY, &qp->s_flags); - -bail: - return; + return 0; } /** @@ -335,7 +241,6 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, u32 hdrsize; u32 psn; u32 pad; - unsigned long flags; struct ib_wc wc; u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); struct ib_reth *reth; @@ -373,8 +278,6 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, wc.imm_data = 0; wc.wc_flags = 0; - spin_lock_irqsave(&qp->r_rq.lock, flags); - /* Compare the PSN verses the expected PSN. 
*/ if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { /* @@ -535,12 +438,13 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, if (qp->r_len != 0) { u32 rkey = be32_to_cpu(reth->rkey); u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; /* Check rkey */ - if (unlikely(!ipath_rkey_ok( - dev, &qp->r_sge, qp->r_len, - vaddr, rkey, - IB_ACCESS_REMOTE_WRITE))) { + ok = ipath_rkey_ok(dev, &qp->r_sge, qp->r_len, + vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) { dev->n_pkt_drops++; goto done; } @@ -558,8 +462,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, } if (opcode == OP(RDMA_WRITE_ONLY)) goto rdma_last; - else if (opcode == - OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) goto rdma_last_imm; /* FALLTHROUGH */ case OP(RDMA_WRITE_MIDDLE): @@ -592,9 +495,9 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, dev->n_pkt_drops++; goto done; } - if (qp->r_reuse_sge) { + if (qp->r_reuse_sge) qp->r_reuse_sge = 0; - } else if (!ipath_get_rwqe(qp, 1)) { + else if (!ipath_get_rwqe(qp, 1)) { dev->n_pkt_drops++; goto done; } @@ -631,15 +534,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, default: /* Drop packet for unknown opcodes. */ - spin_unlock_irqrestore(&qp->r_rq.lock, flags); dev->n_pkt_drops++; - goto bail; + goto done; } qp->r_psn++; qp->r_state = opcode; done: - spin_unlock_irqrestore(&qp->r_rq.lock, flags); - -bail: return; } diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index e606daf8321..3466129af80 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -33,7 +34,7 @@ #include <rdma/ib_smi.h> #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" /** * ipath_ud_loopback - handle send on loopback QPs @@ -274,6 +275,11 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) len += wr->sg_list[i].length; ss.num_sge++; } + /* Check for invalid packet size. */ + if (len > ipath_layer_get_ibmtu(dev->dd)) { + ret = -EINVAL; + goto bail; + } extra_bytes = (4 - len) & 3; nwords = (len + extra_bytes) >> 2; @@ -283,8 +289,8 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) ret = -EINVAL; goto bail; } - if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE) { - if (ah_attr->dlid != IPS_PERMISSIVE_LID) + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { + if (ah_attr->dlid != IPATH_PERMISSIVE_LID) dev->n_multicast_xmit++; else dev->n_unicast_xmit++; @@ -304,7 +310,7 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) if (ah_attr->ah_flags & IB_AH_GRH) { /* Header size in 32-bit words. */ hwords = 17; - lrh0 = IPS_LRH_GRH; + lrh0 = IPATH_LRH_GRH; ohdr = &qp->s_hdr.u.l.oth; qp->s_hdr.u.l.grh.version_tclass_flow = cpu_to_be32((6 << 28) | @@ -330,7 +336,7 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) } else { /* Header size in 32-bit words. */ hwords = 7; - lrh0 = IPS_LRH_BTH; + lrh0 = IPATH_LRH_BTH; ohdr = &qp->s_hdr.u.oth; } if (wr->opcode == IB_WR_SEND_WITH_IMM) { @@ -361,18 +367,18 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr) if (wr->send_flags & IB_SEND_SOLICITED) bth0 |= 1 << 23; bth0 |= extra_bytes << 20; - bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPS_DEFAULT_P_KEY : + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : ipath_layer_get_pkey(dev->dd, qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); /* * Use the multicast QP if the destination LID is a multicast LID. 
*/ - ohdr->bth[1] = ah_attr->dlid >= IPS_MULTICAST_LID_BASE && - ah_attr->dlid != IPS_PERMISSIVE_LID ? - __constant_cpu_to_be32(IPS_MULTICAST_QPN) : + ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID ? + __constant_cpu_to_be32(IPATH_MULTICAST_QPN) : cpu_to_be32(wr->wr.ud.remote_qpn); /* XXX Could lose a PSN count but not worth locking */ - ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPS_PSN_MASK); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). @@ -463,7 +469,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); } } - src_qp &= IPS_QPN_MASK; + src_qp &= IPATH_QPN_MASK; /* * Check that the permissive LID is only used on QP0 @@ -554,7 +560,16 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, spin_lock_irqsave(&rq->lock, flags); if (rq->tail == rq->head) { spin_unlock_irqrestore(&rq->lock, flags); - dev->n_pkt_drops++; + /* + * Count VL15 packets dropped due to no receive buffer. + * Otherwise, count them as buffer overruns since usually, + * the HW will be able to receive packets even if there are + * no QPs with posted receive buffers. + */ + if (qp->ibqp.qp_num == 0) + dev->n_vl15_dropped++; + else + dev->rcv_errors++; goto bail; } /* Silently drop packets which are too big. */ @@ -612,7 +627,7 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= IPS_MULTICAST_LID_BASE ? 0 : + wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); /* Signal completion event if the solicited bit is set. 
*/ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index 2bb08afc86d..e32fca9faf8 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -57,17 +58,6 @@ static int __get_user_pages(unsigned long start_page, size_t num_pages, size_t got; int ret; -#if 0 - /* - * XXX - causes MPI programs to fail, haven't had time to check - * yet - */ - if (!capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto bail; - } -#endif - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 28fdbdaa789..56ac336dd1e 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -36,7 +37,7 @@ #include "ipath_kernel.h" #include "ipath_verbs.h" -#include "ips_common.h" +#include "ipath_common.h" /* Not static, because we don't want the compiler removing it */ const char ipath_verbs_version[] = "ipath_verbs " IPATH_IDSTR; @@ -55,9 +56,62 @@ unsigned int ib_ipath_debug; /* debug mask */ module_param_named(debug, ib_ipath_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "Verbs debug mask"); +static unsigned int ib_ipath_max_pds = 0xFFFF; +module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_ipath_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_ipath_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_ipath_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_ipath_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_ipath_max_sges = 0x60; +module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_ipath_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_ipath_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, 
ib_ipath_max_mcast_qp_attached, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_ipath_max_srqs = 1024; +module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_ipath_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_ipath_max_srq_sges, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_ipath_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + MODULE_LICENSE("GPL"); -MODULE_AUTHOR("PathScale <support@pathscale.com>"); -MODULE_DESCRIPTION("Pathscale InfiniPath driver"); +MODULE_AUTHOR("QLogic <support@pathscale.com>"); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { [IB_QPS_RESET] = 0, @@ -193,7 +247,7 @@ static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, switch (qp->ibqp.qp_type) { case IB_QPT_UC: case IB_QPT_RC: - err = ipath_post_rc_send(qp, wr); + err = ipath_post_ruc_send(qp, wr); break; case IB_QPT_SMI: @@ -375,7 +429,7 @@ static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) /* Check for a valid destination LID (see ch. 7.11.1). 
*/ lid = be16_to_cpu(hdr->lrh[1]); - if (lid < IPS_MULTICAST_LID_BASE) { + if (lid < IPATH_MULTICAST_LID_BASE) { lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1); if (unlikely(lid != ipath_layer_get_lid(dev->dd))) { dev->rcv_errors++; @@ -385,9 +439,9 @@ static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) /* Check for GRH */ lnh = be16_to_cpu(hdr->lrh[0]) & 3; - if (lnh == IPS_LRH_BTH) + if (lnh == IPATH_LRH_BTH) ohdr = &hdr->u.oth; - else if (lnh == IPS_LRH_GRH) + else if (lnh == IPATH_LRH_GRH) ohdr = &hdr->u.l.oth; else { dev->rcv_errors++; @@ -399,8 +453,8 @@ static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) dev->opstats[opcode].n_packets++; /* Get the destination QP number. */ - qp_num = be32_to_cpu(ohdr->bth[1]) & IPS_QPN_MASK; - if (qp_num == IPS_MULTICAST_QPN) { + qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK; + if (qp_num == IPATH_MULTICAST_QPN) { struct ipath_mcast *mcast; struct ipath_mcast_qp *p; @@ -411,7 +465,7 @@ static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) } dev->n_multicast_rcv++; list_for_each_entry_rcu(p, &mcast->qp_list, list) - ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, + ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, tlen, p->qp); /* * Notify ipath_multicast_detach() if it is waiting for us @@ -423,7 +477,7 @@ static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen) qp = ipath_lookup_qpn(&dev->qp_table, qp_num); if (qp) { dev->n_unicast_rcv++; - ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data, + ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, tlen, qp); /* * Notify ipath_destroy_qp() if it is waiting @@ -567,40 +621,38 @@ static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct ipath_ibdev *dev = to_idev(ibdev); - u32 vendor, boardrev, majrev, minrev; memset(props, 0, sizeof(*props)); props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID; - 
ipath_layer_query_device(dev->dd, &vendor, &boardrev, - &majrev, &minrev); - props->vendor_id = vendor; - props->vendor_part_id = boardrev; - props->hw_ver = boardrev << 16 | majrev << 8 | minrev; + props->vendor_id = ipath_layer_get_vendorid(dev->dd); + props->vendor_part_id = ipath_layer_get_deviceid(dev->dd); + props->hw_ver = ipath_layer_get_pcirev(dev->dd); props->sys_image_guid = dev->sys_image_guid; props->max_mr_size = ~0ull; - props->max_qp = 0xffff; - props->max_qp_wr = 0xffff; - props->max_sge = 255; - props->max_cq = 0xffff; - props->max_cqe = 0xffff; - props->max_mr = 0xffff; - props->max_pd = 0xffff; + props->max_qp = dev->qp_table.max; + props->max_qp_wr = ib_ipath_max_qp_wrs; + props->max_sge = ib_ipath_max_sges; + props->max_cq = ib_ipath_max_cqs; + props->max_ah = ib_ipath_max_ahs; + props->max_cqe = ib_ipath_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_pd = ib_ipath_max_pds; props->max_qp_rd_atom = 1; props->max_qp_init_rd_atom = 1; /* props->max_res_rd_atom */ - props->max_srq = 0xffff; - props->max_srq_wr = 0xffff; - props->max_srq_sge = 255; + props->max_srq = ib_ipath_max_srqs; + props->max_srq_wr = ib_ipath_max_srq_wrs; + props->max_srq_sge = ib_ipath_max_srq_sges; /* props->local_ca_ack_delay */ props->atomic_cap = IB_ATOMIC_HCA; props->max_pkeys = ipath_layer_get_npkeys(dev->dd); - props->max_mcast_grp = 0xffff; - props->max_mcast_qp_attach = 0xffff; + props->max_mcast_grp = ib_ipath_max_mcast_grps; + props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; @@ -643,10 +695,10 @@ static int ipath_query_port(struct ib_device *ibdev, ipath_layer_get_lastibcstat(dev->dd) & 0xf]; props->port_cap_flags = dev->port_cap_flags; props->gid_tbl_len = 1; - props->max_msg_sz = 4096; + props->max_msg_sz = 0x80000000; props->pkey_tbl_len = ipath_layer_get_npkeys(dev->dd); props->bad_pkey_cntr = ipath_layer_get_cr_errpkey(dev->dd) - - 
dev->n_pkey_violations; + dev->z_pkey_violations; props->qkey_viol_cntr = dev->qkey_violations; props->active_width = IB_WIDTH_4X; /* See rate_show() */ @@ -743,15 +795,30 @@ static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { + struct ipath_ibdev *dev = to_idev(ibdev); struct ipath_pd *pd; struct ib_pd *ret; + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + if (dev->n_pds_allocated == ib_ipath_max_pds) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) { ret = ERR_PTR(-ENOMEM); goto bail; } + dev->n_pds_allocated++; + /* ib_alloc_pd() will initialize pd->ibpd. */ pd->user = udata != NULL; @@ -764,6 +831,9 @@ bail: static int ipath_dealloc_pd(struct ib_pd *ibpd) { struct ipath_pd *pd = to_ipd(ibpd); + struct ipath_ibdev *dev = to_idev(ibpd->device); + + dev->n_pds_allocated--; kfree(pd); @@ -782,21 +852,40 @@ static struct ib_ah *ipath_create_ah(struct ib_pd *pd, { struct ipath_ah *ah; struct ib_ah *ret; + struct ipath_ibdev *dev = to_idev(pd->device); + + if (dev->n_ahs_allocated == ib_ipath_max_ahs) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } /* A multicast address requires a GRH (see ch. 8.4.1). 
*/ - if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE && - ah_attr->dlid != IPS_PERMISSIVE_LID && + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID && !(ah_attr->ah_flags & IB_AH_GRH)) { ret = ERR_PTR(-EINVAL); goto bail; } + if (ah_attr->dlid == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->port_num < 1 || + ah_attr->port_num > pd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + ah = kmalloc(sizeof *ah, GFP_ATOMIC); if (!ah) { ret = ERR_PTR(-ENOMEM); goto bail; } + dev->n_ahs_allocated++; + /* ib_create_ah() will initialize ah->ibah. */ ah->attr = *ah_attr; @@ -814,8 +903,11 @@ bail: */ static int ipath_destroy_ah(struct ib_ah *ibah) { + struct ipath_ibdev *dev = to_idev(ibah->device); struct ipath_ah *ah = to_iah(ibah); + dev->n_ahs_allocated--; + kfree(ah); return 0; @@ -889,6 +981,7 @@ static int ipath_verbs_register_sysfs(struct ib_device *dev); */ static void *ipath_register_ib_device(int unit, struct ipath_devdata *dd) { + struct ipath_layer_counters cntrs; struct ipath_ibdev *idev; struct ib_device *dev; int ret; @@ -939,6 +1032,25 @@ static void *ipath_register_ib_device(int unit, struct ipath_devdata *dd) idev->pma_counter_select[5] = IB_PMA_PORT_XMIT_WAIT; idev->link_width_enabled = 3; /* 1x or 4x */ + /* Snapshot current HW counters to "clear" them. 
*/ + ipath_layer_get_counters(dd, &cntrs); + idev->z_symbol_error_counter = cntrs.symbol_error_counter; + idev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + idev->z_link_downed_counter = cntrs.link_downed_counter; + idev->z_port_rcv_errors = cntrs.port_rcv_errors; + idev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + idev->z_port_xmit_discards = cntrs.port_xmit_discards; + idev->z_port_xmit_data = cntrs.port_xmit_data; + idev->z_port_rcv_data = cntrs.port_rcv_data; + idev->z_port_xmit_packets = cntrs.port_xmit_packets; + idev->z_port_rcv_packets = cntrs.port_rcv_packets; + idev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + idev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + /* * The system image GUID is supposed to be the same for all * IB HCAs in a single system but since there can be other @@ -1109,11 +1221,8 @@ static ssize_t show_rev(struct class_device *cdev, char *buf) { struct ipath_ibdev *dev = container_of(cdev, struct ipath_ibdev, ibdev.class_dev); - int vendor, boardrev, majrev, minrev; - ipath_layer_query_device(dev->dd, &vendor, &boardrev, - &majrev, &minrev); - return sprintf(buf, "%d.%d\n", majrev, minrev); + return sprintf(buf, "%x\n", ipath_layer_get_pcirev(dev->dd)); } static ssize_t show_hca(struct class_device *cdev, char *buf) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h index 4f8d59300e9..2df684727dc 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.h +++ b/drivers/infiniband/hw/ipath/ipath_verbs.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -148,6 +149,7 @@ struct ipath_mcast { struct list_head qp_list; wait_queue_head_t wait; atomic_t refcount; + int n_attached; }; /* Memory region */ @@ -305,32 +307,34 @@ struct ipath_qp { u32 s_next_psn; /* PSN for next request */ u32 s_last_psn; /* last response PSN processed */ u32 s_psn; /* current packet sequence number */ + u32 s_ack_psn; /* PSN for RDMA_READ */ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ - u32 s_ack_psn; /* PSN for next ACK or RDMA_READ */ - u64 s_ack_atomic; /* data for atomic ACK */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ u64 r_wr_id; /* ID for current receive WQE */ u64 r_atomic_data; /* data for last atomic op */ u32 r_atomic_psn; /* PSN of last atomic op */ u32 r_len; /* total length of r_sge */ u32 r_rcv_len; /* receive data len processed */ u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ u8 state; /* QP state */ u8 s_state; /* opcode of last packet sent */ u8 s_ack_state; /* opcode of packet to ACK */ u8 s_nak_state; /* non-zero if NAK is pending */ u8 r_state; /* opcode of last packet received */ + u8 r_ack_state; /* opcode of packet to ACK */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ u8 r_reuse_sge; /* for UC receive errors */ u8 r_sge_inx; /* current index into sg_list */ - u8 s_max_sge; /* size of s_wq->sg_list */ u8 qp_access_flags; + u8 s_max_sge; /* size of s_wq->sg_list */ u8 s_retry_cnt; /* number of times to retry */ u8 s_rnr_retry_cnt; - u8 s_min_rnr_timer; u8 s_retry; /* requester retry counter */ u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_pkey_index; /* PKEY index to use */ enum ib_mtu path_mtu; - atomic_t msn; /* message sequence number */ u32 remote_qpn; u32 qkey; /* QKEY for this QP (for UD or RD) */ u32 s_size; /* send work queue size */ @@ -431,6 +435,11 @@ struct ipath_ibdev { __be64 
sys_image_guid; /* in network order */ __be64 gid_prefix; /* in network order */ __be64 mkey; + u32 n_pds_allocated; /* number of PDs allocated for device */ + u32 n_ahs_allocated; /* number of AHs allocated for device */ + u32 n_cqs_allocated; /* number of CQs allocated for device */ + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ u64 ipath_sword; /* total dwords sent (sample result) */ u64 ipath_rword; /* total dwords received (sample result) */ u64 ipath_spkts; /* total packets sent (sample result) */ @@ -442,17 +451,19 @@ struct ipath_ibdev { u64 n_unicast_rcv; /* total unicast packets received */ u64 n_multicast_xmit; /* total multicast packets sent */ u64 n_multicast_rcv; /* total multicast packets received */ - u64 n_symbol_error_counter; /* starting count for PMA */ - u64 n_link_error_recovery_counter; /* starting count for PMA */ - u64 n_link_downed_counter; /* starting count for PMA */ - u64 n_port_rcv_errors; /* starting count for PMA */ - u64 n_port_rcv_remphys_errors; /* starting count for PMA */ - u64 n_port_xmit_discards; /* starting count for PMA */ - u64 n_port_xmit_data; /* starting count for PMA */ - u64 n_port_rcv_data; /* starting count for PMA */ - u64 n_port_xmit_packets; /* starting count for PMA */ - u64 n_port_rcv_packets; /* starting count for PMA */ - u32 n_pkey_violations; /* starting count for PMA */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count 
for PMA */ + u32 z_pkey_violations; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ u32 n_rc_resends; u32 n_rc_acks; u32 n_rc_qacks; @@ -462,6 +473,7 @@ struct ipath_ibdev { u32 n_other_naks; u32 n_timeouts; u32 n_pkt_drops; + u32 n_vl15_dropped; u32 n_wqe_errs; u32 n_rdma_dup_busy; u32 n_piowait; @@ -580,10 +592,6 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc); void ipath_get_credit(struct ipath_qp *qp, u32 aeth); -void ipath_do_rc_send(unsigned long data); - -void ipath_do_uc_send(unsigned long data); - void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss, @@ -596,7 +604,7 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); -int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr); +int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr); void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, int has_grh, void *data, u32 tlen, struct ipath_qp *qp); @@ -678,7 +686,19 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp); int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); -void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc); +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void ipath_do_ruc_send(unsigned long data); + +u32 ipath_make_rc_ack(struct ipath_qp *qp, struct ipath_other_headers *ohdr, + u32 pmtu); + +int ipath_make_rc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p); + +int ipath_make_uc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr, + u32 pmtu, u32 *bth0p, u32 *bth2p); extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; @@ -688,6 +708,24 @@ extern const int 
ib_ipath_state_ops[]; extern unsigned int ib_ipath_lkey_table_size; +extern unsigned int ib_ipath_max_cqes; + +extern unsigned int ib_ipath_max_cqs; + +extern unsigned int ib_ipath_max_qp_wrs; + +extern unsigned int ib_ipath_max_sges; + +extern unsigned int ib_ipath_max_mcast_grps; + +extern unsigned int ib_ipath_max_mcast_qp_attached; + +extern unsigned int ib_ipath_max_srqs; + +extern unsigned int ib_ipath_max_srq_sges; + +extern unsigned int ib_ipath_max_srq_wrs; + extern const u32 ib_ipath_rnr_table[]; #endif /* IPATH_VERBS_H */ diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index 10b31d2c4f2..ee0e1d96d72 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -92,6 +93,7 @@ static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid) INIT_LIST_HEAD(&mcast->qp_list); init_waitqueue_head(&mcast->wait); atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; bail: return mcast; @@ -157,7 +159,8 @@ bail: * the table but the QP was added. Return ESRCH if the QP was already * attached and neither structure was added. */ -static int ipath_mcast_add(struct ipath_mcast *mcast, +static int ipath_mcast_add(struct ipath_ibdev *dev, + struct ipath_mcast *mcast, struct ipath_mcast_qp *mqp) { struct rb_node **n = &mcast_tree.rb_node; @@ -188,34 +191,47 @@ static int ipath_mcast_add(struct ipath_mcast *mcast, /* Search the QP list to see if this is already there. 
*/ list_for_each_entry_rcu(p, &tmcast->qp_list, list) { if (p->qp == mqp->qp) { - spin_unlock_irqrestore(&mcast_lock, flags); ret = ESRCH; goto bail; } } + if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); - spin_unlock_irqrestore(&mcast_lock, flags); ret = EEXIST; goto bail; } + if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) { + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + list_add_tail_rcu(&mqp->list, &mcast->qp_list); atomic_inc(&mcast->refcount); rb_link_node(&mcast->rb_node, pn, n); rb_insert_color(&mcast->rb_node, &mcast_tree); - spin_unlock_irqrestore(&mcast_lock, flags); - ret = 0; bail: + spin_unlock_irqrestore(&mcast_lock, flags); + return ret; } int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); struct ipath_mcast *mcast; struct ipath_mcast_qp *mqp; int ret; @@ -235,7 +251,7 @@ int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) ret = -ENOMEM; goto bail; } - switch (ipath_mcast_add(mcast, mqp)) { + switch (ipath_mcast_add(dev, mcast, mqp)) { case ESRCH: /* Neither was used: can't attach the same QP twice. */ ipath_mcast_qp_free(mqp); @@ -245,6 +261,12 @@ int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) case EEXIST: /* The mcast wasn't used */ ipath_mcast_free(mcast); break; + case ENOMEM: + /* Exceeded the maximum number of mcast groups. 
*/ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; default: break; } @@ -258,6 +280,7 @@ bail: int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); struct ipath_mcast *mcast = NULL; struct ipath_mcast_qp *p, *tmp; struct rb_node *n; @@ -272,7 +295,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) while (1) { if (n == NULL) { spin_unlock_irqrestore(&mcast_lock, flags); - ret = 0; + ret = -EINVAL; goto bail; } @@ -296,6 +319,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) * link until we are sure there are no list walkers. */ list_del_rcu(&p->list); + mcast->n_attached--; /* If this was the last attached QP, remove the GID too. */ if (list_empty(&mcast->qp_list)) { @@ -319,6 +343,7 @@ int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); ipath_mcast_free(mcast); + dev->n_mcast_grps_allocated--; } ret = 0; diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c index adc5322f15c..f8f9e2e8cbd 100644 --- a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c +++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/ipath/ips_common.h b/drivers/infiniband/hw/ipath/ips_common.h deleted file mode 100644 index ab7cbbbfd03..00000000000 --- a/drivers/infiniband/hw/ipath/ips_common.h +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef IPS_COMMON_H -#define IPS_COMMON_H -/* - * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ipath_common.h" - -struct ipath_header { - /* - * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - - * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, - * Port 3, TID 11, offset 14. - */ - __le32 ver_port_tid_offset; - __le16 chksum; - __le16 pkt_flags; -}; - -struct ips_message_header { - __be16 lrh[4]; - __be32 bth[3]; - /* fields below this point are in host byte order */ - struct ipath_header iph; - __u8 sub_opcode; - __u8 flags; - __u16 src_rank; - /* 24 bits. 
The upper 8 bit is available for other use */ - union { - struct { - unsigned ack_seq_num:24; - unsigned port:4; - unsigned unused:4; - }; - __u32 ack_seq_num_org; - }; - __u8 expected_tid_session_id; - __u8 tinylen; /* to aid MPI */ - union { - __u16 tag; /* to aid MPI */ - __u16 mqhdr; /* for PSM MQ */ - }; - union { - __u32 mpi[4]; /* to aid MPI */ - __u32 data[4]; - __u64 mq[2]; /* for PSM MQ */ - struct { - __u16 mtu; - __u8 major_ver; - __u8 minor_ver; - __u32 not_used; //free - __u32 run_id; - __u32 client_ver; - }; - }; -}; - -struct ether_header { - __be16 lrh[4]; - __be32 bth[3]; - struct ipath_header iph; - __u8 sub_opcode; - __u8 cmd; - __be16 lid; - __u16 mac[3]; - __u8 frag_num; - __u8 seq_num; - __le32 len; - /* MUST be of word size due to PIO write requirements */ - __le32 csum; - __le16 csum_offset; - __le16 flags; - __u16 first_2_bytes; - __u8 unused[2]; /* currently unused */ -}; - -/* - * The PIO buffer used for sending infinipath messages must only be written - * in 32-bit words, all the data must be written, and no writes can occur - * after the last word is written (which transfers "ownership" of the buffer - * to the chip and triggers the message to be sent). - * Since the Linux sk_buff structure can be recursive, non-aligned, and - * any number of bytes in each segment, we use the following structure - * to keep information about the overall state of the copy operation. - * This is used to save the information needed to store the checksum - * in the right place before sending the last word to the hardware and - * to buffer the last 0-3 bytes of non-word sized segments. - */ -struct copy_data_s { - struct ether_header *hdr; - /* addr of PIO buf to write csum to */ - __u32 __iomem *csum_pio; - __u32 __iomem *to; /* addr of PIO buf to write data to */ - __u32 device; /* which device to allocate PIO bufs from */ - __s32 error; /* set if there is an error. 
*/ - __s32 extra; /* amount of data saved in u.buf below */ - __u32 len; /* total length to send in bytes */ - __u32 flen; /* frament length in words */ - __u32 csum; /* partial IP checksum */ - __u32 pos; /* position for partial checksum */ - __u32 offset; /* offset to where data currently starts */ - __s32 checksum_calc; /* set to 1 when csum has been calculated */ - struct sk_buff *skb; - union { - __u32 w; - __u8 buf[4]; - } u; -}; - -/* IB - LRH header consts */ -#define IPS_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ -#define IPS_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ - -#define IPS_OFFSET 0 - -/* - * defines the cut-off point between the header queue and eager/expected - * TID queue - */ -#define NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE \ - ((sizeof(struct ips_message_header) - \ - offsetof(struct ips_message_header, iph)) >> 2) - -/* OpCodes */ -#define OPCODE_IPS 0xC0 -#define OPCODE_ITH4X 0xC1 - -/* OpCode 30 is use by stand-alone test programs */ -#define OPCODE_RAW_DATA 0xDE -/* last OpCode (31) is reserved for test */ -#define OPCODE_TEST 0xDF - -/* sub OpCodes - ips */ -#define OPCODE_SEQ_DATA 0x01 -#define OPCODE_SEQ_CTRL 0x02 - -#define OPCODE_SEQ_MQ_DATA 0x03 -#define OPCODE_SEQ_MQ_CTRL 0x04 - -#define OPCODE_ACK 0x10 -#define OPCODE_NAK 0x11 - -#define OPCODE_ERR_CHK 0x20 -#define OPCODE_ERR_CHK_PLS 0x21 - -#define OPCODE_STARTUP 0x30 -#define OPCODE_STARTUP_ACK 0x31 -#define OPCODE_STARTUP_NAK 0x32 - -#define OPCODE_STARTUP_EXT 0x34 -#define OPCODE_STARTUP_ACK_EXT 0x35 -#define OPCODE_STARTUP_NAK_EXT 0x36 - -#define OPCODE_TIDS_RELEASE 0x40 -#define OPCODE_TIDS_RELEASE_CONFIRM 0x41 - -#define OPCODE_CLOSE 0x50 -#define OPCODE_CLOSE_ACK 0x51 -/* - * like OPCODE_CLOSE, but no complaint if other side has already closed. - * Used when doing abort(), MPI_Abort(), etc. 
- */ -#define OPCODE_ABORT 0x52 - -/* sub OpCodes - ith4x */ -#define OPCODE_ENCAP 0x81 -#define OPCODE_LID_ARP 0x82 - -/* Receive Header Queue: receive type (from infinipath) */ -#define RCVHQ_RCV_TYPE_EXPECTED 0 -#define RCVHQ_RCV_TYPE_EAGER 1 -#define RCVHQ_RCV_TYPE_NON_KD 2 -#define RCVHQ_RCV_TYPE_ERROR 3 - -/* misc. */ -#define SIZE_OF_CRC 1 - -#define EAGER_TID_ID INFINIPATH_I_TID_MASK - -#define IPS_DEFAULT_P_KEY 0xFFFF - -#define IPS_PERMISSIVE_LID 0xFFFF -#define IPS_MULTICAST_LID_BASE 0xC000 - -#define IPS_AETH_CREDIT_SHIFT 24 -#define IPS_AETH_CREDIT_MASK 0x1F -#define IPS_AETH_CREDIT_INVAL 0x1F - -#define IPS_PSN_MASK 0xFFFFFF -#define IPS_MSN_MASK 0xFFFFFF -#define IPS_QPN_MASK 0xFFFFFF -#define IPS_MULTICAST_QPN 0xFFFFFF - -/* functions for extracting fields from rcvhdrq entries */ -static inline __u32 ips_get_hdr_err_flags(const __le32 * rbuf) -{ - return __le32_to_cpu(rbuf[1]); -} - -static inline __u32 ips_get_index(const __le32 * rbuf) -{ - return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) - & INFINIPATH_RHF_EGRINDEX_MASK; -} - -static inline __u32 ips_get_rcv_type(const __le32 * rbuf) -{ - return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) - & INFINIPATH_RHF_RCVTYPE_MASK; -} - -static inline __u32 ips_get_length_in_bytes(const __le32 * rbuf) -{ - return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) - & INFINIPATH_RHF_LENGTH_MASK) << 2; -} - -static inline void *ips_get_first_protocol_header(const __u32 * rbuf) -{ - return (void *)&rbuf[2]; -} - -static inline struct ips_message_header *ips_get_ips_header(const __u32 * - rbuf) -{ - return (struct ips_message_header *)&rbuf[2]; -} - -static inline __u32 ips_get_ipath_ver(__le32 hdrword) -{ - return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) - & INFINIPATH_I_VERS_MASK; -} - -#endif /* IPS_COMMON_H */ diff --git a/drivers/infiniband/hw/ipath/verbs_debug.h b/drivers/infiniband/hw/ipath/verbs_debug.h index 40d693cf3f9..6186676f2a1 100644 --- 
a/drivers/infiniband/hw/ipath/verbs_debug.h +++ b/drivers/infiniband/hw/ipath/verbs_debug.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 2006 QLogic, Inc. All rights reserved. * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index d536217e700..a29b1b6d82b 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -900,7 +900,7 @@ int __devinit mthca_init_eq_table(struct mthca_dev *dev) mthca_is_memfree(dev) ? mthca_arbel_interrupt : mthca_tavor_interrupt, - SA_SHIRQ, DRV_NAME, dev); + IRQF_SHARED, DRV_NAME, dev); if (err) goto err_out_cmd; dev->eq_table.have_irq = 1; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 9b9ff7bff35..557cde3a456 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -34,7 +34,6 @@ * $Id: mthca_main.c 1396 2004-12-28 04:10:27Z roland $ */ -#include <linux/config.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> @@ -172,8 +171,9 @@ static int __devinit mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) { mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than " - "PCI resource 2 size of 0x%lx, aborting.\n", - dev_lim->uar_size, pci_resource_len(mdev->pdev, 2)); + "PCI resource 2 size of 0x%llx, aborting.\n", + dev_lim->uar_size, + (unsigned long long)pci_resource_len(mdev->pdev, 2)); return -ENODEV; } diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c index f4fddd5327f..91934f2d9db 100644 --- a/drivers/infiniband/hw/mthca/mthca_reset.c +++ b/drivers/infiniband/hw/mthca/mthca_reset.c @@ -32,7 +32,6 @@ * $Id: mthca_reset.c 1349 2004-12-16 21:09:43Z roland $ */ -#include 
<linux/config.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/pci.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 491d2afaf5b..3f89f5e1903 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -42,7 +42,6 @@ #include <linux/netdevice.h> #include <linux/workqueue.h> #include <linux/pci.h> -#include <linux/config.h> #include <linux/kref.h> #include <linux/if_infiniband.h> #include <linux/mutex.h> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 216471fa01c..ab40488182b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -864,8 +864,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) if (mcast) { /* Destroy the send only entry */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); rb_replace_node(&mcast->rb_node, &nmcast->rb_node, @@ -890,8 +889,7 @@ void ipoib_mcast_restart_task(void *dev_ptr) rb_erase(&mcast->rb_node, &priv->multicast_tree); /* Move to the remove list */ - list_del(&mcast->list); - list_add_tail(&mcast->list, &remove_list); + list_move_tail(&mcast->list, &remove_list); } } diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig new file mode 100644 index 00000000000..fead87d1eff --- /dev/null +++ b/drivers/infiniband/ulp/iser/Kconfig @@ -0,0 +1,11 @@ +config INFINIBAND_ISER + tristate "ISCSI RDMA Protocol" + depends on INFINIBAND && SCSI + select SCSI_ISCSI_ATTRS + ---help--- + Support for the ISCSI RDMA Protocol over InfiniBand. This + allows you to access storage devices that speak ISER/ISCSI + over InfiniBand. + + The ISER protocol is defined by IETF. + See <http://www.ietf.org/>. 
diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile new file mode 100644 index 00000000000..fe6cd15f231 --- /dev/null +++ b/drivers/infiniband/ulp/iser/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o + +ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \ + iscsi_iser.o diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c new file mode 100644 index 00000000000..b2c033edb03 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -0,0 +1,789 @@ +/* + * iSCSI Initiator over iSER Data-Path + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved. + * maintained by openib-general@openib.org + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Credits: + * Christoph Hellwig + * FUJITA Tomonori + * Arne Redlich + * Zhenyu Wang + * Modified by: + * Erez Zilber + * + * + * $Id: iscsi_iser.c 6965 2006-05-07 11:36:20Z ogerlitz $ + */ + +#include <linux/types.h> +#include <linux/list.h> +#include <linux/hardirq.h> +#include <linux/kfifo.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/ioctl.h> +#include <linux/cdev.h> +#include <linux/in.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/delay.h> + +#include <net/sock.h> + +#include <asm/uaccess.h> + +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_device.h> +#include <scsi/scsi_eh.h> +#include <scsi/scsi_tcq.h> +#include <scsi/scsi_host.h> +#include <scsi/scsi.h> +#include <scsi/scsi_transport_iscsi.h> + +#include "iscsi_iser.h" + +static unsigned int iscsi_max_lun = 512; +module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO); + +int iser_debug_level = 0; + +MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover " + "v" DRV_VER " (" DRV_DATE ")"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz"); + +module_param_named(debug_level, iser_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)"); + +struct iser_global ig; + +void +iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, char *rx_data, int rx_data_len) +{ + int rc = 0; + uint32_t ret_itt; + int datalen; + int ahslen; + + /* verify PDU length */ + datalen = ntoh24(hdr->dlength); + if (datalen != rx_data_len) { + printk(KERN_ERR "iscsi_iser: datalen %d (hdr) != %d (IB) \n", + datalen, rx_data_len); + rc = ISCSI_ERR_DATALEN; + goto error; + } + + /* read AHS */ + 
ahslen = hdr->hlength * 4; + + /* verify itt (itt encoding: age+cid+itt) */ + rc = iscsi_verify_itt(conn, hdr, &ret_itt); + + if (!rc) + rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len); + + if (rc && rc != ISCSI_ERR_NO_SCSI_CMD) + goto error; + + return; +error: + iscsi_conn_failure(conn, rc); +} + + +/** + * iscsi_iser_cmd_init - Initialize iSCSI SCSI_READ or SCSI_WRITE commands + * + **/ +static void +iscsi_iser_cmd_init(struct iscsi_cmd_task *ctask) +{ + struct iscsi_iser_conn *iser_conn = ctask->conn->dd_data; + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + struct scsi_cmnd *sc = ctask->sc; + + iser_ctask->command_sent = 0; + iser_ctask->iser_conn = iser_conn; + + if (sc->sc_data_direction == DMA_TO_DEVICE) { + BUG_ON(ctask->total_length == 0); + /* bytes to be sent via RDMA operations */ + iser_ctask->rdma_data_count = ctask->total_length - + ctask->imm_count - + ctask->unsol_count; + + debug_scsi("cmd [itt %x total %d imm %d unsol_data %d " + "rdma_data %d]\n", + ctask->itt, ctask->total_length, ctask->imm_count, + ctask->unsol_count, iser_ctask->rdma_data_count); + } else + /* bytes to be sent via RDMA operations */ + iser_ctask->rdma_data_count = ctask->total_length; + + iser_ctask_rdma_init(iser_ctask); +} + +/** + * iscsi_mtask_xmit - xmit management(immediate) task + * @conn: iscsi connection + * @mtask: task management task + * + * Notes: + * The function can return -EAGAIN in which case caller must + * call it again later, or recover. '0' return code means successful + * xmit. + * + **/ +static int +iscsi_iser_mtask_xmit(struct iscsi_conn *conn, + struct iscsi_mgmt_task *mtask) +{ + int error = 0; + + debug_scsi("mtask deq [cid %d itt 0x%x]\n", conn->id, mtask->itt); + + error = iser_send_control(conn, mtask); + + /* since iser xmits control with zero copy, mtasks can not be recycled + * right after sending them. 
+ * The recycling scheme is based on whether a response is expected + * - if yes, the mtask is recycled at iscsi_complete_pdu + * - if no, the mtask is recycled at iser_snd_completion + */ + if (error && error != -EAGAIN) + iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); + + return error; +} + +static int +iscsi_iser_ctask_xmit_unsol_data(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask) +{ + struct iscsi_data hdr; + int error = 0; + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + + /* Send data-out PDUs while there's still unsolicited data to send */ + while (ctask->unsol_count > 0) { + iscsi_prep_unsolicit_data_pdu(ctask, &hdr, + iser_ctask->rdma_data_count); + + debug_scsi("Sending data-out: itt 0x%x, data count %d\n", + hdr.itt, ctask->data_count); + + /* the buffer description has been passed with the command */ + /* Send the command */ + error = iser_send_data_out(conn, ctask, &hdr); + if (error) { + ctask->unsol_datasn--; + goto iscsi_iser_ctask_xmit_unsol_data_exit; + } + ctask->unsol_count -= ctask->data_count; + debug_scsi("Need to send %d more as data-out PDUs\n", + ctask->unsol_count); + } + +iscsi_iser_ctask_xmit_unsol_data_exit: + return error; +} + +static int +iscsi_iser_ctask_xmit(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask) +{ + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + int error = 0; + + debug_scsi("ctask deq [cid %d itt 0x%x]\n", + conn->id, ctask->itt); + + /* + * serialize with TMF AbortTask + */ + if (ctask->mtask) + return error; + + /* Send the cmd PDU */ + if (!iser_ctask->command_sent) { + error = iser_send_command(conn, ctask); + if (error) + goto iscsi_iser_ctask_xmit_exit; + iser_ctask->command_sent = 1; + } + + /* Send unsolicited data-out PDU(s) if necessary */ + if (ctask->unsol_count) + error = iscsi_iser_ctask_xmit_unsol_data(conn, ctask); + + iscsi_iser_ctask_xmit_exit: + if (error && error != -EAGAIN) + iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); + return error; +} + +static 
void +iscsi_iser_cleanup_ctask(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask) +{ + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + + if (iser_ctask->status == ISER_TASK_STATUS_STARTED) { + iser_ctask->status = ISER_TASK_STATUS_COMPLETED; + iser_ctask_rdma_finalize(iser_ctask); + } +} + +static struct iser_conn * +iscsi_iser_ib_conn_lookup(__u64 ep_handle) +{ + struct iser_conn *ib_conn; + struct iser_conn *uib_conn = (struct iser_conn *)(unsigned long)ep_handle; + + mutex_lock(&ig.connlist_mutex); + list_for_each_entry(ib_conn, &ig.connlist, conn_list) { + if (ib_conn == uib_conn) { + mutex_unlock(&ig.connlist_mutex); + return ib_conn; + } + } + mutex_unlock(&ig.connlist_mutex); + iser_err("no conn exists for eph %llx\n",(unsigned long long)ep_handle); + return NULL; +} + +static struct iscsi_cls_conn * +iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) +{ + struct iscsi_conn *conn; + struct iscsi_cls_conn *cls_conn; + struct iscsi_iser_conn *iser_conn; + + cls_conn = iscsi_conn_setup(cls_session, conn_idx); + if (!cls_conn) + return NULL; + conn = cls_conn->dd_data; + + /* + * due to issues with the login code re iser sematics + * this not set in iscsi_conn_setup - FIXME + */ + conn->max_recv_dlength = 128; + + iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL); + if (!iser_conn) + goto conn_alloc_fail; + + /* currently this is the only field which need to be initiated */ + rwlock_init(&iser_conn->lock); + + conn->dd_data = iser_conn; + iser_conn->iscsi_conn = conn; + + return cls_conn; + +conn_alloc_fail: + iscsi_conn_teardown(cls_conn); + return NULL; +} + +static void +iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_iser_conn *iser_conn = conn->dd_data; + + iscsi_conn_teardown(cls_conn); + kfree(iser_conn); +} + +static int +iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, + struct iscsi_cls_conn *cls_conn, uint64_t 
transport_eph, + int is_leading) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_iser_conn *iser_conn; + struct iser_conn *ib_conn; + int error; + + error = iscsi_conn_bind(cls_session, cls_conn, is_leading); + if (error) + return error; + + /* the transport ep handle comes from user space so it must be + * verified against the global ib connections list */ + ib_conn = iscsi_iser_ib_conn_lookup(transport_eph); + if (!ib_conn) { + iser_err("can't bind eph %llx\n", + (unsigned long long)transport_eph); + return -EINVAL; + } + /* binds the iSER connection retrieved from the previously + * connected ep_handle to the iSCSI layer connection. exchanges + * connection pointers */ + iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn); + iser_conn = conn->dd_data; + ib_conn->iser_conn = iser_conn; + iser_conn->ib_conn = ib_conn; + + conn->recv_lock = &iser_conn->lock; + + return 0; +} + +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + int err; + + err = iscsi_conn_start(cls_conn); + if (err) + return err; + + return iser_conn_set_full_featured_mode(conn); +} + +static void +iscsi_iser_conn_terminate(struct iscsi_conn *conn) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_conn *ib_conn = iser_conn->ib_conn; + + BUG_ON(!ib_conn); + /* starts conn teardown process, waits until all previously * + * posted buffers get flushed, deallocates all conn resources */ + iser_conn_terminate(ib_conn); + iser_conn->ib_conn = NULL; + conn->recv_lock = NULL; +} + + +static struct iscsi_transport iscsi_iser_transport; + +static struct iscsi_cls_session * +iscsi_iser_session_create(struct iscsi_transport *iscsit, + struct scsi_transport_template *scsit, + uint32_t initial_cmdsn, uint32_t *hostno) +{ + struct iscsi_cls_session *cls_session; + struct iscsi_session *session; + int i; + uint32_t hn; + struct iscsi_cmd_task *ctask; + struct iscsi_mgmt_task *mtask; + struct 
iscsi_iser_cmd_task *iser_ctask; + struct iser_desc *desc; + + cls_session = iscsi_session_setup(iscsit, scsit, + sizeof(struct iscsi_iser_cmd_task), + sizeof(struct iser_desc), + initial_cmdsn, &hn); + if (!cls_session) + return NULL; + + *hostno = hn; + session = class_to_transport_session(cls_session); + + /* libiscsi setup itts, data and pool so just set desc fields */ + for (i = 0; i < session->cmds_max; i++) { + ctask = session->cmds[i]; + iser_ctask = ctask->dd_data; + ctask->hdr = (struct iscsi_cmd *)&iser_ctask->desc.iscsi_header; + } + + for (i = 0; i < session->mgmtpool_max; i++) { + mtask = session->mgmt_cmds[i]; + desc = mtask->dd_data; + mtask->hdr = &desc->iscsi_header; + desc->data = mtask->data; + } + + return cls_session; +} + +static int +iscsi_iser_conn_set_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, uint32_t value) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + struct iscsi_session *session = conn->session; + + spin_lock_bh(&session->lock); + if (conn->c_stage != ISCSI_CONN_INITIAL_STAGE && + conn->stop_stage != STOP_CONN_RECOVER) { + printk(KERN_ERR "iscsi_iser: can not change parameter [%d]\n", + param); + spin_unlock_bh(&session->lock); + return 0; + } + spin_unlock_bh(&session->lock); + + switch (param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + /* TBD */ + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + conn->max_xmit_dlength = value; + break; + case ISCSI_PARAM_HDRDGST_EN: + if (value) { + printk(KERN_ERR "HeaderDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_DATADGST_EN: + if (value) { + printk(KERN_ERR "DataDigest wasn't negotiated to None\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_INITIAL_R2T_EN: + session->initial_r2t_en = value; + break; + case ISCSI_PARAM_IMM_DATA_EN: + session->imm_data_en = value; + break; + case ISCSI_PARAM_FIRST_BURST: + session->first_burst = value; + break; + case ISCSI_PARAM_MAX_BURST: + session->max_burst = value; + break; + case
ISCSI_PARAM_PDU_INORDER_EN: + session->pdu_inorder_en = value; + break; + case ISCSI_PARAM_DATASEQ_INORDER_EN: + session->dataseq_inorder_en = value; + break; + case ISCSI_PARAM_ERL: + session->erl = value; + break; + case ISCSI_PARAM_IFMARKER_EN: + if (value) { + printk(KERN_ERR "IFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + case ISCSI_PARAM_OFMARKER_EN: + if (value) { + printk(KERN_ERR "OFMarker wasn't negotiated to No\n"); + return -EPROTO; + } + break; + default: + break; + } + + return 0; +} + +static int +iscsi_iser_session_get_param(struct iscsi_cls_session *cls_session, + enum iscsi_param param, uint32_t *value) +{ + struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); + struct iscsi_session *session = iscsi_hostdata(shost->hostdata); + + switch (param) { + case ISCSI_PARAM_INITIAL_R2T_EN: + *value = session->initial_r2t_en; + break; + case ISCSI_PARAM_MAX_R2T: + *value = session->max_r2t; + break; + case ISCSI_PARAM_IMM_DATA_EN: + *value = session->imm_data_en; + break; + case ISCSI_PARAM_FIRST_BURST: + *value = session->first_burst; + break; + case ISCSI_PARAM_MAX_BURST: + *value = session->max_burst; + break; + case ISCSI_PARAM_PDU_INORDER_EN: + *value = session->pdu_inorder_en; + break; + case ISCSI_PARAM_DATASEQ_INORDER_EN: + *value = session->dataseq_inorder_en; + break; + case ISCSI_PARAM_ERL: + *value = session->erl; + break; + case ISCSI_PARAM_IFMARKER_EN: + *value = 0; + break; + case ISCSI_PARAM_OFMARKER_EN: + *value = 0; + break; + default: + return ISCSI_ERR_PARAM_NOT_FOUND; + } + + return 0; +} + +static int +iscsi_iser_conn_get_param(struct iscsi_cls_conn *cls_conn, + enum iscsi_param param, uint32_t *value) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + switch(param) { + case ISCSI_PARAM_MAX_RECV_DLENGTH: + *value = conn->max_recv_dlength; + break; + case ISCSI_PARAM_MAX_XMIT_DLENGTH: + *value = conn->max_xmit_dlength; + break; + case ISCSI_PARAM_HDRDGST_EN: + *value = 0; + break; + case
ISCSI_PARAM_DATADGST_EN: + *value = 0; + break; + /*case ISCSI_PARAM_TARGET_RECV_DLENGTH: + *value = conn->target_recv_dlength; + break; + case ISCSI_PARAM_INITIATOR_RECV_DLENGTH: + *value = conn->initiator_recv_dlength; + break;*/ + default: + return ISCSI_ERR_PARAM_NOT_FOUND; + } + + return 0; +} + + +static void +iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + + stats->txdata_octets = conn->txdata_octets; + stats->rxdata_octets = conn->rxdata_octets; + stats->scsicmd_pdus = conn->scsicmd_pdus_cnt; + stats->dataout_pdus = conn->dataout_pdus_cnt; + stats->scsirsp_pdus = conn->scsirsp_pdus_cnt; + stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */ + stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */ + stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt; + stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt; + stats->custom_length = 3; + strcpy(stats->custom[0].desc, "qp_tx_queue_full"); + stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */ + strcpy(stats->custom[1].desc, "fmr_map_not_avail"); + stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */; + strcpy(stats->custom[2].desc, "eh_abort_cnt"); + stats->custom[2].value = conn->eh_abort_cnt; +} + +static int +iscsi_iser_ep_connect(struct sockaddr *dst_addr, int non_blocking, + __u64 *ep_handle) +{ + int err; + struct iser_conn *ib_conn; + + err = iser_conn_init(&ib_conn); + if (err) + goto out; + + err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr, non_blocking); + if (!err) + *ep_handle = (__u64)(unsigned long)ib_conn; + +out: + return err; +} + +static int +iscsi_iser_ep_poll(__u64 ep_handle, int timeout_ms) +{ + struct iser_conn *ib_conn = iscsi_iser_ib_conn_lookup(ep_handle); + int rc; + + if (!ib_conn) + return -EINVAL; + + rc = wait_event_interruptible_timeout(ib_conn->wait, + ib_conn->state == ISER_CONN_UP, + msecs_to_jiffies(timeout_ms)); + + /* if conn establishment failed, return 
error code to iscsi */ + if (!rc && + (ib_conn->state == ISER_CONN_TERMINATING || + ib_conn->state == ISER_CONN_DOWN)) + rc = -1; + + iser_err("ib conn %p rc = %d\n", ib_conn, rc); + + if (rc > 0) + return 1; /* success, this is the equivalent of POLLOUT */ + else if (!rc) + return 0; /* timeout */ + else + return rc; /* signal */ +} + +static void +iscsi_iser_ep_disconnect(__u64 ep_handle) +{ + struct iser_conn *ib_conn = iscsi_iser_ib_conn_lookup(ep_handle); + + if (!ib_conn) + return; + + iser_err("ib conn %p state %d\n",ib_conn, ib_conn->state); + + iser_conn_terminate(ib_conn); +} + +static struct scsi_host_template iscsi_iser_sht = { + .name = "iSCSI Initiator over iSER, v." DRV_VER, + .queuecommand = iscsi_queuecommand, + .can_queue = ISCSI_XMIT_CMDS_MAX - 1, + .sg_tablesize = ISCSI_ISER_SG_TABLESIZE, + .cmd_per_lun = ISCSI_MAX_CMD_PER_LUN, + .eh_abort_handler = iscsi_eh_abort, + .eh_host_reset_handler = iscsi_eh_host_reset, + .use_clustering = DISABLE_CLUSTERING, + .proc_name = "iscsi_iser", + .this_id = -1, +}; + +static struct iscsi_transport iscsi_iser_transport = { + .owner = THIS_MODULE, + .name = "iser", + .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T, + .param_mask = ISCSI_MAX_RECV_DLENGTH | + ISCSI_MAX_XMIT_DLENGTH | + ISCSI_HDRDGST_EN | + ISCSI_DATADGST_EN | + ISCSI_INITIAL_R2T_EN | + ISCSI_MAX_R2T | + ISCSI_IMM_DATA_EN | + ISCSI_FIRST_BURST | + ISCSI_MAX_BURST | + ISCSI_PDU_INORDER_EN | + ISCSI_DATASEQ_INORDER_EN, + .host_template = &iscsi_iser_sht, + .conndata_size = sizeof(struct iscsi_conn), + .max_lun = ISCSI_ISER_MAX_LUN, + .max_cmd_len = ISCSI_ISER_MAX_CMD_LEN, + /* session management */ + .create_session = iscsi_iser_session_create, + .destroy_session = iscsi_session_teardown, + /* connection management */ + .create_conn = iscsi_iser_conn_create, + .bind_conn = iscsi_iser_conn_bind, + .destroy_conn = iscsi_iser_conn_destroy, + .set_param = iscsi_iser_conn_set_param, + .get_conn_param = iscsi_iser_conn_get_param, + .get_session_param = 
iscsi_iser_session_get_param, + .start_conn = iscsi_iser_conn_start, + .stop_conn = iscsi_conn_stop, + /* these are called as part of conn recovery */ + .suspend_conn_recv = NULL, /* FIXME is/how this relvant to iser? */ + .terminate_conn = iscsi_iser_conn_terminate, + /* IO */ + .send_pdu = iscsi_conn_send_pdu, + .get_stats = iscsi_iser_conn_get_stats, + .init_cmd_task = iscsi_iser_cmd_init, + .xmit_cmd_task = iscsi_iser_ctask_xmit, + .xmit_mgmt_task = iscsi_iser_mtask_xmit, + .cleanup_cmd_task = iscsi_iser_cleanup_ctask, + /* recovery */ + .session_recovery_timedout = iscsi_session_recovery_timedout, + + .ep_connect = iscsi_iser_ep_connect, + .ep_poll = iscsi_iser_ep_poll, + .ep_disconnect = iscsi_iser_ep_disconnect +}; + +static int __init iser_init(void) +{ + int err; + + iser_dbg("Starting iSER datamover...\n"); + + if (iscsi_max_lun < 1) { + printk(KERN_ERR "Invalid max_lun value of %u\n", iscsi_max_lun); + return -EINVAL; + } + + iscsi_iser_transport.max_lun = iscsi_max_lun; + + memset(&ig, 0, sizeof(struct iser_global)); + + ig.desc_cache = kmem_cache_create("iser_descriptors", + sizeof (struct iser_desc), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (ig.desc_cache == NULL) + return -ENOMEM; + + /* device init is called only after the first addr resolution */ + mutex_init(&ig.device_list_mutex); + INIT_LIST_HEAD(&ig.device_list); + mutex_init(&ig.connlist_mutex); + INIT_LIST_HEAD(&ig.connlist); + + if (!iscsi_register_transport(&iscsi_iser_transport)) { + iser_err("iscsi_register_transport failed\n"); + err = -EINVAL; + goto register_transport_failure; + } + + return 0; + +register_transport_failure: + kmem_cache_destroy(ig.desc_cache); + + return err; +} + +static void __exit iser_exit(void) +{ + iser_dbg("Removing iSER datamover...\n"); + iscsi_unregister_transport(&iscsi_iser_transport); + kmem_cache_destroy(ig.desc_cache); +} + +module_init(iser_init); +module_exit(iser_exit); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h 
b/drivers/infiniband/ulp/iser/iscsi_iser.h new file mode 100644 index 00000000000..3350ba690cf --- /dev/null +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -0,0 +1,354 @@ +/* + * iSER transport for the Open iSCSI Initiator & iSER transport internals + * + * Copyright (C) 2004 Dmitry Yusupov + * Copyright (C) 2004 Alex Aizman + * Copyright (C) 2005 Mike Christie + * based on code maintained by open-iscsi@googlegroups.com + * + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: iscsi_iser.h 7051 2006-05-10 12:29:11Z ogerlitz $ + */ +#ifndef __ISCSI_ISER_H__ +#define __ISCSI_ISER_H__ + +#include <linux/types.h> +#include <linux/net.h> +#include <scsi/libiscsi.h> +#include <scsi/scsi_transport_iscsi.h> + +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/dma-mapping.h> +#include <linux/mutex.h> +#include <linux/mempool.h> +#include <linux/uio.h> + +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/in6.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_fmr_pool.h> +#include <rdma/rdma_cm.h> + +#define DRV_NAME "iser" +#define PFX DRV_NAME ": " +#define DRV_VER "0.1" +#define DRV_DATE "May 7th, 2006" + +#define iser_dbg(fmt, arg...) \ + do { \ + if (iser_debug_level > 0) \ + printk(KERN_DEBUG PFX "%s:" fmt,\ + __func__ , ## arg); \ + } while (0) + +#define iser_err(fmt, arg...) \ + do { \ + printk(KERN_ERR PFX "%s:" fmt, \ + __func__ , ## arg); \ + } while (0) + + /* support upto 512KB in one RDMA */ +#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> PAGE_SHIFT) +#define ISCSI_ISER_MAX_LUN 256 +#define ISCSI_ISER_MAX_CMD_LEN 16 + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * + * SCSI_TMFUNC(2), LOGOUT(1) */ + +#define ISER_QP_MAX_RECV_DTOS (ISCSI_XMIT_CMDS_MAX + \ + ISER_MAX_RX_MISC_PDUS + \ + ISER_MAX_TX_MISC_PDUS) + +/* the max TX (send) WR supported by the iSER QP is defined by * + * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * + * to have at max for SCSI command. The tx posting & completion handling code * + * supports -EAGAIN scheme where tx is suspended till the QP has room for more * + * send WR. 
D=8 comes from 64K/8K */ + +#define ISER_INFLIGHT_DATAOUTS 8 + +#define ISER_QP_MAX_REQ_DTOS (ISCSI_XMIT_CMDS_MAX * \ + (1 + ISER_INFLIGHT_DATAOUTS) + \ + ISER_MAX_TX_MISC_PDUS + \ + ISER_MAX_RX_MISC_PDUS) + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +struct iser_hdr { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; /* write rkey */ + __be64 write_va; + __be32 read_stag; /* read rkey */ + __be64 read_va; +} __attribute__((packed)); + + +/* Length of an object name string */ +#define ISER_OBJECT_NAME_SIZE 64 + +enum iser_ib_conn_state { + ISER_CONN_INIT, /* descriptor allocd, no conn */ + ISER_CONN_PENDING, /* in the process of being established */ + ISER_CONN_UP, /* up and running */ + ISER_CONN_TERMINATING, /* in the process of being terminated */ + ISER_CONN_DOWN, /* shut down */ + ISER_CONN_STATES_NUM +}; + +enum iser_task_status { + ISER_TASK_STATUS_INIT = 0, + ISER_TASK_STATUS_STARTED, + ISER_TASK_STATUS_COMPLETED +}; + +enum iser_data_dir { + ISER_DIR_IN = 0, /* to initiator */ + ISER_DIR_OUT, /* from initiator */ + ISER_DIRS_NUM +}; + +struct iser_data_buf { + void *buf; /* pointer to the sg list */ + unsigned int size; /* num entries of this sg */ + unsigned long data_len; /* total data len */ + unsigned int dma_nents; /* returned by dma_map_sg */ + char *copy_buf; /* allocated copy buf for SGs unaligned * + * for rdma which are copied */ + struct scatterlist sg_single; /* SG-ified clone of a non SG SC or * + * unaligned SG */ + }; + +/* fwd declarations */ +struct iser_device; +struct iscsi_iser_conn; +struct iscsi_iser_cmd_task; + +struct iser_mem_reg { + u32 lkey; + u32 rkey; + u64 va; + u64 len; + void *mem_h; +}; + +struct iser_regd_buf { + struct iser_mem_reg reg; /* memory registration info */ + void *virt_addr; + struct iser_device *device; /* device->device for dma_unmap */ + dma_addr_t dma_addr; /* if non zero, addr for dma_unmap */ + enum dma_data_direction direction; /* direction for dma_unmap */ + unsigned int 
data_size; + atomic_t ref_count; /* refcount, freed when dec to 0 */ +}; + +#define MAX_REGD_BUF_VECTOR_LEN 2 + +struct iser_dto { + struct iscsi_iser_cmd_task *ctask; + struct iscsi_iser_conn *conn; + int notify_enable; + + /* vector of registered buffers */ + unsigned int regd_vector_len; + struct iser_regd_buf *regd[MAX_REGD_BUF_VECTOR_LEN]; + + /* offset into the registered buffer may be specified */ + unsigned int offset[MAX_REGD_BUF_VECTOR_LEN]; + + /* a smaller size may be specified, if 0, then full size is used */ + unsigned int used_sz[MAX_REGD_BUF_VECTOR_LEN]; +}; + +enum iser_desc_type { + ISCSI_RX, + ISCSI_TX_CONTROL , + ISCSI_TX_SCSI_COMMAND, + ISCSI_TX_DATAOUT +}; + +struct iser_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + struct iser_regd_buf hdr_regd_buf; + void *data; /* used by RX & TX_CONTROL */ + struct iser_regd_buf data_regd_buf; /* used by RX & TX_CONTROL */ + enum iser_desc_type type; + struct iser_dto dto; +}; + +struct iser_device { + struct ib_device *ib_device; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_mr *mr; + struct tasklet_struct cq_tasklet; + struct list_head ig_list; /* entry in ig devices list */ + int refcount; +}; + +struct iser_conn { + struct iscsi_iser_conn *iser_conn; /* iser conn for upcalls */ + enum iser_ib_conn_state state; /* rdma connection state */ + spinlock_t lock; /* used for state changes */ + struct iser_device *device; /* device context */ + struct rdma_cm_id *cma_id; /* CMA ID */ + struct ib_qp *qp; /* QP */ + struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ + int disc_evt_flag; /* disconn event delivered */ + wait_queue_head_t wait; /* waitq for conn/disconn */ + atomic_t post_recv_buf_count; /* posted rx count */ + atomic_t post_send_buf_count; /* posted tx count */ + struct work_struct comperror_work; /* conn term sleepable ctx*/ + char name[ISER_OBJECT_NAME_SIZE]; + struct iser_page_vec *page_vec; /* represents SG to fmr maps* + * maps serialized as tx is*/ + struct 
list_head conn_list; /* entry in ig conn list */ +}; + +struct iscsi_iser_conn { + struct iscsi_conn *iscsi_conn;/* ptr to iscsi conn */ + struct iser_conn *ib_conn; /* iSER IB conn */ + + rwlock_t lock; +}; + +struct iscsi_iser_cmd_task { + struct iser_desc desc; + struct iscsi_iser_conn *iser_conn; + int rdma_data_count;/* RDMA bytes */ + enum iser_task_status status; + int command_sent; /* set if command sent */ + int dir[ISER_DIRS_NUM]; /* set if dir use*/ + struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ + struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ + struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. copy */ +}; + +struct iser_page_vec { + u64 *pages; + int length; + int offset; + int data_size; +}; + +struct iser_global { + struct mutex device_list_mutex;/* */ + struct list_head device_list; /* all iSER devices */ + struct mutex connlist_mutex; + struct list_head connlist; /* all iSER IB connections */ + + kmem_cache_t *desc_cache; +}; + +extern struct iser_global ig; +extern int iser_debug_level; + +/* allocate connection resources needed for rdma functionality */ +int iser_conn_set_full_featured_mode(struct iscsi_conn *conn); + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_mgmt_task *mtask); + +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask); + +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask, + struct iscsi_data *hdr); + +void iscsi_iser_recv(struct iscsi_conn *conn, + struct iscsi_hdr *hdr, + char *rx_data, + int rx_data_len); + +int iser_conn_init(struct iser_conn **ib_conn); + +void iser_conn_terminate(struct iser_conn *ib_conn); + +void iser_conn_release(struct iser_conn *ib_conn); + +void iser_rcv_completion(struct iser_desc *desc, + unsigned long dto_xfer_len); + +void iser_snd_completion(struct iser_desc *desc); + +void iser_ctask_rdma_init(struct iscsi_iser_cmd_task *ctask); + +void iser_ctask_rdma_finalize(struct 
iscsi_iser_cmd_task *ctask); + +void iser_dto_buffs_release(struct iser_dto *dto); + +int iser_regd_buff_release(struct iser_regd_buf *regd_buf); + +void iser_reg_single(struct iser_device *device, + struct iser_regd_buf *regd_buf, + enum dma_data_direction direction); + +int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *ctask, + enum iser_data_dir cmd_dir); + +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *ctask, + enum iser_data_dir cmd_dir); + +int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *ctask, + enum iser_data_dir cmd_dir); + +int iser_connect(struct iser_conn *ib_conn, + struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + int non_blocking); + +int iser_reg_page_vec(struct iser_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg); + +void iser_unreg_mem(struct iser_mem_reg *mem_reg); + +int iser_post_recv(struct iser_desc *rx_desc); +int iser_post_send(struct iser_desc *tx_desc); + +int iser_conn_state_comp(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp); +#endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c new file mode 100644 index 00000000000..ccf56f6f723 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -0,0 +1,738 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: iser_initiator.c 6964 2006-05-07 11:11:43Z ogerlitz $ + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/io.h> +#include <asm/scatterlist.h> +#include <linux/scatterlist.h> +#include <linux/kfifo.h> +#include <scsi/scsi_cmnd.h> +#include <scsi/scsi_host.h> + +#include "iscsi_iser.h" + +/* Constant PDU lengths calculations */ +#define ISER_TOTAL_HEADERS_LEN (sizeof (struct iser_hdr) + \ + sizeof (struct iscsi_hdr)) + +/* iser_dto_add_regd_buff - increments the reference count for * + * the registered buffer & adds it to the DTO object. * + * The buffer goes into the next free slot (index regd_vector_len) * + * together with the offset and size to use from it. No bounds check * + * is done against MAX_REGD_BUF_VECTOR_LEN, so callers must not add * + * more buffers than the DTO vectors can hold. */ +static void iser_dto_add_regd_buff(struct iser_dto *dto, + struct iser_regd_buf *regd_buf, + unsigned long use_offset, + unsigned long use_size) +{ + int add_idx; + + /* the DTO keeps this reference until iser_dto_buffs_release() */ + atomic_inc(&regd_buf->ref_count); + + add_idx = dto->regd_vector_len; + dto->regd[add_idx] = regd_buf; + dto->used_sz[add_idx] = use_size; + dto->offset[add_idx] = use_offset; + + dto->regd_vector_len++; +} + +static int iser_dma_map_task_data(struct iscsi_iser_cmd_task *iser_ctask, + struct iser_data_buf *data, + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) +{ + struct device *dma_device; + + iser_ctask->dir[iser_dir] = 1; + dma_device = 
iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; + + data->dma_nents = dma_map_sg(dma_device, data->buf, data->size, dma_dir); + if (data->dma_nents == 0) { + iser_err("dma_map_sg failed!!!\n"); + return -EINVAL; + } + return 0; +} + +static void iser_dma_unmap_task_data(struct iscsi_iser_cmd_task *iser_ctask) +{ + struct device *dma_device; + struct iser_data_buf *data; + + dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; + + if (iser_ctask->dir[ISER_DIR_IN]) { + data = &iser_ctask->data[ISER_DIR_IN]; + dma_unmap_sg(dma_device, data->buf, data->size, DMA_FROM_DEVICE); + } + + if (iser_ctask->dir[ISER_DIR_OUT]) { + data = &iser_ctask->data[ISER_DIR_OUT]; + dma_unmap_sg(dma_device, data->buf, data->size, DMA_TO_DEVICE); + } +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Total data size is stored in + * iser_ctask->data[ISER_DIR_IN].data_len + */ +static int iser_prepare_read_cmd(struct iscsi_cmd_task *ctask, + unsigned int edtl) + +{ + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + struct iser_regd_buf *regd_buf; + int err; + struct iser_hdr *hdr = &iser_ctask->desc.iser_header; + struct iser_data_buf *buf_in = &iser_ctask->data[ISER_DIR_IN]; + + err = iser_dma_map_task_data(iser_ctask, + buf_in, + ISER_DIR_IN, + DMA_FROM_DEVICE); + if (err) + return err; + + if (edtl > iser_ctask->data[ISER_DIR_IN].data_len) { + iser_err("Total data length: %ld, less than EDTL: " + "%d, in READ cmd BHS itt: %d, conn: 0x%p\n", + iser_ctask->data[ISER_DIR_IN].data_len, edtl, + ctask->itt, iser_ctask->iser_conn); + return -EINVAL; + } + + err = iser_reg_rdma_mem(iser_ctask,ISER_DIR_IN); + if (err) { + iser_err("Failed to set up Data-IN RDMA\n"); + return err; + } + regd_buf = &iser_ctask->rdma_regd[ISER_DIR_IN]; + + hdr->flags |= ISER_RSV; + hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->read_va = cpu_to_be64(regd_buf->reg.va); + + iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", 
+ ctask->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va); + + return 0; +} + +/* Register user buffer memory and initialize passive rdma + * dto descriptor. Total data size is stored in + * ctask->data[ISER_DIR_OUT].data_len + */ +static int +iser_prepare_write_cmd(struct iscsi_cmd_task *ctask, + unsigned int imm_sz, + unsigned int unsol_sz, + unsigned int edtl) +{ + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + struct iser_regd_buf *regd_buf; + int err; + struct iser_dto *send_dto = &iser_ctask->desc.dto; + struct iser_hdr *hdr = &iser_ctask->desc.iser_header; + struct iser_data_buf *buf_out = &iser_ctask->data[ISER_DIR_OUT]; + + err = iser_dma_map_task_data(iser_ctask, + buf_out, + ISER_DIR_OUT, + DMA_TO_DEVICE); + if (err) + return err; + + if (edtl > iser_ctask->data[ISER_DIR_OUT].data_len) { + iser_err("Total data length: %ld, less than EDTL: %d, " + "in WRITE cmd BHS itt: %d, conn: 0x%p\n", + iser_ctask->data[ISER_DIR_OUT].data_len, + edtl, ctask->itt, ctask->conn); + return -EINVAL; + } + + err = iser_reg_rdma_mem(iser_ctask,ISER_DIR_OUT); + if (err != 0) { + iser_err("Failed to register write cmd RDMA mem\n"); + return err; + } + + regd_buf = &iser_ctask->rdma_regd[ISER_DIR_OUT]; + + if (unsol_sz < edtl) { + hdr->flags |= ISER_WSV; + hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey); + hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz); + + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " + "VA:%#llX + unsol:%d\n", + ctask->itt, regd_buf->reg.rkey, + (unsigned long long)regd_buf->reg.va, unsol_sz); + } + + if (imm_sz > 0) { + iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", + ctask->itt, imm_sz); + iser_dto_add_regd_buff(send_dto, + regd_buf, + 0, + imm_sz); + } + + return 0; +} + +/** + * iser_post_receive_control - allocates, initializes and posts receive DTO. 
+ */ +static int iser_post_receive_control(struct iscsi_conn *conn) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_desc *rx_desc; + struct iser_regd_buf *regd_hdr; + struct iser_regd_buf *regd_data; + struct iser_dto *recv_dto = NULL; + struct iser_device *device = iser_conn->ib_conn->device; + int rx_data_size, err = 0; + + rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); + if (rx_desc == NULL) { + iser_err("Failed to alloc desc for post recv\n"); + return -ENOMEM; + } + rx_desc->type = ISCSI_RX; + + /* for the login sequence we must support rx of upto 8K; login is done + * after conn create/bind (connect) and conn stop/bind (reconnect), + * what's common for both schemes is that the connection is not started + */ + if (conn->c_stage != ISCSI_CONN_STARTED) + rx_data_size = DEFAULT_MAX_RECV_DATA_SEGMENT_LENGTH; + else /* FIXME till user space sets conn->max_recv_dlength correctly */ + rx_data_size = 128; + + rx_desc->data = kmalloc(rx_data_size, GFP_NOIO); + if (rx_desc->data == NULL) { + iser_err("Failed to alloc data buf for post recv\n"); + err = -ENOMEM; + goto post_rx_kmalloc_failure; + } + + recv_dto = &rx_desc->dto; + recv_dto->conn = iser_conn; + recv_dto->regd_vector_len = 0; + + regd_hdr = &rx_desc->hdr_regd_buf; + memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); + regd_hdr->device = device; + regd_hdr->virt_addr = rx_desc; /* == &rx_desc->iser_header */ + regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; + + iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE); + + iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0); + + regd_data = &rx_desc->data_regd_buf; + memset(regd_data, 0, sizeof(struct iser_regd_buf)); + regd_data->device = device; + regd_data->virt_addr = rx_desc->data; + regd_data->data_size = rx_data_size; + + iser_reg_single(device, regd_data, DMA_FROM_DEVICE); + + iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0); + + err = iser_post_recv(rx_desc); + if (!err) + return 0; + + /* iser_post_recv failed */ + 
iser_dto_buffs_release(recv_dto); + kfree(rx_desc->data); +post_rx_kmalloc_failure: + kmem_cache_free(ig.desc_cache, rx_desc); + return err; +} + +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn, + struct iser_desc *tx_desc) +{ + struct iser_regd_buf *regd_hdr = &tx_desc->hdr_regd_buf; + struct iser_dto *send_dto = &tx_desc->dto; + + memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); + regd_hdr->device = iser_conn->ib_conn->device; + regd_hdr->virt_addr = tx_desc; /* == &tx_desc->iser_header */ + regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; + + send_dto->conn = iser_conn; + send_dto->notify_enable = 1; + send_dto->regd_vector_len = 0; + + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; + + iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0); +} + +/** + * iser_conn_set_full_featured_mode - (iSER API) + */ +int iser_conn_set_full_featured_mode(struct iscsi_conn *conn) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + + int i; + /* no need to keep it in a var, we are after login so if this should + * be negotiated, by now the result should be available here */ + int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS; + + iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num); + + /* Check that there is no posted recv or send buffers left - */ + /* they must be consumed during the login phase */ + BUG_ON(atomic_read(&iser_conn->ib_conn->post_recv_buf_count) != 0); + BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); + + /* Initial post receive buffers */ + for (i = 0; i < initial_post_recv_bufs_num; i++) { + if (iser_post_receive_control(conn) != 0) { + iser_err("Failed to post recv bufs at:%d conn:0x%p\n", + i, conn); + return -ENOMEM; + } + } + iser_dbg("Posted %d post recv bufs, conn:0x%p\n", i, conn); + return 0; +} + +static int +iser_check_xmit(struct iscsi_conn *conn, void *task) +{ + int rc = 0; + 
struct iscsi_iser_conn *iser_conn = conn->dd_data; + + write_lock_bh(conn->recv_lock); + if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) == + ISER_QP_MAX_REQ_DTOS) { + iser_dbg("%ld can't xmit task %p, suspending tx\n",jiffies,task); + set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + rc = -EAGAIN; + } + write_unlock_bh(conn->recv_lock); + return rc; +} + + +/** + * iser_send_command - send command PDU + */ +int iser_send_command(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + struct iser_dto *send_dto = NULL; + unsigned long edtl; + int err = 0; + struct iser_data_buf *data_buf; + + struct iscsi_cmd *hdr = ctask->hdr; + struct scsi_cmnd *sc = ctask->sc; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + if (iser_check_xmit(conn, ctask)) + return -EAGAIN; + + edtl = ntohl(hdr->data_length); + + /* build the tx desc regd header and add it to the tx desc dto */ + iser_ctask->desc.type = ISCSI_TX_SCSI_COMMAND; + send_dto = &iser_ctask->desc.dto; + send_dto->ctask = iser_ctask; + iser_create_send_desc(iser_conn, &iser_ctask->desc); + + if (hdr->flags & ISCSI_FLAG_CMD_READ) + data_buf = &iser_ctask->data[ISER_DIR_IN]; + else + data_buf = &iser_ctask->data[ISER_DIR_OUT]; + + if (sc->use_sg) { /* using a scatter list */ + data_buf->buf = sc->request_buffer; + data_buf->size = sc->use_sg; + } else if (sc->request_bufflen) { + /* using a single buffer - convert it into one entry SG */ + sg_init_one(&data_buf->sg_single, + sc->request_buffer, sc->request_bufflen); + data_buf->buf = &data_buf->sg_single; + data_buf->size = 1; + } + + data_buf->data_len = sc->request_bufflen; + + if (hdr->flags & ISCSI_FLAG_CMD_READ) { + err = iser_prepare_read_cmd(ctask, edtl); + if (err) + goto send_command_error; + } + if (hdr->flags & 
ISCSI_FLAG_CMD_WRITE) { + err = iser_prepare_write_cmd(ctask, + ctask->imm_count, + ctask->imm_count + + ctask->unsol_count, + edtl); + if (err) + goto send_command_error; + } + + iser_reg_single(iser_conn->ib_conn->device, + send_dto->regd[0], DMA_TO_DEVICE); + + if (iser_post_receive_control(conn) != 0) { + iser_err("post_recv failed!\n"); + err = -ENOMEM; + goto send_command_error; + } + + iser_ctask->status = ISER_TASK_STATUS_STARTED; + + err = iser_post_send(&iser_ctask->desc); + if (!err) + return 0; + +send_command_error: + iser_dto_buffs_release(send_dto); + iser_err("conn %p failed ctask->itt %d err %d\n",conn, ctask->itt, err); + return err; +} + +/** + * iser_send_data_out - send data out PDU + */ +int iser_send_data_out(struct iscsi_conn *conn, + struct iscsi_cmd_task *ctask, + struct iscsi_data *hdr) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iscsi_iser_cmd_task *iser_ctask = ctask->dd_data; + struct iser_desc *tx_desc = NULL; + struct iser_dto *send_dto = NULL; + unsigned long buf_offset; + unsigned long data_seg_len; + unsigned int itt; + int err = 0; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + + if (iser_check_xmit(conn, ctask)) + return -EAGAIN; + + itt = ntohl(hdr->itt); + data_seg_len = ntoh24(hdr->dlength); + buf_offset = ntohl(hdr->offset); + + iser_dbg("%s itt %d dseg_len %d offset %d\n", + __func__,(int)itt,(int)data_seg_len,(int)buf_offset); + + tx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); + if (tx_desc == NULL) { + iser_err("Failed to alloc desc for post dataout\n"); + return -ENOMEM; + } + + tx_desc->type = ISCSI_TX_DATAOUT; + memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); + + /* build the tx desc regd header and add it to the tx desc dto */ + send_dto = &tx_desc->dto; + send_dto->ctask = iser_ctask; + iser_create_send_desc(iser_conn, tx_desc); + + 
iser_reg_single(iser_conn->ib_conn->device, + send_dto->regd[0], DMA_TO_DEVICE); + + /* all data was registered for RDMA, we can use the lkey */ + iser_dto_add_regd_buff(send_dto, + &iser_ctask->rdma_regd[ISER_DIR_OUT], + buf_offset, + data_seg_len); + + if (buf_offset + data_seg_len > iser_ctask->data[ISER_DIR_OUT].data_len) { + iser_err("Offset:%ld & DSL:%ld in Data-Out " + "inconsistent with total len:%ld, itt:%d\n", + buf_offset, data_seg_len, + iser_ctask->data[ISER_DIR_OUT].data_len, itt); + err = -EINVAL; + goto send_data_out_error; + } + iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n", + itt, buf_offset, data_seg_len); + + + err = iser_post_send(tx_desc); + if (!err) + return 0; + +send_data_out_error: + iser_dto_buffs_release(send_dto); + kmem_cache_free(ig.desc_cache, tx_desc); + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +int iser_send_control(struct iscsi_conn *conn, + struct iscsi_mgmt_task *mtask) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_desc *mdesc = mtask->dd_data; + struct iser_dto *send_dto = NULL; + unsigned int itt; + unsigned long data_seg_len; + int err = 0; + unsigned char opcode; + struct iser_regd_buf *regd_buf; + struct iser_device *device; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + + if (iser_check_xmit(conn,mtask)) + return -EAGAIN; + + /* build the tx desc regd header and add it to the tx desc dto */ + mdesc->type = ISCSI_TX_CONTROL; + send_dto = &mdesc->dto; + send_dto->ctask = NULL; + iser_create_send_desc(iser_conn, mdesc); + + device = iser_conn->ib_conn->device; + + iser_reg_single(device, send_dto->regd[0], DMA_TO_DEVICE); + + itt = ntohl(mtask->hdr->itt); + opcode = mtask->hdr->opcode & ISCSI_OPCODE_MASK; + data_seg_len = ntoh24(mtask->hdr->dlength); + + if (data_seg_len > 0) { + regd_buf = &mdesc->data_regd_buf; + memset(regd_buf, 0, sizeof(struct 
iser_regd_buf)); + regd_buf->device = device; + regd_buf->virt_addr = mtask->data; + regd_buf->data_size = mtask->data_count; + iser_reg_single(device, regd_buf, + DMA_TO_DEVICE); + iser_dto_add_regd_buff(send_dto, regd_buf, + 0, + data_seg_len); + } + + if (iser_post_receive_control(conn) != 0) { + iser_err("post_rcv_buff failed!\n"); + err = -ENOMEM; + goto send_control_error; + } + + err = iser_post_send(mdesc); + if (!err) + return 0; + +send_control_error: + iser_dto_buffs_release(send_dto); + iser_err("conn %p failed err %d\n",conn, err); + return err; +} + +/** + * iser_rcv_completion - recv DTO completion + * @rx_desc: descriptor of the completed receive; its data buffer and + * the descriptor itself are freed here + * @dto_xfer_len: number of bytes received, iSER and iSCSI headers included + * + * For a SCSI command response the matching task is marked completed and + * its RDMA resources are finalized before the PDU is passed to + * iscsi_iser_recv(). + */ +void iser_rcv_completion(struct iser_desc *rx_desc, + unsigned long dto_xfer_len) +{ + struct iser_dto *dto = &rx_desc->dto; + struct iscsi_iser_conn *conn = dto->conn; + struct iscsi_session *session = conn->iscsi_conn->session; + struct iscsi_cmd_task *ctask; + struct iscsi_iser_cmd_task *iser_ctask; + struct iscsi_hdr *hdr; + char *rx_data = NULL; + int rx_data_len = 0; + unsigned int itt; + unsigned char opcode; + + hdr = &rx_desc->iscsi_header; + + iser_dbg("op 0x%x itt 0x%x\n", hdr->opcode,hdr->itt); + + if (dto_xfer_len > ISER_TOTAL_HEADERS_LEN) { /* we have data */ + rx_data_len = dto_xfer_len - ISER_TOTAL_HEADERS_LEN; + rx_data = dto->regd[1]->virt_addr; + rx_data += dto->offset[1]; + } + + opcode = hdr->opcode & ISCSI_OPCODE_MASK; + + if (opcode == ISCSI_OP_SCSI_CMD_RSP) { + itt = hdr->itt & ISCSI_ITT_MASK; /* mask out cid and age bits */ + if (!(itt < session->cmds_max)) { + /* out of range: indexing session->cmds[] with this itt + * would read past the array, so only report it */ + iser_err("itt can't be matched to task!!!" 
+ "conn %p opcode %d cmds_max %d itt %d\n", + conn->iscsi_conn,opcode,session->cmds_max,itt); + } else { + /* use the mapping given with the cmds array indexed by itt */ + ctask = (struct iscsi_cmd_task *)session->cmds[itt]; + iser_ctask = ctask->dd_data; + iser_dbg("itt %d ctask %p\n",itt,ctask); + iser_ctask->status = ISER_TASK_STATUS_COMPLETED; + iser_ctask_rdma_finalize(iser_ctask); + } + } + + iser_dto_buffs_release(dto); + + iscsi_iser_recv(conn->iscsi_conn, hdr, rx_data, rx_data_len); + + kfree(rx_desc->data); + kmem_cache_free(ig.desc_cache, rx_desc); + + /* decrementing conn->post_recv_buf_count only --after-- freeing the * + * task eliminates the need to worry on tasks which are completed in * + * parallel to the execution of iser_conn_term. So the code that waits * + * for the posted rx bufs refcount to become zero handles everything */ + atomic_dec(&conn->ib_conn->post_recv_buf_count); +} + +/** + * iser_snd_completion - send DTO completion + * @tx_desc: descriptor of the completed send; freed here when it is a + * Data-Out descriptor + */ +void iser_snd_completion(struct iser_desc *tx_desc) +{ + struct iser_dto *dto = &tx_desc->dto; + struct iscsi_iser_conn *iser_conn = dto->conn; + struct iscsi_conn *conn = iser_conn->iscsi_conn; + struct iscsi_mgmt_task *mtask; + /* saved up front: tx_desc is freed below for Data-Out descriptors, + * so its type must not be read from the descriptor afterwards */ + enum iser_desc_type type = tx_desc->type; + + iser_dbg("Initiator, Data sent dto=0x%p\n", dto); + + iser_dto_buffs_release(dto); + + if (type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, tx_desc); + + atomic_dec(&iser_conn->ib_conn->post_send_buf_count); + + write_lock(conn->recv_lock); + if (conn->suspend_tx) { + iser_dbg("%ld resuming tx\n",jiffies); + clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + scsi_queue_work(conn->session->host, &conn->xmitwork); + } + write_unlock(conn->recv_lock); + + if (type == ISCSI_TX_CONTROL) { + /* this arithmetic is legal by libiscsi dd_data allocation */ + mtask = (void *) ((long)(void *)tx_desc - + sizeof(struct iscsi_mgmt_task)); + if (mtask->hdr->itt == cpu_to_be32(ISCSI_RESERVED_TAG)) { + struct iscsi_session *session = conn->session; + + spin_lock(&conn->session->lock); + list_del(&mtask->running); + 
__kfifo_put(session->mgmtpool.queue, (void*)&mtask, + sizeof(void*)); + spin_unlock(&session->lock); + } + } +} + +void iser_ctask_rdma_init(struct iscsi_iser_cmd_task *iser_ctask) + +{ + iser_ctask->status = ISER_TASK_STATUS_INIT; + + iser_ctask->dir[ISER_DIR_IN] = 0; + iser_ctask->dir[ISER_DIR_OUT] = 0; + + iser_ctask->data[ISER_DIR_IN].data_len = 0; + iser_ctask->data[ISER_DIR_OUT].data_len = 0; + + memset(&iser_ctask->rdma_regd[ISER_DIR_IN], 0, + sizeof(struct iser_regd_buf)); + memset(&iser_ctask->rdma_regd[ISER_DIR_OUT], 0, + sizeof(struct iser_regd_buf)); +} + +void iser_ctask_rdma_finalize(struct iscsi_iser_cmd_task *iser_ctask) +{ + int deferred; + + /* if we were reading, copy back to unaligned sglist, + * anyway dma_unmap and free the copy + */ + if (iser_ctask->data_copy[ISER_DIR_IN].copy_buf != NULL) + iser_finalize_rdma_unaligned_sg(iser_ctask, ISER_DIR_IN); + if (iser_ctask->data_copy[ISER_DIR_OUT].copy_buf != NULL) + iser_finalize_rdma_unaligned_sg(iser_ctask, ISER_DIR_OUT); + + if (iser_ctask->dir[ISER_DIR_IN]) { + deferred = iser_regd_buff_release + (&iser_ctask->rdma_regd[ISER_DIR_IN]); + if (deferred) { + iser_err("References remain for BUF-IN rdma reg\n"); + BUG(); + } + } + + if (iser_ctask->dir[ISER_DIR_OUT]) { + deferred = iser_regd_buff_release + (&iser_ctask->rdma_regd[ISER_DIR_OUT]); + if (deferred) { + iser_err("References remain for BUF-OUT rdma reg\n"); + BUG(); + } + } + + iser_dma_unmap_task_data(iser_ctask); +} + +void iser_dto_buffs_release(struct iser_dto *dto) +{ + int i; + + for (i = 0; i < dto->regd_vector_len; i++) + iser_regd_buff_release(dto->regd[i]); +} + diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c new file mode 100644 index 00000000000..31950a522a1 --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $ + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <asm/io.h> +#include <asm/scatterlist.h> +#include <linux/scatterlist.h> + +#include "iscsi_iser.h" + +#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ +/** + * iser_regd_buff_release - Decrements the reference count for the + * registered buffer & releases it + * + * A buffer whose count is already zero is released without being + * decremented. Releasing unregisters the memory when an rkey is set + * and undoes the single-buffer DMA mapping when dma_addr is set. + * + * returns 0 if released, 1 if deferred + */ +int iser_regd_buff_release(struct iser_regd_buf *regd_buf) +{ + struct device *dma_device; + + if ((atomic_read(&regd_buf->ref_count) == 0) || + atomic_dec_and_test(&regd_buf->ref_count)) { + /* if we used the dma mr, unreg is just NOP */ + if (regd_buf->reg.rkey != 0) + iser_unreg_mem(&regd_buf->reg); + + if (regd_buf->dma_addr) { + dma_device = regd_buf->device->ib_device->dma_device; + dma_unmap_single(dma_device, + regd_buf->dma_addr, + regd_buf->data_size, + regd_buf->direction); + } + /* else this regd buf is associated with task which we */ + /* dma_unmap_single/sg later */ + return 0; + } else { + iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf); + return 1; + } +} + +/** + * iser_reg_single - fills registered buffer descriptor with + * registration information + */ +void iser_reg_single(struct iser_device *device, + struct iser_regd_buf *regd_buf, + enum dma_data_direction direction) +{ + dma_addr_t dma_addr; + + dma_addr = dma_map_single(device->ib_device->dma_device, + regd_buf->virt_addr, + regd_buf->data_size, direction); + BUG_ON(dma_mapping_error(dma_addr)); + + regd_buf->reg.lkey = device->mr->lkey; + regd_buf->reg.rkey = 0; /* indicate there's no need to unreg */ + regd_buf->reg.len = regd_buf->data_size; + regd_buf->reg.va = dma_addr; + + regd_buf->dma_addr = dma_addr; + regd_buf->direction = direction; +} + +/** + * iser_start_rdma_unaligned_sg + */ +int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, + enum iser_data_dir cmd_dir) +{ + int dma_nents; 
+ struct device *dma_device; + char *mem = NULL; + struct iser_data_buf *data = &iser_ctask->data[cmd_dir]; + unsigned long cmd_data_len = data->data_len; + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + mem = (void *)__get_free_pages(GFP_NOIO, + long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + mem = kmalloc(cmd_data_len, GFP_NOIO); + + if (mem == NULL) { + iser_err("Failed to allocate mem size %d %d for copying sglist\n", + data->size,(int)cmd_data_len); + return -ENOMEM; + } + + if (cmd_dir == ISER_DIR_OUT) { + /* copy the unaligned sg the buffer which is used for RDMA */ + struct scatterlist *sg = (struct scatterlist *)data->buf; + int i; + char *p, *from; + + for (p = mem, i = 0; i < data->size; i++) { + from = kmap_atomic(sg[i].page, KM_USER0); + memcpy(p, + from + sg[i].offset, + sg[i].length); + kunmap_atomic(from, KM_USER0); + p += sg[i].length; + } + } + + sg_init_one(&iser_ctask->data_copy[cmd_dir].sg_single, mem, cmd_data_len); + iser_ctask->data_copy[cmd_dir].buf = + &iser_ctask->data_copy[cmd_dir].sg_single; + iser_ctask->data_copy[cmd_dir].size = 1; + + iser_ctask->data_copy[cmd_dir].copy_buf = mem; + + dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; + + if (cmd_dir == ISER_DIR_OUT) + dma_nents = dma_map_sg(dma_device, + &iser_ctask->data_copy[cmd_dir].sg_single, + 1, DMA_TO_DEVICE); + else + dma_nents = dma_map_sg(dma_device, + &iser_ctask->data_copy[cmd_dir].sg_single, + 1, DMA_FROM_DEVICE); + + BUG_ON(dma_nents == 0); + + iser_ctask->data_copy[cmd_dir].dma_nents = dma_nents; + return 0; +} + +/** + * iser_finalize_rdma_unaligned_sg + */ +void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, + enum iser_data_dir cmd_dir) +{ + struct device *dma_device; + struct iser_data_buf *mem_copy; + unsigned long cmd_data_len; + + dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; + mem_copy = &iser_ctask->data_copy[cmd_dir]; + + if (cmd_dir == ISER_DIR_OUT) + 
dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, + DMA_TO_DEVICE); + else + dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, + DMA_FROM_DEVICE); + + if (cmd_dir == ISER_DIR_IN) { + char *mem; + struct scatterlist *sg; + unsigned char *p, *to; + unsigned int sg_size; + int i; + + /* copy back read RDMA to unaligned sg */ + mem = mem_copy->copy_buf; + + sg = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf; + sg_size = iser_ctask->data[ISER_DIR_IN].size; + + for (p = mem, i = 0; i < sg_size; i++){ + to = kmap_atomic(sg[i].page, KM_SOFTIRQ0); + memcpy(to + sg[i].offset, + p, + sg[i].length); + kunmap_atomic(to, KM_SOFTIRQ0); + p += sg[i].length; + } + } + + cmd_data_len = iser_ctask->data[cmd_dir].data_len; + + if (cmd_data_len > ISER_KMALLOC_THRESHOLD) + free_pages((unsigned long)mem_copy->copy_buf, + long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); + else + kfree(mem_copy->copy_buf); + + mem_copy->copy_buf = NULL; +} + +/** + * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses + * and returns the length of resulting physical address array (may be less than + * the original due to possible compaction). + * + * we build a "page vec" under the assumption that the SG meets the RDMA + * alignment requirements. Other then the first and last SG elements, all + * the "internal" elements can be compacted into a list whose elements are + * dma addresses of physical pages. The code supports also the weird case + * where --few fragments of the same page-- are present in the SG as + * consecutive elements. Also, it handles one entry SG. 
+ */ +static int iser_sg_to_page_vec(struct iser_data_buf *data, + struct iser_page_vec *page_vec) +{ + struct scatterlist *sg = (struct scatterlist *)data->buf; + dma_addr_t first_addr, last_addr, page; + int start_aligned, end_aligned; + unsigned int cur_page = 0; + unsigned long total_sz = 0; + int i; + + /* compute the offset of first element */ + page_vec->offset = (u64) sg[0].offset; + + for (i = 0; i < data->dma_nents; i++) { + total_sz += sg_dma_len(&sg[i]); + + first_addr = sg_dma_address(&sg[i]); + last_addr = first_addr + sg_dma_len(&sg[i]); + + start_aligned = !(first_addr & ~PAGE_MASK); + end_aligned = !(last_addr & ~PAGE_MASK); + + /* continue to collect page fragments till aligned or SG ends */ + while (!end_aligned && (i + 1 < data->dma_nents)) { + i++; + total_sz += sg_dma_len(&sg[i]); + last_addr = sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]); + end_aligned = !(last_addr & ~PAGE_MASK); + } + + first_addr = first_addr & PAGE_MASK; + + for (page = first_addr; page < last_addr; page += PAGE_SIZE) + page_vec->pages[cur_page++] = page; + + } + page_vec->data_size = total_sz; + iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); + return cur_page; +} + +#define MASK_4K ((1UL << 12) - 1) /* 0xFFF */ +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & MASK_4K) == 0) + +/** + * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned + * for RDMA sub-list of a scatter-gather list of memory buffers, and returns + * the number of entries which are aligned correctly. Supports the case where + * consecutive SG elements are actually fragments of the same physcial page. 
+ */ +static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data) +{ + struct scatterlist *sg; + dma_addr_t end_addr, next_addr; + int i, cnt; + unsigned int ret_len = 0; + + sg = (struct scatterlist *)data->buf; + + for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) { + /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " + "offset: %ld sz: %ld\n", i, + (unsigned long)page_to_phys(sg[i].page), + (unsigned long)sg[i].offset, + (unsigned long)sg[i].length); */ + end_addr = sg_dma_address(&sg[i]) + + sg_dma_len(&sg[i]); + /* iser_dbg("Checking sg iobuf end address " + "0x%08lX\n", end_addr); */ + if (i + 1 < data->dma_nents) { + next_addr = sg_dma_address(&sg[i+1]); + /* are i, i+1 fragments of the same page? */ + if (end_addr == next_addr) + continue; + else if (!IS_4K_ALIGNED(end_addr)) { + ret_len = cnt + 1; + break; + } + } + } + if (i == data->dma_nents) + ret_len = cnt; /* loop ended */ + iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", + ret_len, data->dma_nents, data); + return ret_len; +} + +static void iser_data_buf_dump(struct iser_data_buf *data) +{ + struct scatterlist *sg = (struct scatterlist *)data->buf; + int i; + + for (i = 0; i < data->size; i++) + iser_err("sg[%d] dma_addr:0x%lX page:0x%p " + "off:%d sz:%d dma_len:%d\n", + i, (unsigned long)sg_dma_address(&sg[i]), + sg[i].page, sg[i].offset, + sg[i].length,sg_dma_len(&sg[i])); +} + +static void iser_dump_page_vec(struct iser_page_vec *page_vec) +{ + int i; + + iser_err("page vec length %d data size %d\n", + page_vec->length, page_vec->data_size); + for (i = 0; i < page_vec->length; i++) + iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); +} + +static void iser_page_vec_build(struct iser_data_buf *data, + struct iser_page_vec *page_vec) +{ + int page_vec_len = 0; + + page_vec->length = 0; + page_vec->offset = 0; + + iser_dbg("Translating sg sz: %d\n", data->dma_nents); + page_vec_len = iser_sg_to_page_vec(data,page_vec); + iser_dbg("sg len %d page_vec_len %d\n", 
data->dma_nents,page_vec_len); + + page_vec->length = page_vec_len; + + if (page_vec_len * PAGE_SIZE < page_vec->data_size) { + iser_err("page_vec too short to hold this SG\n"); + iser_data_buf_dump(data); + iser_dump_page_vec(page_vec); + BUG(); + } +} + +/** + * iser_reg_rdma_mem - Registers memory intended for RDMA, + * obtaining rkey and va + * + * returns 0 on success, errno code on failure + */ +int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask, + enum iser_data_dir cmd_dir) +{ + struct iser_conn *ib_conn = iser_ctask->iser_conn->ib_conn; + struct iser_data_buf *mem = &iser_ctask->data[cmd_dir]; + struct iser_regd_buf *regd_buf; + int aligned_len; + int err; + + regd_buf = &iser_ctask->rdma_regd[cmd_dir]; + + aligned_len = iser_data_buf_aligned_len(mem); + if (aligned_len != mem->size) { + iser_err("rdma alignment violation %d/%d aligned\n", + aligned_len, mem->size); + iser_data_buf_dump(mem); + /* allocate copy buf, if we are writing, copy the */ + /* unaligned scatterlist, dma map the copy */ + if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0) + return -ENOMEM; + mem = &iser_ctask->data_copy[cmd_dir]; + } + + iser_page_vec_build(mem, ib_conn->page_vec); + err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); + if (err) + return err; + + /* take a reference on this regd buf such that it will not be released * + * (eg in send dto completion) before we get the scsi response */ + atomic_inc(®d_buf->ref_count); + return 0; +} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c new file mode 100644 index 00000000000..ff117bbf81b --- /dev/null +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -0,0 +1,827 @@ +/* + * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: iser_verbs.c 7051 2006-05-10 12:29:11Z ogerlitz $ + */ +#include <asm/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/delay.h> +#include <linux/version.h> + +#include "iscsi_iser.h" + +#define ISCSI_ISER_MAX_CONN 8 +#define ISER_MAX_CQ_LEN ((ISER_QP_MAX_RECV_DTOS + \ + ISER_QP_MAX_REQ_DTOS) * \ + ISCSI_ISER_MAX_CONN) + +static void iser_cq_tasklet_fn(unsigned long data); +static void iser_cq_callback(struct ib_cq *cq, void *cq_context); +static void iser_comp_error_worker(void *data); + +static void iser_cq_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got cq event %d \n", cause->event); +} + +static void iser_qp_event_callback(struct ib_event *cause, void *context) +{ + iser_err("got qp event %d\n",cause->event); +} + +/** + * iser_create_device_ib_res - creates Protection Domain (PD), Completion + * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with + * the adapator. + * + * returns 0 on success, -1 on failure + */ +static int iser_create_device_ib_res(struct iser_device *device) +{ + device->pd = ib_alloc_pd(device->ib_device); + if (IS_ERR(device->pd)) + goto pd_err; + + device->cq = ib_create_cq(device->ib_device, + iser_cq_callback, + iser_cq_event_callback, + (void *)device, + ISER_MAX_CQ_LEN); + if (IS_ERR(device->cq)) + goto cq_err; + + if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP)) + goto cq_arm_err; + + tasklet_init(&device->cq_tasklet, + iser_cq_tasklet_fn, + (unsigned long)device); + + device->mr = ib_get_dma_mr(device->pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(device->mr)) + goto dma_mr_err; + + return 0; + +dma_mr_err: + tasklet_kill(&device->cq_tasklet); +cq_arm_err: + ib_destroy_cq(device->cq); +cq_err: + ib_dealloc_pd(device->pd); +pd_err: + iser_err("failed to allocate an IB resource\n"); + return -1; +} + +/** + * iser_free_device_ib_res - destory/dealloc/dereg the DMA MR, + * CQ and PD created with the device associated with 
the adapator. + */ +static void iser_free_device_ib_res(struct iser_device *device) +{ + BUG_ON(device->mr == NULL); + + tasklet_kill(&device->cq_tasklet); + + (void)ib_dereg_mr(device->mr); + (void)ib_destroy_cq(device->cq); + (void)ib_dealloc_pd(device->pd); + + device->mr = NULL; + device->cq = NULL; + device->pd = NULL; +} + +/** + * iser_create_ib_conn_res - Creates FMR pool and Queue-Pair (QP) + * + * returns 0 on success, -1 on failure + */ +static int iser_create_ib_conn_res(struct iser_conn *ib_conn) +{ + struct iser_device *device; + struct ib_qp_init_attr init_attr; + int ret; + struct ib_fmr_pool_param params; + + BUG_ON(ib_conn->device == NULL); + + device = ib_conn->device; + + ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + + (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), + GFP_KERNEL); + if (!ib_conn->page_vec) { + ret = -ENOMEM; + goto alloc_err; + } + ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); + + params.page_shift = PAGE_SHIFT; + /* when the first/last SG element are not start/end * + * page aligned, the map whould be of N+1 pages */ + params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1; + /* make the pool size twice the max number of SCSI commands * + * the ML is expected to queue, watermark for unmap at 50% */ + params.pool_size = ISCSI_XMIT_CMDS_MAX * 2; + params.dirty_watermark = ISCSI_XMIT_CMDS_MAX; + params.cache = 0; + params.flush_function = NULL; + params.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); + if (IS_ERR(ib_conn->fmr_pool)) { + ret = PTR_ERR(ib_conn->fmr_pool); + goto fmr_pool_err; + } + + memset(&init_attr, 0, sizeof init_attr); + + init_attr.event_handler = iser_qp_event_callback; + init_attr.qp_context = (void *)ib_conn; + init_attr.send_cq = device->cq; + init_attr.recv_cq = device->cq; + init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; + init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; + 
init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN; + init_attr.cap.max_recv_sge = 2; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_RC; + + ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); + if (ret) + goto qp_err; + + ib_conn->qp = ib_conn->cma_id->qp; + iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->fmr_pool, ib_conn->cma_id->qp); + return ret; + +qp_err: + (void)ib_destroy_fmr_pool(ib_conn->fmr_pool); +fmr_pool_err: + kfree(ib_conn->page_vec); +alloc_err: + iser_err("unable to alloc mem or create resource, err %d\n", ret); + return ret; +} + +/** + * releases the FMR pool, QP and CMA ID objects, returns 0 on success, + * -1 on failure + */ +static int iser_free_ib_conn_res(struct iser_conn *ib_conn) +{ + BUG_ON(ib_conn == NULL); + + iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n", + ib_conn, ib_conn->cma_id, + ib_conn->fmr_pool, ib_conn->qp); + + /* qp is created only once both addr & route are resolved */ + if (ib_conn->fmr_pool != NULL) + ib_destroy_fmr_pool(ib_conn->fmr_pool); + + if (ib_conn->qp != NULL) + rdma_destroy_qp(ib_conn->cma_id); + + if (ib_conn->cma_id != NULL) + rdma_destroy_id(ib_conn->cma_id); + + ib_conn->fmr_pool = NULL; + ib_conn->qp = NULL; + ib_conn->cma_id = NULL; + kfree(ib_conn->page_vec); + + return 0; +} + +/** + * based on the resolved device node GUID see if there already allocated + * device for this device. If there's no such, create one. 
+ */ +static +struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id) +{ + struct list_head *p_list; + struct iser_device *device = NULL; + + mutex_lock(&ig.device_list_mutex); + + p_list = ig.device_list.next; + while (p_list != &ig.device_list) { + device = list_entry(p_list, struct iser_device, ig_list); + /* find if there's a match using the node GUID */ + if (device->ib_device->node_guid == cma_id->device->node_guid) + break; + } + + if (device == NULL) { + device = kzalloc(sizeof *device, GFP_KERNEL); + if (device == NULL) + goto out; + /* assign this device to the device */ + device->ib_device = cma_id->device; + /* init the device and link it into ig device list */ + if (iser_create_device_ib_res(device)) { + kfree(device); + device = NULL; + goto out; + } + list_add(&device->ig_list, &ig.device_list); + } +out: + BUG_ON(device == NULL); + device->refcount++; + mutex_unlock(&ig.device_list_mutex); + return device; +} + +/* if there's no demand for this device, release it */ +static void iser_device_try_release(struct iser_device *device) +{ + mutex_lock(&ig.device_list_mutex); + device->refcount--; + iser_err("device %p refcount %d\n",device,device->refcount); + if (!device->refcount) { + iser_free_device_ib_res(device); + list_del(&device->ig_list); + kfree(device); + } + mutex_unlock(&ig.device_list_mutex); +} + +int iser_conn_state_comp(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp) +{ + int ret; + + spin_lock_bh(&ib_conn->lock); + ret = (ib_conn->state == comp); + spin_unlock_bh(&ib_conn->lock); + return ret; +} + +static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp, + enum iser_ib_conn_state exch) +{ + int ret; + + spin_lock_bh(&ib_conn->lock); + if ((ret = (ib_conn->state == comp))) + ib_conn->state = exch; + spin_unlock_bh(&ib_conn->lock); + return ret; +} + +/** + * triggers start of the disconnect procedures and wait for them to be done + */ +void iser_conn_terminate(struct 
iser_conn *ib_conn) +{ + int err = 0; + + /* change the ib conn state only if the conn is UP, however always call + * rdma_disconnect since this is the only way to cause the CMA to change + * the QP state to ERROR + */ + + iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING); + err = rdma_disconnect(ib_conn->cma_id); + if (err) + iser_err("Failed to disconnect, conn: 0x%p err %d\n", + ib_conn,err); + + wait_event_interruptible(ib_conn->wait, + ib_conn->state == ISER_CONN_DOWN); + + iser_conn_release(ib_conn); +} + +static void iser_connect_error(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + ib_conn = (struct iser_conn *)cma_id->context; + + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); +} + +static void iser_addr_handler(struct rdma_cm_id *cma_id) +{ + struct iser_device *device; + struct iser_conn *ib_conn; + int ret; + + device = iser_device_find_by_ib_device(cma_id); + ib_conn = (struct iser_conn *)cma_id->context; + ib_conn->device = device; + + ret = rdma_resolve_route(cma_id, 1000); + if (ret) { + iser_err("resolve route failed: %d\n", ret); + iser_connect_error(cma_id); + } + return; +} + +static void iser_route_handler(struct rdma_cm_id *cma_id) +{ + struct rdma_conn_param conn_param; + int ret; + + ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); + if (ret) + goto failure; + + iser_dbg("path.mtu is %d setting it to %d\n", + cma_id->route.path_rec->mtu, IB_MTU_1024); + + /* we must set the MTU to 1024 as this is what the target is assuming */ + if (cma_id->route.path_rec->mtu > IB_MTU_1024) + cma_id->route.path_rec->mtu = IB_MTU_1024; + + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 4; + conn_param.initiator_depth = 1; + conn_param.retry_count = 7; + conn_param.rnr_retry_count = 6; + + ret = rdma_connect(cma_id, &conn_param); + if (ret) { + iser_err("failure connecting: %d\n", ret); + goto failure; + } + + return; +failure: + 
iser_connect_error(cma_id); +} + +static void iser_connected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; + ib_conn->state = ISER_CONN_UP; + wake_up_interruptible(&ib_conn->wait); +} + +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) +{ + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; + ib_conn->disc_evt_flag = 1; + + /* getting here when the state is UP means that the conn is being * + * terminated asynchronously from the iSCSI layer's perspective. */ + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + + /* Complete the termination process if no posts are pending */ + if ((atomic_read(&ib_conn->post_recv_buf_count) == 0) && + (atomic_read(&ib_conn->post_send_buf_count) == 0)) { + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); + } +} + +static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) +{ + int ret = 0; + + iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + iser_addr_handler(cma_id); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + iser_route_handler(cma_id); + break; + case RDMA_CM_EVENT_ESTABLISHED: + iser_connected_handler(cma_id); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + iser_err("event: %d, error: %d\n", event->event, event->status); + iser_connect_error(cma_id); + break; + case RDMA_CM_EVENT_DISCONNECTED: + iser_disconnected_handler(cma_id); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + BUG(); + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + BUG(); + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + default: + break; + } + return ret; +} + +int 
iser_conn_init(struct iser_conn **ibconn) +{ + struct iser_conn *ib_conn; + + ib_conn = kzalloc(sizeof *ib_conn, GFP_KERNEL); + if (!ib_conn) { + iser_err("can't alloc memory for struct iser_conn\n"); + return -ENOMEM; + } + ib_conn->state = ISER_CONN_INIT; + init_waitqueue_head(&ib_conn->wait); + atomic_set(&ib_conn->post_recv_buf_count, 0); + atomic_set(&ib_conn->post_send_buf_count, 0); + INIT_WORK(&ib_conn->comperror_work, iser_comp_error_worker, + ib_conn); + INIT_LIST_HEAD(&ib_conn->conn_list); + spin_lock_init(&ib_conn->lock); + + *ibconn = ib_conn; + return 0; +} + + /** + * starts the process of connecting to the target + * sleeps untill the connection is established or rejected + */ +int iser_connect(struct iser_conn *ib_conn, + struct sockaddr_in *src_addr, + struct sockaddr_in *dst_addr, + int non_blocking) +{ + struct sockaddr *src, *dst; + int err = 0; + + sprintf(ib_conn->name,"%d.%d.%d.%d:%d", + NIPQUAD(dst_addr->sin_addr.s_addr), dst_addr->sin_port); + + /* the device is known only --after-- address resolution */ + ib_conn->device = NULL; + + iser_err("connecting to: %d.%d.%d.%d, port 0x%x\n", + NIPQUAD(dst_addr->sin_addr), dst_addr->sin_port); + + ib_conn->state = ISER_CONN_PENDING; + + ib_conn->cma_id = rdma_create_id(iser_cma_handler, + (void *)ib_conn, + RDMA_PS_TCP); + if (IS_ERR(ib_conn->cma_id)) { + err = PTR_ERR(ib_conn->cma_id); + iser_err("rdma_create_id failed: %d\n", err); + goto id_failure; + } + + src = (struct sockaddr *)src_addr; + dst = (struct sockaddr *)dst_addr; + err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000); + if (err) { + iser_err("rdma_resolve_addr failed: %d\n", err); + goto addr_failure; + } + + if (!non_blocking) { + wait_event_interruptible(ib_conn->wait, + (ib_conn->state != ISER_CONN_PENDING)); + + if (ib_conn->state != ISER_CONN_UP) { + err = -EIO; + goto connect_failure; + } + } + + mutex_lock(&ig.connlist_mutex); + list_add(&ib_conn->conn_list, &ig.connlist); + mutex_unlock(&ig.connlist_mutex); + return 
0; + +id_failure: + ib_conn->cma_id = NULL; +addr_failure: + ib_conn->state = ISER_CONN_DOWN; +connect_failure: + iser_conn_release(ib_conn); + return err; +} + +/** + * Frees all conn objects and deallocs conn descriptor + */ +void iser_conn_release(struct iser_conn *ib_conn) +{ + struct iser_device *device = ib_conn->device; + + BUG_ON(ib_conn->state != ISER_CONN_DOWN); + + mutex_lock(&ig.connlist_mutex); + list_del(&ib_conn->conn_list); + mutex_unlock(&ig.connlist_mutex); + + iser_free_ib_conn_res(ib_conn); + ib_conn->device = NULL; + /* on EVENT_ADDR_ERROR there's no device yet for this conn */ + if (device != NULL) + iser_device_try_release(device); + kfree(ib_conn); +} + + +/** + * iser_reg_page_vec - Register physical memory + * + * returns: 0 on success, errno code on failure + */ +int iser_reg_page_vec(struct iser_conn *ib_conn, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg) +{ + struct ib_pool_fmr *mem; + u64 io_addr; + u64 *page_list; + int status; + + page_list = page_vec->pages; + io_addr = page_list[0]; + + mem = ib_fmr_pool_map_phys(ib_conn->fmr_pool, + page_list, + page_vec->length, + &io_addr); + + if (IS_ERR(mem)) { + status = (int)PTR_ERR(mem); + iser_err("ib_fmr_pool_map_phys failed: %d\n", status); + return status; + } + + mem_reg->lkey = mem->fmr->lkey; + mem_reg->rkey = mem->fmr->rkey; + mem_reg->len = page_vec->length * PAGE_SIZE; + mem_reg->va = io_addr; + mem_reg->mem_h = (void *)mem; + + mem_reg->va += page_vec->offset; + mem_reg->len = page_vec->data_size; + + iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " + "entry[0]: (0x%08lx,%ld)] -> " + "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", + page_vec, page_vec->length, + (unsigned long)page_vec->pages[0], + (unsigned long)page_vec->data_size, + (unsigned int)mem_reg->lkey, mem_reg->mem_h, + (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); + return 0; +} + +/** + * Unregister (previosuly registered) memory. 
+ */ +void iser_unreg_mem(struct iser_mem_reg *reg) +{ + int ret; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +/** + * iser_dto_to_iov - builds IOV from a dto descriptor + */ +static void iser_dto_to_iov(struct iser_dto *dto, struct ib_sge *iov, int iov_len) +{ + int i; + struct ib_sge *sge; + struct iser_regd_buf *regd_buf; + + if (dto->regd_vector_len > iov_len) { + iser_err("iov size %d too small for posting dto of len %d\n", + iov_len, dto->regd_vector_len); + BUG(); + } + + for (i = 0; i < dto->regd_vector_len; i++) { + sge = &iov[i]; + regd_buf = dto->regd[i]; + + sge->addr = regd_buf->reg.va; + sge->length = regd_buf->reg.len; + sge->lkey = regd_buf->reg.lkey; + + if (dto->used_sz[i] > 0) /* Adjust size */ + sge->length = dto->used_sz[i]; + + /* offset and length should not exceed the regd buf length */ + if (sge->length + dto->offset[i] > regd_buf->reg.len) { + iser_err("Used len:%ld + offset:%d, exceed reg.buf.len:" + "%ld in dto:0x%p [%d], va:0x%08lX\n", + (unsigned long)sge->length, dto->offset[i], + (unsigned long)regd_buf->reg.len, dto, i, + (unsigned long)sge->addr); + BUG(); + } + + sge->addr += dto->offset[i]; /* Adjust offset */ + } +} + +/** + * iser_post_recv - Posts a receive buffer. 
+ * + * returns 0 on success, -1 on failure + */ +int iser_post_recv(struct iser_desc *rx_desc) +{ + int ib_ret, ret_val = 0; + struct ib_recv_wr recv_wr, *recv_wr_failed; + struct ib_sge iov[2]; + struct iser_conn *ib_conn; + struct iser_dto *recv_dto = &rx_desc->dto; + + /* Retrieve conn */ + ib_conn = recv_dto->conn->ib_conn; + + iser_dto_to_iov(recv_dto, iov, 2); + + recv_wr.next = NULL; + recv_wr.sg_list = iov; + recv_wr.num_sge = recv_dto->regd_vector_len; + recv_wr.wr_id = (unsigned long)rx_desc; + + atomic_inc(&ib_conn->post_recv_buf_count); + ib_ret = ib_post_recv(ib_conn->qp, &recv_wr, &recv_wr_failed); + if (ib_ret) { + iser_err("ib_post_recv failed ret=%d\n", ib_ret); + atomic_dec(&ib_conn->post_recv_buf_count); + ret_val = -1; + } + + return ret_val; +} + +/** + * iser_start_send - Initiate a Send DTO operation + * + * returns 0 on success, -1 on failure + */ +int iser_post_send(struct iser_desc *tx_desc) +{ + int ib_ret, ret_val = 0; + struct ib_send_wr send_wr, *send_wr_failed; + struct ib_sge iov[MAX_REGD_BUF_VECTOR_LEN]; + struct iser_conn *ib_conn; + struct iser_dto *dto = &tx_desc->dto; + + ib_conn = dto->conn->ib_conn; + + iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN); + + send_wr.next = NULL; + send_wr.wr_id = (unsigned long)tx_desc; + send_wr.sg_list = iov; + send_wr.num_sge = dto->regd_vector_len; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = dto->notify_enable ? 
IB_SEND_SIGNALED : 0; + + atomic_inc(&ib_conn->post_send_buf_count); + + ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); + if (ib_ret) { + iser_err("Failed to start SEND DTO, dto: 0x%p, IOV len: %d\n", + dto, dto->regd_vector_len); + iser_err("ib_post_send failed, ret:%d\n", ib_ret); + atomic_dec(&ib_conn->post_send_buf_count); + ret_val = -1; + } + + return ret_val; +} + +static void iser_comp_error_worker(void *data) +{ + struct iser_conn *ib_conn = data; + + /* getting here when the state is UP means that the conn is being * + * terminated asynchronously from the iSCSI layer's perspective. */ + if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, + ISER_CONN_TERMINATING)) + iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + + /* complete the termination process if disconnect event was delivered * + * note there are no more non completed posts to the QP */ + if (ib_conn->disc_evt_flag) { + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); + } +} + +static void iser_handle_comp_error(struct iser_desc *desc) +{ + struct iser_dto *dto = &desc->dto; + struct iser_conn *ib_conn = dto->conn->ib_conn; + + iser_dto_buffs_release(dto); + + if (desc->type == ISCSI_RX) { + kfree(desc->data); + kmem_cache_free(ig.desc_cache, desc); + atomic_dec(&ib_conn->post_recv_buf_count); + } else { /* type is TX control/command/dataout */ + if (desc->type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, desc); + atomic_dec(&ib_conn->post_send_buf_count); + } + + if (atomic_read(&ib_conn->post_recv_buf_count) == 0 && + atomic_read(&ib_conn->post_send_buf_count) == 0) + schedule_work(&ib_conn->comperror_work); +} + +static void iser_cq_tasklet_fn(unsigned long data) +{ + struct iser_device *device = (struct iser_device *)data; + struct ib_cq *cq = device->cq; + struct ib_wc wc; + struct iser_desc *desc; + unsigned long xfer_len; + + while (ib_poll_cq(cq, 1, &wc) == 1) { + desc = (struct iser_desc *) (unsigned long) 
wc.wr_id; + BUG_ON(desc == NULL); + + if (wc.status == IB_WC_SUCCESS) { + if (desc->type == ISCSI_RX) { + xfer_len = (unsigned long)wc.byte_len; + iser_rcv_completion(desc, xfer_len); + } else /* type == ISCSI_TX_CONTROL/SCSI_CMD/DOUT */ + iser_snd_completion(desc); + } else { + iser_err("comp w. error op %d status %d\n",desc->type,wc.status); + iser_handle_comp_error(desc); + } + } + /* #warning "it is assumed here that arming CQ only once its empty" * + * " would not cause interrupts to be missed" */ + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +} + +static void iser_cq_callback(struct ib_cq *cq, void *cq_context) +{ + struct iser_device *device = (struct iser_device *)cq_context; + + tasklet_schedule(&device->cq_tasklet); +} |