diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-03-26 15:54:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-03-26 15:54:36 -0700 |
commit | 13220a94d35708d5378114e96ffcc88d0a74fe99 (patch) | |
tree | be6530677d5f9536c7211e05ba012923e4c0b307 /net/rds/ib.c | |
parent | 8690d8a9f6c2d5728a9c9f68231f1bb4de109e3a (diff) | |
parent | 08abe18af1f78ee80c3c3a5ac47c3e0ae0beadf6 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1750 commits)
ixgbe: Allow Priority Flow Control settings to survive a device reset
net: core: remove unneeded include in net/core/utils.c.
e1000e: update version number
e1000e: fix close interrupt race
e1000e: fix loss of multicast packets
e1000e: commonize tx cleanup routine to match e1000 & igb
netfilter: fix nf_logger name in ebt_ulog.
netfilter: fix warning in ebt_ulog init function.
netfilter: fix warning about invalid const usage
e1000: fix close race with interrupt
e1000: cleanup clean_tx_irq routine so that it completely cleans ring
e1000: fix tx hang detect logic and address dma mapping issues
bridge: bad error handling when adding invalid ether address
bonding: select current active slave when enslaving device for mode tlb and alb
gianfar: reallocate skb when headroom is not enough for fcb
Bump release date to 25Mar2009 and version to 0.22
r6040: Fix second PHY address
qeth: fix wait_event_timeout handling
qeth: check for completion of a running recovery
qeth: unregister MAC addresses during recovery.
...
Manually fixed up conflicts in:
drivers/infiniband/hw/cxgb3/cxio_hal.h
drivers/infiniband/hw/nes/nes_nic.c
Diffstat (limited to 'net/rds/ib.c')
-rw-r--r-- | net/rds/ib.c | 323 |
1 files changed, 323 insertions, 0 deletions
diff --git a/net/rds/ib.c b/net/rds/ib.c new file mode 100644 index 00000000000..06a7b798d9a --- /dev/null +++ b/net/rds/ib.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2006 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include <linux/kernel.h> +#include <linux/in.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/if_arp.h> +#include <linux/delay.h> + +#include "rds.h" +#include "ib.h" + +unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ + +module_param(fmr_pool_size, int, 0444); +MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); +module_param(fmr_message_size, int, 0444); +MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); + +struct list_head rds_ib_devices; + +DEFINE_SPINLOCK(ib_nodev_conns_lock); +LIST_HEAD(ib_nodev_conns); + +void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); + rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift; + rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1); + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + rds_ibdev->max_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : + fmr_pool_size; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) + goto free_dev; + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) + goto err_pd; + + rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); + if (IS_ERR(rds_ibdev->mr_pool)) { + rds_ibdev->mr_pool = NULL; + goto err_mr; + } + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + + goto free_attr; + +err_mr: + ib_dereg_mr(rds_ibdev->mr); +err_pd: + ib_dealloc_pd(rds_ibdev->pd); +free_dev: + kfree(rds_ibdev); +free_attr: + kfree(dev_attr); +} + +void rds_ib_remove_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct rds_ib_ipaddr *i_ipaddr, *i_next; + + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (!rds_ibdev) + return; + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + rds_ib_remove_conns(rds_ibdev); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + + ib_dereg_mr(rds_ibdev->mr); + + while (ib_dealloc_pd(rds_ibdev->pd)) { + rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); + msleep(1); + } + + list_del(&rds_ibdev->list); + kfree(rds_ibdev); +} + +struct ib_client rds_ib_client = { + .name = "rds_ib", + .add = rds_ib_add_one, + .remove = rds_ib_remove_one +}; + +static int rds_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds_info_rdma_connection *iinfo = buffer; + struct rds_ib_connection *ic; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo->src_addr = conn->c_laddr; + iinfo->dst_addr = conn->c_faddr; + + memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); + memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + + ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); + ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + iinfo->max_send_wr = ic->i_send_ring.w_nr; + iinfo->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo->max_send_sge = rds_ibdev->max_sge; + rds_ib_get_mr_info(rds_ibdev, iinfo); + } + return 1; +} + +static void rds_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds_ib_conn_info_visitor, + sizeof(struct rds_info_rdma_connection)); +} + + +/* + * Early RDS/IB was built to only bind to an address if there is an IPoIB + * device with that address set. + * + * If it were me, I'd advocate for something more flexible. Sending and + * receiving should be device-agnostic. Transports would try and maintain + * connections between peers who have messages queued. Userspace would be + * allowed to influence which paths have priority. We could call userspace + * asserting this policy "routing". + */ +static int rds_ib_laddr_check(__be32 addr) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin; + + /* Create a CMA ID and try to bind it. This catches both + * IB and iWARP capable NICs. + */ + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + if (!cm_id) + return -EADDRNOTAVAIL; + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + + /* rdma_bind_addr will only succeed for IB & iWARP devices */ + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + /* due to this, we will claim to support iWARP devices unless we + check node_type. */ + if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + ret = -EADDRNOTAVAIL; + + rdsdebug("addr %pI4 ret %d node type %d\n", + &addr, ret, + cm_id->device ? cm_id->device->node_type : -1); + + rdma_destroy_id(cm_id); + + return ret; +} + +void rds_ib_exit(void) +{ + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_remove_nodev_conns(); + ib_unregister_client(&rds_ib_client); + rds_ib_sysctl_exit(); + rds_ib_recv_exit(); + rds_trans_unregister(&rds_ib_transport); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_cong_map = NULL, + .xmit_rdma = rds_ib_xmit_rdma, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_purge = rds_ib_inc_purge, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .t_owner = THIS_MODULE, + .t_name = "infiniband", +}; + +int __init rds_ib_init(void) +{ + int ret; + + INIT_LIST_HEAD(&rds_ib_devices); + + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out; + + ret = rds_ib_sysctl_init(); + if (ret) + goto out_ibreg; + + ret = rds_ib_recv_init(); + if (ret) + goto out_sysctl; + + ret = rds_trans_register(&rds_ib_transport); + if (ret) + goto out_recv; + + rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + + goto out; + +out_recv: + rds_ib_recv_exit(); +out_sysctl: + rds_ib_sysctl_exit(); +out_ibreg: + ib_unregister_client(&rds_ib_client); +out: + return ret; +} + +MODULE_LICENSE("GPL"); + |