summaryrefslogtreecommitdiffstats
path: root/net/sunrpc
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-13 18:13:19 -0600
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-08-13 18:13:19 -0600
commit: 06b8ab55289345ab191bf4bf0e4acc6d4bdf293d (patch)
tree: 9af9215097e26c026f30a58c6ca3092ec15d1e1e /net/sunrpc
parent: dc1cc85133120e49c223f36aa77d398b8abac727 (diff)
parent: 71a6ec8ac587418ceb6b420def1ca44b334c1ff7 (diff)
Merge tag 'nfs-for-3.17-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

   - stable fix for a bug in nfs3_list_one_acl()
   - speed up NFS path walks by supporting LOOKUP_RCU
   - more read/write code cleanups
   - pNFS fixes for layout return on close
   - fixes for the RCU handling in the rpcsec_gss code
   - more NFS/RDMA fixes"

* tag 'nfs-for-3.17-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (79 commits)
  nfs: reject changes to resvport and sharecache during remount
  NFS: Avoid infinite loop when RELEASE_LOCKOWNER getting expired error
  SUNRPC: remove all refcounting of groupinfo from rpcauth_lookupcred
  NFS: fix two problems in lookup_revalidate in RCU-walk
  NFS: allow lockless access to access_cache
  NFS: teach nfs_lookup_verify_inode to handle LOOKUP_RCU
  NFS: teach nfs_neg_need_reval to understand LOOKUP_RCU
  NFS: support RCU_WALK in nfs_permission()
  sunrpc/auth: allow lockless (rcu) lookup of credential cache.
  NFS: prepare for RCU-walk support but pushing tests later in code.
  NFS: nfs4_lookup_revalidate: only evaluate parent if it will be used.
  NFS: add checks for returned value of try_module_get()
  nfs: clear_request_commit while holding i_lock
  pnfs: add pnfs_put_lseg_async
  pnfs: find swapped pages on pnfs commit lists too
  nfs: fix comment and add warn_on for PG_INODE_REF
  nfs: check wait_on_bit_lock err in page_group_lock
  sunrpc: remove "ec" argument from encrypt_v2 operation
  sunrpc: clean up sparse endianness warnings in gss_krb5_wrap.c
  sunrpc: clean up sparse endianness warnings in gss_krb5_seal.c
  ...
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- net/sunrpc/addr.c 16
-rw-r--r-- net/sunrpc/auth.c 68
-rw-r--r-- net/sunrpc/auth_generic.c 6
-rw-r--r-- net/sunrpc/auth_gss/auth_gss.c 126
-rw-r--r-- net/sunrpc/auth_gss/gss_krb5_crypto.c 9
-rw-r--r-- net/sunrpc/auth_gss/gss_krb5_seal.c 28
-rw-r--r-- net/sunrpc/auth_gss/gss_krb5_wrap.c 20
-rw-r--r-- net/sunrpc/auth_null.c 2
-rw-r--r-- net/sunrpc/clnt.c 5
-rw-r--r-- net/sunrpc/rpc_pipe.c 2
-rw-r--r-- net/sunrpc/xprt.c 1
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c 83
-rw-r--r-- net/sunrpc/xprtrdma/transport.c 17
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c 739
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h 61
-rw-r--r-- net/sunrpc/xprtsock.c 9
16 files changed, 809 insertions, 383 deletions
diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c
index a622ad64acd..2e0a6f92e56 100644
--- a/net/sunrpc/addr.c
+++ b/net/sunrpc/addr.c
@@ -176,7 +176,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
len = (buf + buflen) - delim - 1;
p = kstrndup(delim + 1, len, GFP_KERNEL);
if (p) {
- unsigned long scope_id = 0;
+ u32 scope_id = 0;
struct net_device *dev;
dev = dev_get_by_name(net, p);
@@ -184,7 +184,7 @@ static int rpc_parse_scope_id(struct net *net, const char *buf,
scope_id = dev->ifindex;
dev_put(dev);
} else {
- if (strict_strtoul(p, 10, &scope_id) == 0) {
+ if (kstrtou32(p, 10, &scope_id) == 0) {
kfree(p);
return 0;
}
@@ -304,7 +304,7 @@ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags)
* @sap: buffer into which to plant socket address
* @salen: size of buffer
*
- * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and
+ * @uaddr does not have to be '\0'-terminated, but kstrtou8() and
* rpc_pton() require proper string termination to be successful.
*
* Returns the size of the socket address if successful; otherwise
@@ -315,7 +315,7 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
const size_t salen)
{
char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')];
- unsigned long portlo, porthi;
+ u8 portlo, porthi;
unsigned short port;
if (uaddr_len > RPCBIND_MAXUADDRLEN)
@@ -327,18 +327,14 @@ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr,
c = strrchr(buf, '.');
if (unlikely(c == NULL))
return 0;
- if (unlikely(strict_strtoul(c + 1, 10, &portlo) != 0))
- return 0;
- if (unlikely(portlo > 255))
+ if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0))
return 0;
*c = '\0';
c = strrchr(buf, '.');
if (unlikely(c == NULL))
return 0;
- if (unlikely(strict_strtoul(c + 1, 10, &porthi) != 0))
- return 0;
- if (unlikely(porthi > 255))
+ if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0))
return 0;
port = (unsigned short)((porthi << 8) | portlo);
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index f7736671742..383eb919ac0 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -48,7 +48,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
if (!val)
goto out_inval;
- ret = strict_strtoul(val, 0, &num);
+ ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL)
goto out_inval;
nbits = fls(num);
@@ -80,6 +80,10 @@ static struct kernel_param_ops param_ops_hashtbl_sz = {
module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644);
MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size");
+static unsigned long auth_max_cred_cachesize = ULONG_MAX;
+module_param(auth_max_cred_cachesize, ulong, 0644);
+MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size");
+
static u32
pseudoflavor_to_flavor(u32 flavor) {
if (flavor > RPC_AUTH_MAXFLAVOR)
@@ -363,6 +367,15 @@ rpcauth_cred_key_to_expire(struct rpc_cred *cred)
}
EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire);
+char *
+rpcauth_stringify_acceptor(struct rpc_cred *cred)
+{
+ if (!cred->cr_ops->crstringify_acceptor)
+ return NULL;
+ return cred->cr_ops->crstringify_acceptor(cred);
+}
+EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor);
+
/*
* Destroy a list of credentials
*/
@@ -472,6 +485,20 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
return freed;
}
+static unsigned long
+rpcauth_cache_do_shrink(int nr_to_scan)
+{
+ LIST_HEAD(free);
+ unsigned long freed;
+
+ spin_lock(&rpc_credcache_lock);
+ freed = rpcauth_prune_expired(&free, nr_to_scan);
+ spin_unlock(&rpc_credcache_lock);
+ rpcauth_destroy_credlist(&free);
+
+ return freed;
+}
+
/*
* Run memory cache shrinker.
*/
@@ -479,9 +506,6 @@ static unsigned long
rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
- LIST_HEAD(free);
- unsigned long freed;
-
if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
return SHRINK_STOP;
@@ -489,12 +513,7 @@ rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
if (list_empty(&cred_unused))
return SHRINK_STOP;
- spin_lock(&rpc_credcache_lock);
- freed = rpcauth_prune_expired(&free, sc->nr_to_scan);
- spin_unlock(&rpc_credcache_lock);
- rpcauth_destroy_credlist(&free);
-
- return freed;
+ return rpcauth_cache_do_shrink(sc->nr_to_scan);
}
static unsigned long
@@ -504,6 +523,21 @@ rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
}
+static void
+rpcauth_cache_enforce_limit(void)
+{
+ unsigned long diff;
+ unsigned int nr_to_scan;
+
+ if (number_cred_unused <= auth_max_cred_cachesize)
+ return;
+ diff = number_cred_unused - auth_max_cred_cachesize;
+ nr_to_scan = 100;
+ if (diff < nr_to_scan)
+ nr_to_scan = diff;
+ rpcauth_cache_do_shrink(nr_to_scan);
+}
+
/*
* Look up a process' credentials in the authentication cache
*/
@@ -523,6 +557,12 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
if (!entry->cr_ops->crmatch(acred, entry, flags))
continue;
+ if (flags & RPCAUTH_LOOKUP_RCU) {
+ if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) &&
+ !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags))
+ cred = entry;
+ break;
+ }
spin_lock(&cache->lock);
if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) {
spin_unlock(&cache->lock);
@@ -537,6 +577,9 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
if (cred != NULL)
goto found;
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return ERR_PTR(-ECHILD);
+
new = auth->au_ops->crcreate(auth, acred, flags);
if (IS_ERR(new)) {
cred = new;
@@ -557,6 +600,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
} else
list_add_tail(&new->cr_lru, &free);
spin_unlock(&cache->lock);
+ rpcauth_cache_enforce_limit();
found:
if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) &&
cred->cr_ops->cr_init != NULL &&
@@ -586,10 +630,8 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags)
memset(&acred, 0, sizeof(acred));
acred.uid = cred->fsuid;
acred.gid = cred->fsgid;
- acred.group_info = get_group_info(((struct cred *)cred)->group_info);
-
+ acred.group_info = cred->group_info;
ret = auth->au_ops->lookup_cred(auth, &acred, flags);
- put_group_info(acred.group_info);
return ret;
}
EXPORT_SYMBOL_GPL(rpcauth_lookupcred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index ed04869b2d4..6f6b829c9e8 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -38,6 +38,12 @@ struct rpc_cred *rpc_lookup_cred(void)
}
EXPORT_SYMBOL_GPL(rpc_lookup_cred);
+struct rpc_cred *rpc_lookup_cred_nonblock(void)
+{
+ return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU);
+}
+EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock);
+
/*
* Public call interface for looking up machine creds.
*/
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index b6e440baccc..afb292cd797 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -183,8 +183,9 @@ gss_cred_get_ctx(struct rpc_cred *cred)
struct gss_cl_ctx *ctx = NULL;
rcu_read_lock();
- if (gss_cred->gc_ctx)
- ctx = gss_get_ctx(gss_cred->gc_ctx);
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ gss_get_ctx(ctx);
rcu_read_unlock();
return ctx;
}
@@ -262,9 +263,22 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
p = ERR_PTR(ret);
goto err;
}
- dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u\n",
- __func__, ctx->gc_expiry, now, timeout);
- return q;
+
+ /* is there any trailing data? */
+ if (q == end) {
+ p = q;
+ goto done;
+ }
+
+ /* pull in acceptor name (if there is one) */
+ p = simple_get_netobj(q, end, &ctx->gc_acceptor);
+ if (IS_ERR(p))
+ goto err;
+done:
+ dprintk("RPC: %s Success. gc_expiry %lu now %lu timeout %u acceptor %.*s\n",
+ __func__, ctx->gc_expiry, now, timeout, ctx->gc_acceptor.len,
+ ctx->gc_acceptor.data);
+ return p;
err:
dprintk("RPC: %s returns error %ld\n", __func__, -PTR_ERR(p));
return p;
@@ -1194,13 +1208,13 @@ gss_destroying_context(struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
struct rpc_task *task;
- if (gss_cred->gc_ctx == NULL ||
- test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
+ if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) == 0)
return 0;
- gss_cred->gc_ctx->gc_proc = RPC_GSS_PROC_DESTROY;
+ ctx->gc_proc = RPC_GSS_PROC_DESTROY;
cred->cr_ops = &gss_nullops;
/* Take a reference to ensure the cred will be destroyed either
@@ -1225,6 +1239,7 @@ gss_do_free_ctx(struct gss_cl_ctx *ctx)
gss_delete_sec_context(&ctx->gc_gss_ctx);
kfree(ctx->gc_wire_ctx.data);
+ kfree(ctx->gc_acceptor.data);
kfree(ctx);
}
@@ -1260,7 +1275,7 @@ gss_destroy_nullcred(struct rpc_cred *cred)
{
struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
struct gss_auth *gss_auth = container_of(cred->cr_auth, struct gss_auth, rpc_auth);
- struct gss_cl_ctx *ctx = gss_cred->gc_ctx;
+ struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1);
RCU_INIT_POINTER(gss_cred->gc_ctx, NULL);
call_rcu(&cred->cr_rcu, gss_free_cred_callback);
@@ -1332,6 +1347,36 @@ gss_cred_init(struct rpc_auth *auth, struct rpc_cred *cred)
return err;
}
+static char *
+gss_stringify_acceptor(struct rpc_cred *cred)
+{
+ char *string = NULL;
+ struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
+ struct xdr_netobj *acceptor;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx)
+ goto out;
+
+ acceptor = &ctx->gc_acceptor;
+
+ /* no point if there's no string */
+ if (!acceptor->len)
+ goto out;
+
+ string = kmalloc(acceptor->len + 1, GFP_KERNEL);
+ if (!string)
+ goto out;
+
+ memcpy(string, acceptor->data, acceptor->len);
+ string[acceptor->len] = '\0';
+out:
+ rcu_read_unlock();
+ return string;
+}
+
/*
* Returns -EACCES if GSS context is NULL or will expire within the
* timeout (miliseconds)
@@ -1340,15 +1385,16 @@ static int
gss_key_timeout(struct rpc_cred *rc)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
unsigned long now = jiffies;
unsigned long expire;
- if (gss_cred->gc_ctx == NULL)
- return -EACCES;
-
- expire = gss_cred->gc_ctx->gc_expiry - (gss_key_expire_timeo * HZ);
-
- if (time_after(now, expire))
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (ctx)
+ expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+ rcu_read_unlock();
+ if (!ctx || time_after(now, expire))
return -EACCES;
return 0;
}
@@ -1357,13 +1403,19 @@ static int
gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx;
int ret;
if (test_bit(RPCAUTH_CRED_NEW, &rc->cr_flags))
goto out;
/* Don't match with creds that have expired. */
- if (time_after(jiffies, gss_cred->gc_ctx->gc_expiry))
+ rcu_read_lock();
+ ctx = rcu_dereference(gss_cred->gc_ctx);
+ if (!ctx || time_after(jiffies, ctx->gc_expiry)) {
+ rcu_read_unlock();
return 0;
+ }
+ rcu_read_unlock();
if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
return 0;
out:
@@ -1909,29 +1961,31 @@ static const struct rpc_authops authgss_ops = {
};
static const struct rpc_credops gss_credops = {
- .cr_name = "AUTH_GSS",
- .crdestroy = gss_destroy_cred,
- .cr_init = gss_cred_init,
- .crbind = rpcauth_generic_bind_cred,
- .crmatch = gss_match,
- .crmarshal = gss_marshal,
- .crrefresh = gss_refresh,
- .crvalidate = gss_validate,
- .crwrap_req = gss_wrap_req,
- .crunwrap_resp = gss_unwrap_resp,
- .crkey_timeout = gss_key_timeout,
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_cred,
+ .cr_init = gss_cred_init,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crkey_timeout = gss_key_timeout,
+ .crstringify_acceptor = gss_stringify_acceptor,
};
static const struct rpc_credops gss_nullops = {
- .cr_name = "AUTH_GSS",
- .crdestroy = gss_destroy_nullcred,
- .crbind = rpcauth_generic_bind_cred,
- .crmatch = gss_match,
- .crmarshal = gss_marshal,
- .crrefresh = gss_refresh_null,
- .crvalidate = gss_validate,
- .crwrap_req = gss_wrap_req,
- .crunwrap_resp = gss_unwrap_resp,
+ .cr_name = "AUTH_GSS",
+ .crdestroy = gss_destroy_nullcred,
+ .crbind = rpcauth_generic_bind_cred,
+ .crmatch = gss_match,
+ .crmarshal = gss_marshal,
+ .crrefresh = gss_refresh_null,
+ .crvalidate = gss_validate,
+ .crwrap_req = gss_wrap_req,
+ .crunwrap_resp = gss_unwrap_resp,
+ .crstringify_acceptor = gss_stringify_acceptor,
};
static const struct rpc_pipe_ops gss_upcall_ops_v0 = {
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 0f43e894bc0..f5ed9f6ece0 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -641,7 +641,7 @@ out:
u32
gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
- struct xdr_buf *buf, int ec, struct page **pages)
+ struct xdr_buf *buf, struct page **pages)
{
u32 err;
struct xdr_netobj hmac;
@@ -684,13 +684,8 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
ecptr = buf->tail[0].iov_base;
}
- memset(ecptr, 'X', ec);
- buf->tail[0].iov_len += ec;
- buf->len += ec;
-
/* copy plaintext gss token header after filler (if any) */
- memcpy(ecptr + ec, buf->head[0].iov_base + offset,
- GSS_KRB5_TOK_HDR_LEN);
+ memcpy(ecptr, buf->head[0].iov_base + offset, GSS_KRB5_TOK_HDR_LEN);
buf->tail[0].iov_len += GSS_KRB5_TOK_HDR_LEN;
buf->len += GSS_KRB5_TOK_HDR_LEN;
diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 62ae3273186..42768e5c399 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -70,31 +70,37 @@
DEFINE_SPINLOCK(krb5_seq_lock);
-static char *
+static void *
setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token)
{
- __be16 *ptr, *krb5_hdr;
+ u16 *ptr;
+ void *krb5_hdr;
int body_size = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
token->len = g_token_size(&ctx->mech_used, body_size);
- ptr = (__be16 *)token->data;
+ ptr = (u16 *)token->data;
g_make_token_header(&ctx->mech_used, body_size, (unsigned char **)&ptr);
/* ptr now at start of header described in rfc 1964, section 1.2.1: */
krb5_hdr = ptr;
*ptr++ = KG_TOK_MIC_MSG;
- *ptr++ = cpu_to_le16(ctx->gk5e->signalg);
+ /*
+ * signalg is stored as if it were converted from LE to host endian, even
+ * though it's an opaque pair of bytes according to the RFC.
+ */
+ *ptr++ = (__force u16)cpu_to_le16(ctx->gk5e->signalg);
*ptr++ = SEAL_ALG_NONE;
- *ptr++ = 0xffff;
+ *ptr = 0xffff;
- return (char *)krb5_hdr;
+ return krb5_hdr;
}
static void *
setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
{
- __be16 *ptr, *krb5_hdr;
+ u16 *ptr;
+ void *krb5_hdr;
u8 *p, flags = 0x00;
if ((ctx->flags & KRB5_CTX_FLAG_INITIATOR) == 0)
@@ -104,15 +110,15 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token)
/* Per rfc 4121, sec 4.2.6.1, there is no header,
* just start the token */
- krb5_hdr = ptr = (__be16 *)token->data;
+ krb5_hdr = ptr = (u16 *)token->data;
*ptr++ = KG2_TOK_MIC;
p = (u8 *)ptr;
*p++ = flags;
*p++ = 0xff;
- ptr = (__be16 *)p;
- *ptr++ = 0xffff;
+ ptr = (u16 *)p;
*ptr++ = 0xffff;
+ *ptr = 0xffff;
token->len = GSS_KRB5_TOK_HDR_LEN + ctx->gk5e->cksumlength;
return krb5_hdr;
@@ -181,7 +187,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
spin_lock(&krb5_seq_lock);
seq_send = ctx->seq_send64++;
spin_unlock(&krb5_seq_lock);
- *((u64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+ *((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
if (ctx->initiate) {
cksumkey = ctx->initiator_sign;
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 42560e55d97..4b614c604fe 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -201,9 +201,15 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
msg_start = ptr + GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength;
- *(__be16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
- memset(ptr + 4, 0xff, 4);
- *(__be16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+ /*
+ * signalg and sealalg are stored as if they were converted from LE
+ * to host endian, even though they're opaque pairs of bytes according
+ * to the RFC.
+ */
+ *(__le16 *)(ptr + 2) = cpu_to_le16(kctx->gk5e->signalg);
+ *(__le16 *)(ptr + 4) = cpu_to_le16(kctx->gk5e->sealalg);
+ ptr[6] = 0xff;
+ ptr[7] = 0xff;
gss_krb5_make_confounder(msg_start, conflen);
@@ -438,7 +444,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
u8 *ptr, *plainhdr;
s32 now;
u8 flags = 0x00;
- __be16 *be16ptr, ec = 0;
+ __be16 *be16ptr;
__be64 *be64ptr;
u32 err;
@@ -468,16 +474,16 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
be16ptr = (__be16 *)ptr;
blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
- *be16ptr++ = cpu_to_be16(ec);
+ *be16ptr++ = 0;
/* "inner" token header always uses 0 for RRC */
- *be16ptr++ = cpu_to_be16(0);
+ *be16ptr++ = 0;
be64ptr = (__be64 *)be16ptr;
spin_lock(&krb5_seq_lock);
*be64ptr = cpu_to_be64(kctx->seq_send64++);
spin_unlock(&krb5_seq_lock);
- err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, ec, pages);
+ err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages);
if (err)
return err;
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index f0ebe07978a..712c123e04e 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -35,6 +35,8 @@ nul_destroy(struct rpc_auth *auth)
static struct rpc_cred *
nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
{
+ if (flags & RPCAUTH_LOOKUP_RCU)
+ return &null_cred;
return get_rpccred(&null_cred);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e6ab10734f..488ddeed936 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1746,6 +1746,7 @@ call_bind_status(struct rpc_task *task)
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
+ case -ENOBUFS:
case -EPIPE:
dprintk("RPC: %5u remote rpcbind unreachable: %d\n",
task->tk_pid, task->tk_status);
@@ -1812,6 +1813,8 @@ call_connect_status(struct rpc_task *task)
case -ECONNABORTED:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -ENOBUFS:
+ case -EPIPE:
if (RPC_IS_SOFTCONN(task))
break;
/* retry with existing socket, after a delay */
@@ -1918,6 +1921,7 @@ call_transmit_status(struct rpc_task *task)
case -ECONNRESET:
case -ECONNABORTED:
case -ENOTCONN:
+ case -ENOBUFS:
case -EPIPE:
rpc_task_force_reencode(task);
}
@@ -2034,6 +2038,7 @@ call_status(struct rpc_task *task)
case -ECONNRESET:
case -ECONNABORTED:
rpc_force_rebind(clnt);
+ case -ENOBUFS:
rpc_delay(task, 3*HZ);
case -EPIPE:
case -ENOTCONN:
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index b1855489856..2d12b76b5a6 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -195,7 +195,7 @@ static struct inode *
rpc_alloc_inode(struct super_block *sb)
{
struct rpc_inode *rpci;
- rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
+ rpci = kmem_cache_alloc(rpc_inode_cachep, GFP_KERNEL);
if (!rpci)
return NULL;
return &rpci->vfs_inode;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 51c63165073..56e4e150e80 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -744,6 +744,7 @@ static void xprt_connect_status(struct rpc_task *task)
case -ECONNABORTED:
case -ENETUNREACH:
case -EHOSTUNREACH:
+ case -EPIPE:
case -EAGAIN:
dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
break;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 693966d3f33..6166c985fe2 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-enum rpcrdma_chunktype {
- rpcrdma_noch = 0,
- rpcrdma_readch,
- rpcrdma_areadch,
- rpcrdma_writech,
- rpcrdma_replych
-};
-
#ifdef RPC_DEBUG
static const char transfertypes[][12] = {
"pure inline", /* no chunks */
@@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
- for (pos = 0; nchunks--;)
- pos += rpcrdma_deregister_external(
- &req->rl_segments[pos], r_xprt);
+ if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
+ for (pos = 0; nchunks--;)
+ pos += rpcrdma_deregister_external(
+ &req->rl_segments[pos], r_xprt);
+ }
return n;
}
/*
+ * Marshal chunks. This routine returns the header length
+ * consumed by marshaling.
+ *
+ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+ssize_t
+rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
+{
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
+
+ if (req->rl_rtype != rpcrdma_noch)
+ result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+ headerp, req->rl_rtype);
+ else if (req->rl_wtype != rpcrdma_noch)
+ result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+ headerp, req->rl_wtype);
+ return result;
+}
+
+/*
* Copy write data inline.
* This function is used for "small" requests. Data which is passed
* to RPC via iovecs (or page list) is copied directly into the
@@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
char *base;
size_t rpclen, padlen;
ssize_t hdrlen;
- enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
/*
@@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* into pages; otherwise use reply chunks.
*/
if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- wtype = rpcrdma_noch;
+ req->rl_wtype = rpcrdma_noch;
else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
+ req->rl_wtype = rpcrdma_replych;
else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
- wtype = rpcrdma_writech;
+ req->rl_wtype = rpcrdma_writech;
else
- wtype = rpcrdma_replych;
+ req->rl_wtype = rpcrdma_replych;
/*
* Chunks needed for arguments?
@@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* TBD check NFSv4 setacl
*/
if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
- rtype = rpcrdma_noch;
+ req->rl_rtype = rpcrdma_noch;
else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
+ req->rl_rtype = rpcrdma_areadch;
else
- rtype = rpcrdma_readch;
+ req->rl_rtype = rpcrdma_readch;
/* The following simplification is not true forever */
- if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
- wtype = rpcrdma_noch;
- if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
+ if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
+ req->rl_wtype = rpcrdma_noch;
+ if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
__func__);
return -EIO;
@@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* When padding is in use and applies to the transfer, insert
* it and change the message type.
*/
- if (rtype == rpcrdma_noch) {
+ if (req->rl_rtype == rpcrdma_noch) {
padlen = rpcrdma_inline_pullup(rqst,
RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
+ if (req->rl_wtype != rpcrdma_noch) {
dprintk("RPC: %s: invalid chunk list\n",
__func__);
return -EIO;
@@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* on receive. Therefore, we request a reply chunk
* for non-writes wherever feasible and efficient.
*/
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
+ if (req->rl_wtype == rpcrdma_noch)
+ req->rl_wtype = rpcrdma_replych;
}
}
- /*
- * Marshal chunks. This routine will return the header length
- * consumed by marshaling.
- */
- if (rtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst,
- &rqst->rq_snd_buf, headerp, rtype);
- wtype = rtype; /* simplify dprintk */
-
- } else if (wtype != rpcrdma_noch) {
- hdrlen = rpcrdma_create_chunks(rqst,
- &rqst->rq_rcv_buf, headerp, wtype);
- }
+ hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
if (hdrlen < 0)
return hdrlen;
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
headerp, base, req->rl_iov.lkey);
/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 66f91f0d071..2faac494056 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args)
xprt->resvport = 0; /* privileged port not needed */
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
- xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE;
xprt->ops = &xprt_rdma_procs;
/*
@@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args)
new_ep->rep_xprt = xprt;
xprt_rdma_format_addresses(xprt);
+ xprt->max_payload = rpcrdma_max_payload(new_xprt);
+ dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
+ __func__, xprt->max_payload);
if (!try_module_get(THIS_MODULE))
goto out4;
@@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt)
if (r_xprt->rx_ep.rep_connected > 0)
xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
- (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
}
static void
@@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int rc;
+ int rc = 0;
- if (req->rl_niovs == 0) {
+ if (req->rl_niovs == 0)
rc = rpcrdma_marshal_req(rqst);
- if (rc < 0)
- goto failed_marshal;
- }
+ else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+ rc = rpcrdma_marshal_chunks(rqst, 0);
+ if (rc < 0)
+ goto failed_marshal;
if (req->rl_reply == NULL) /* e.g. reconnection */
rpcrdma_recv_buffer_get(req);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 13dbd1c389f..61c41298b4e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -61,6 +61,8 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
+
/*
* internal functions
*/
@@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data)
static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
-static inline void
-rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
- tasklet_schedule(&rpcrdma_tasklet_g);
-}
-
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
@@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
if (wc->wr_id == 0ULL)
return;
if (wc->status != IB_WC_SUCCESS)
- return;
-
- if (wc->opcode == IB_WC_FAST_REG_MR)
- frmr->r.frmr.state = FRMR_IS_VALID;
- else if (wc->opcode == IB_WC_LOCAL_INV)
- frmr->r.frmr.state = FRMR_IS_INVALID;
+ frmr->r.frmr.fr_state = FRMR_IS_STALE;
}
static int
@@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
}
static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc)
+rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
struct rpcrdma_rep *rep =
(struct rpcrdma_rep *)(unsigned long)wc->wr_id;
@@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc)
}
out_schedule:
- rpcrdma_schedule_tasklet(rep);
+ list_add_tail(&rep->rr_list, sched_list);
}
static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
+ struct list_head sched_list;
struct ib_wc *wcs;
int budget, count, rc;
+ unsigned long flags;
+ INIT_LIST_HEAD(&sched_list);
budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
do {
wcs = ep->rep_recv_wcs;
rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
if (rc <= 0)
- return rc;
+ goto out_schedule;
count = rc;
while (count-- > 0)
- rpcrdma_recvcq_process_wc(wcs++);
+ rpcrdma_recvcq_process_wc(wcs++, &sched_list);
} while (rc == RPCRDMA_POLLSIZE && --budget);
- return 0;
+ rc = 0;
+
+out_schedule:
+ spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
+ list_splice_tail(&sched_list, &rpcrdma_tasklets_g);
+ spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+ tasklet_schedule(&rpcrdma_tasklet_g);
+ return rc;
}
/*
@@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
rpcrdma_recvcq_poll(cq, ep);
}
+static void
+rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
+{
+ rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep);
+ rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep);
+}
+
#ifdef RPC_DEBUG
static const char * const conn[] = {
"address resolved",
@@ -323,8 +326,16 @@ static const char * const conn[] = {
"rejected",
"established",
"disconnected",
- "device removal"
+ "device removal",
+ "multicast join",
+ "multicast error",
+ "address change",
+ "timewait exit",
};
+
+#define CONNECTION_MSG(status) \
+ ((status) < ARRAY_SIZE(conn) ? \
+ conn[(status)] : "unrecognized connection error")
#endif
static int
@@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DEVICE_REMOVAL:
connstate = -ENODEV;
connected:
- dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
- __func__,
- (event->event <= 11) ? conn[event->event] :
- "unknown connection error",
- &addr->sin_addr.s_addr,
- ntohs(addr->sin_port),
- ep, event->event);
atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
ep->rep_connected = connstate;
ep->rep_func(ep);
wake_up_all(&ep->rep_connect_wait);
- break;
+ /*FALLTHROUGH*/
default:
- dprintk("RPC: %s: unexpected CM event %d\n",
- __func__, event->event);
+ dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
+ __func__, &addr->sin_addr.s_addr,
+ ntohs(addr->sin_port), ep,
+ CONNECTION_MSG(event->event));
break;
}
@@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (!ia->ri_id->device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
-#if RPCRDMA_PERSISTENT_REGISTRATION
memreg = RPCRDMA_ALLPHYSICAL;
-#else
- rc = -ENOMEM;
- goto out2;
-#endif
}
}
@@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
switch (memreg) {
case RPCRDMA_FRMR:
break;
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
mem_priv = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ;
goto register_setup;
-#endif
case RPCRDMA_MTHCAFMR:
if (ia->ri_have_dma_lkey)
break;
mem_priv = IB_ACCESS_LOCAL_WRITE;
-#if RPCRDMA_PERSISTENT_REGISTRATION
register_setup:
-#endif
ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
if (IS_ERR(ia->ri_bind_mem)) {
printk(KERN_ALERT "%s: ib_get_dma_mr for "
@@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
+ rwlock_init(&ia->ri_qplock);
return 0;
out2:
rdma_destroy_id(ia->ri_id);
@@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
cancel_delayed_work_sync(&ep->rep_connect_worker);
if (ia->ri_id->qp) {
- rc = rpcrdma_ep_disconnect(ep, ia);
- if (rc)
- dprintk("RPC: %s: rpcrdma_ep_disconnect"
- " returned %i\n", __func__, rc);
+ rpcrdma_ep_disconnect(ep, ia);
rdma_destroy_qp(ia->ri_id);
ia->ri_id->qp = NULL;
}
@@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
- struct rdma_cm_id *id;
+ struct rdma_cm_id *id, *old;
int rc = 0;
int retry_count = 0;
@@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
struct rpcrdma_xprt *xprt;
retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
- rc = rpcrdma_ep_disconnect(ep, ia);
- if (rc && rc != -ENOTCONN)
- dprintk("RPC: %s: rpcrdma_ep_disconnect"
- " status %i\n", __func__, rc);
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_flush_cqs(ep);
+
+ if (ia->ri_memreg_strategy == RPCRDMA_FRMR)
+ rpcrdma_reset_frmrs(ia);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
id = rpcrdma_create_id(xprt, ia,
@@ -905,9 +899,14 @@ retry:
rc = -ENETUNREACH;
goto out;
}
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
+
+ write_lock(&ia->ri_qplock);
+ old = ia->ri_id;
ia->ri_id = id;
+ write_unlock(&ia->ri_qplock);
+
+ rdma_destroy_qp(old);
+ rdma_destroy_id(old);
} else {
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -974,13 +973,12 @@ out:
* This call is not reentrant, and must not be made in parallel
* on the same endpoint.
*/
-int
+void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
int rc;
- rpcrdma_clean_cq(ep->rep_attr.recv_cq);
- rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
@@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
+}
+
+static int
+rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+ int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_DATA_SEGS,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+
+ r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->r.fmr)) {
+ rc = PTR_ERR(r->r.fmr);
+ dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
+ __func__, rc);
+ goto out_free;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+ return 0;
+
+out_free:
+ kfree(r);
+ return rc;
+}
+
+static int
+rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *r;
+ int i, rc;
+
+ i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+ dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
+
+ while (i--) {
+ r = kzalloc(sizeof(*r), GFP_KERNEL);
+ if (r == NULL)
+ return -ENOMEM;
+ f = &r->r.frmr;
+
+ f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_mr)) {
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr "
+ "failed %i\n", __func__, rc);
+ goto out_free;
+ }
+
+ f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(f->fr_pgl)) {
+ rc = PTR_ERR(f->fr_pgl);
+ dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+
+ ib_dereg_mr(f->fr_mr);
+ goto out_free;
+ }
+
+ list_add(&r->mw_list, &buf->rb_mws);
+ list_add(&r->mw_all, &buf->rb_all);
+ }
+
+ return 0;
+
+out_free:
+ kfree(r);
return rc;
}
-/*
- * Initialize buffer memory
- */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
@@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
char *p;
size_t len, rlen, wlen;
int i, rc;
- struct rpcrdma_mw *r;
buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
@@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
* 2. arrays of struct rpcrdma_req to fill in pointers
* 3. array of struct rpcrdma_rep for replies
* 4. padding, if any
- * 5. mw's, fmr's or frmr's, if any
* Send/recv buffers in req/rep need to be registered
*/
-
len = buf->rb_max_requests *
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
len += cdata->padding;
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
- case RPCRDMA_MTHCAFMR:
- /* TBD we are perhaps overallocating here */
- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
- sizeof(struct rpcrdma_mw);
- break;
- default:
- break;
- }
- /* allocate 1, 4 and 5 in one shot */
p = kzalloc(len, GFP_KERNEL);
if (p == NULL) {
dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
@@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
p += cdata->padding;
INIT_LIST_HEAD(&buf->rb_mws);
- r = (struct rpcrdma_mw *)p;
+ INIT_LIST_HEAD(&buf->rb_all);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
- for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
- r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_mr)) {
- rc = PTR_ERR(r->r.frmr.fr_mr);
- dprintk("RPC: %s: ib_alloc_fast_reg_mr"
- " failed %i\n", __func__, rc);
- goto out;
- }
- r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
- ia->ri_id->device,
- ia->ri_max_frmr_depth);
- if (IS_ERR(r->r.frmr.fr_pgl)) {
- rc = PTR_ERR(r->r.frmr.fr_pgl);
- dprintk("RPC: %s: "
- "ib_alloc_fast_reg_page_list "
- "failed %i\n", __func__, rc);
-
- ib_dereg_mr(r->r.frmr.fr_mr);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
+ rc = rpcrdma_init_frmrs(ia, buf);
+ if (rc)
+ goto out;
break;
case RPCRDMA_MTHCAFMR:
- /* TBD we are perhaps overallocating here */
- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
- static struct ib_fmr_attr fa =
- { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
- r->r.fmr = ib_alloc_fmr(ia->ri_pd,
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
- &fa);
- if (IS_ERR(r->r.fmr)) {
- rc = PTR_ERR(r->r.fmr);
- dprintk("RPC: %s: ib_alloc_fmr"
- " failed %i\n", __func__, rc);
- goto out;
- }
- list_add(&r->mw_list, &buf->rb_mws);
- ++r;
- }
+ rc = rpcrdma_init_fmrs(ia, buf);
+ if (rc)
+ goto out;
break;
default:
break;
@@ -1176,24 +1204,57 @@ out:
return rc;
}
-/*
- * Unregister and destroy buffer memory. Need to deal with
- * partial initialization, so it's callable from failed create.
- * Must be called before destroying endpoint, as registrations
- * reference it.
- */
+static void
+rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ list_del(&r->mw_list);
+
+ rc = ib_dealloc_fmr(r->r.fmr);
+ if (rc)
+ dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
+ __func__, rc);
+
+ kfree(r);
+ }
+}
+
+static void
+rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int rc;
+
+ while (!list_empty(&buf->rb_all)) {
+ r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&r->mw_all);
+ list_del(&r->mw_list);
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+ kfree(r);
+ }
+}
+
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
- int rc, i;
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- struct rpcrdma_mw *r;
+ int i;
/* clean up in reverse order from create
* 1. recv mr memory (mr free, then kfree)
* 2. send mr memory (mr free, then kfree)
- * 3. padding (if any) [moved to rpcrdma_ep_destroy]
- * 4. arrays
+ * 3. MWs
*/
dprintk("RPC: %s: entering\n", __func__);
@@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
}
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ rpcrdma_destroy_frmrs(buf);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ rpcrdma_destroy_fmrs(buf);
+ break;
+ default:
+ break;
+ }
+
+ kfree(buf->rb_pool);
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each. FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ia, struct rpcrdma_xprt, rx_ia);
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct list_head *pos;
+ struct rpcrdma_mw *r;
+ int rc;
+
+ list_for_each(pos, &buf->rb_all) {
+ r = list_entry(pos, struct rpcrdma_mw, mw_all);
+
+ if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+ continue;
+
+ rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ if (rc)
+ dprintk("RPC: %s: ib_dereg_mr failed %i\n",
+ __func__, rc);
+ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+
+ r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_mr)) {
+ rc = PTR_ERR(r->r.frmr.fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ continue;
+ }
+ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
+ ia->ri_id->device,
+ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_pgl)) {
+ rc = PTR_ERR(r->r.frmr.fr_pgl);
+ dprintk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
+
+ ib_dereg_mr(r->r.frmr.fr_mr);
+ continue;
+ }
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+ }
+}
+
+/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
+ * some req segments uninitialized.
+ */
+static void
+rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
+{
+ if (*mw) {
+ list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
+ *mw = NULL;
+ }
+}
+
+/* Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+ */
+static void
+rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mr_seg *seg = req->rl_segments;
+ struct rpcrdma_mr_seg *seg1 = seg;
+ int i;
+
+ for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
+ rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
+ rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
+}
+
+static void
+rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ buf->rb_send_bufs[--buf->rb_send_index] = req;
+ req->rl_niovs = 0;
+ if (req->rl_reply) {
+ buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
+ req->rl_reply->rr_func = NULL;
+ req->rl_reply = NULL;
+ }
+}
+
+/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+ * Redo only the ib_post_send().
+ */
+static void
+rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(ia, struct rpcrdma_xprt, rx_ia);
+ struct ib_send_wr invalidate_wr, *bad_wr;
+ int rc;
+
+ dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
+
+ /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
+ r->r.frmr.fr_state = FRMR_IS_INVALID;
+
+ memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+ invalidate_wr.wr_id = (unsigned long)(void *)r;
+ invalidate_wr.opcode = IB_WR_LOCAL_INV;
+ invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
+ DECR_CQCOUNT(&r_xprt->rx_ep);
+
+ dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
+ __func__, r, r->r.frmr.fr_mr->rkey);
+
+ read_lock(&ia->ri_qplock);
+ rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ read_unlock(&ia->ri_qplock);
+ if (rc) {
+ /* Force rpcrdma_buffer_get() to retry */
+ r->r.frmr.fr_state = FRMR_IS_STALE;
+ dprintk("RPC: %s: ib_post_send failed, %i\n",
+ __func__, rc);
+ }
+}
+
+static void
+rpcrdma_retry_flushed_linv(struct list_head *stale,
+ struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct list_head *pos;
+ struct rpcrdma_mw *r;
+ unsigned long flags;
+
+ list_for_each(pos, stale) {
+ r = list_entry(pos, struct rpcrdma_mw, mw_list);
+ rpcrdma_retry_local_inv(r, ia);
+ }
+
+ spin_lock_irqsave(&buf->rb_lock, flags);
+ list_splice_tail(stale, &buf->rb_mws);
+ spin_unlock_irqrestore(&buf->rb_lock, flags);
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
+ struct list_head *stale)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
while (!list_empty(&buf->rb_mws)) {
r = list_entry(buf->rb_mws.next,
- struct rpcrdma_mw, mw_list);
+ struct rpcrdma_mw, mw_list);
list_del(&r->mw_list);
- switch (ia->ri_memreg_strategy) {
- case RPCRDMA_FRMR:
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dereg_mr"
- " failed %i\n",
- __func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
- break;
- case RPCRDMA_MTHCAFMR:
- rc = ib_dealloc_fmr(r->r.fmr);
- if (rc)
- dprintk("RPC: %s:"
- " ib_dealloc_fmr"
- " failed %i\n",
- __func__, rc);
- break;
- default:
- break;
+ if (r->r.frmr.fr_state == FRMR_IS_STALE) {
+ list_add(&r->mw_list, stale);
+ continue;
}
+ req->rl_segments[i].mr_chunk.rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
}
- kfree(buf->rb_pool);
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
+}
+
+static struct rpcrdma_req *
+rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_mw *r;
+ int i;
+
+ i = RPCRDMA_MAX_SEGS - 1;
+ while (!list_empty(&buf->rb_mws)) {
+ r = list_entry(buf->rb_mws.next,
+ struct rpcrdma_mw, mw_list);
+ list_del(&r->mw_list);
+ req->rl_segments[i].mr_chunk.rl_mw = r;
+ if (unlikely(i-- == 0))
+ return req; /* Success */
+ }
+
+ /* Not enough entries on rb_mws for this req */
+ rpcrdma_buffer_put_sendbuf(req, buf);
+ rpcrdma_buffer_put_mrs(req, buf);
+ return NULL;
}
/*
@@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
+ struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
+ struct list_head stale;
struct rpcrdma_req *req;
unsigned long flags;
- int i;
- struct rpcrdma_mw *r;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
@@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
- if (!list_empty(&buffers->rb_mws)) {
- i = RPCRDMA_MAX_SEGS - 1;
- do {
- r = list_entry(buffers->rb_mws.next,
- struct rpcrdma_mw, mw_list);
- list_del(&r->mw_list);
- req->rl_segments[i].mr_chunk.rl_mw = r;
- } while (--i >= 0);
+
+ INIT_LIST_HEAD(&stale);
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
+ break;
+ case RPCRDMA_MTHCAFMR:
+ req = rpcrdma_buffer_get_fmrs(req, buffers);
+ break;
+ default:
+ break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ if (!list_empty(&stale))
+ rpcrdma_retry_flushed_linv(&stale, buffers);
return req;
}
@@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
- int i;
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
- buffers->rb_send_bufs[--buffers->rb_send_index] = req;
- req->rl_niovs = 0;
- if (req->rl_reply) {
- buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
- req->rl_reply->rr_func = NULL;
- req->rl_reply = NULL;
- }
+ rpcrdma_buffer_put_sendbuf(req, buffers);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
- /*
- * Cycle mw's back in reverse order, and "spin" them.
- * This delays and scrambles reuse as much as possible.
- */
- i = 1;
- do {
- struct rpcrdma_mw **mw;
- mw = &req->rl_segments[i].mr_chunk.rl_mw;
- list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
- *mw = NULL;
- } while (++i < RPCRDMA_MAX_SEGS);
- list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
- &buffers->rb_mws);
- req->rl_segments[0].mr_chunk.rl_mw = NULL;
+ rpcrdma_buffer_put_mrs(req, buffers);
break;
default:
break;
@@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
*/
iov->addr = ib_dma_map_single(ia->ri_id->device,
va, len, DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
+ return -ENOMEM;
+
iov->length = len;
if (ia->ri_have_dma_lkey) {
@@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_mr_seg *seg1 = seg;
- struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
-
+ struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
+ struct rpcrdma_frmr *frmr = &mw->r.frmr;
+ struct ib_mr *mr = frmr->fr_mr;
+ struct ib_send_wr fastreg_wr, *bad_wr;
u8 key;
int len, pageoff;
int i, rc;
@@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
rpcrdma_map_one(ia, seg, writing);
pa = seg->mr_dma;
for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
- page_list[page_no++] = pa;
+ frmr->fr_pgl->page_list[page_no++] = pa;
pa += PAGE_SIZE;
}
len += seg->mr_len;
@@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
break;
}
dprintk("RPC: %s: Using frmr %p to map %d segments\n",
- __func__, seg1->mr_chunk.rl_mw, i);
-
- if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
- dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
- __func__,
- seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
- /* Invalidate before using. */
- memset(&invalidate_wr, 0, sizeof invalidate_wr);
- invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
- invalidate_wr.next = &frmr_wr;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.send_flags = IB_SEND_SIGNALED;
- invalidate_wr.ex.invalidate_rkey =
- seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
- DECR_CQCOUNT(&r_xprt->rx_ep);
- post_wr = &invalidate_wr;
- } else
- post_wr = &frmr_wr;
-
- /* Prepare FRMR WR */
- memset(&frmr_wr, 0, sizeof frmr_wr);
- frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
- frmr_wr.opcode = IB_WR_FAST_REG_MR;
- frmr_wr.send_flags = IB_SEND_SIGNALED;
- frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
- frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
- frmr_wr.wr.fast_reg.page_list_len = page_no;
- frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
- if (frmr_wr.wr.fast_reg.length < len) {
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
- return -EIO;
+ __func__, mw, i);
+
+ frmr->fr_state = FRMR_IS_VALID;
+
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.wr_id = (unsigned long)(void *)mw;
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
+ fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+ fastreg_wr.wr.fast_reg.page_list_len = page_no;
+ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+ if (fastreg_wr.wr.fast_reg.length < len) {
+ rc = -EIO;
+ goto out_err;
}
/* Bump the key */
- key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
- ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+ key = (u8)(mr->rkey & 0x000000FF);
+ ib_update_fast_reg_key(mr, ++key);
- frmr_wr.wr.fast_reg.access_flags = (writing ?
+ fastreg_wr.wr.fast_reg.access_flags = (writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ);
- frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ fastreg_wr.wr.fast_reg.rkey = mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
-
+ rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
if (rc) {
dprintk("RPC: %s: failed ib_post_send for register,"
" status %i\n", __func__, rc);
- while (i--)
- rpcrdma_unmap_one(ia, --seg);
+ ib_update_fast_reg_key(mr, --key);
+ goto out_err;
} else {
- seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
+ seg1->mr_rkey = mr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
}
*nsegs = i;
+ return 0;
+out_err:
+ frmr->fr_state = FRMR_IS_INVALID;
+ while (i--)
+ rpcrdma_unmap_one(ia, --seg);
return rc;
}
@@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
struct ib_send_wr invalidate_wr, *bad_wr;
int rc;
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia, seg++);
+ seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
memset(&invalidate_wr, 0, sizeof invalidate_wr);
invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.send_flags = IB_SEND_SIGNALED;
invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
+ read_lock(&ia->ri_qplock);
+ while (seg1->mr_nsegs--)
+ rpcrdma_unmap_one(ia, seg++);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
- if (rc)
+ read_unlock(&ia->ri_qplock);
+ if (rc) {
+ /* Force rpcrdma_buffer_get() to retry */
+ seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: failed ib_post_send for invalidate,"
" status %i\n", __func__, rc);
+ }
return rc;
}
@@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
rc = ib_unmap_fmr(&l);
+ read_lock(&ia->ri_qplock);
while (seg1->mr_nsegs--)
rpcrdma_unmap_one(ia, seg++);
+ read_unlock(&ia->ri_qplock);
if (rc)
dprintk("RPC: %s: failed ib_unmap_fmr,"
" status %i\n", __func__, rc);
@@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
rpcrdma_map_one(ia, seg, writing);
seg->mr_rkey = ia->ri_bind_mem->rkey;
@@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
seg->mr_nsegs = 1;
nsegs = 1;
break;
-#endif
/* Registration using frmr registration */
case RPCRDMA_FRMR:
@@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
switch (ia->ri_memreg_strategy) {
-#if RPCRDMA_PERSISTENT_REGISTRATION
case RPCRDMA_ALLPHYSICAL:
+ read_lock(&ia->ri_qplock);
rpcrdma_unmap_one(ia, seg);
+ read_unlock(&ia->ri_qplock);
break;
-#endif
case RPCRDMA_FRMR:
rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
@@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
rc);
return rc;
}
+
+/* Physical mapping means one Read/Write list entry per-page.
+ * All list entries must fit within an inline buffer
+ *
+ * NB: The server must return a Write list for NFS READ,
+ * which has the same constraint. Factor in the inline
+ * rsize as well.
+ */
+static size_t
+rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+ unsigned int inline_size, pages;
+
+ inline_size = min_t(unsigned int,
+ cdata->inline_wsize, cdata->inline_rsize);
+ inline_size -= RPCRDMA_HDRLEN_MIN;
+ pages = inline_size / sizeof(struct rpcrdma_segment);
+ return pages << PAGE_SHIFT;
+}
+
+static size_t
+rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+}
+
+size_t
+rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
+{
+ size_t result;
+
+ switch (r_xprt->rx_ia.ri_memreg_strategy) {
+ case RPCRDMA_ALLPHYSICAL:
+ result = rpcrdma_physical_max_payload(r_xprt);
+ break;
+ default:
+ result = rpcrdma_mr_max_payload(r_xprt);
+ }
+ return result;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 89e7cd47970..c419498b8f4 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -59,6 +59,7 @@
* Interface Adapter -- one per transport instance
*/
struct rpcrdma_ia {
+ rwlock_t ri_qplock;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct ib_mr *ri_bind_mem;
@@ -98,6 +99,14 @@ struct rpcrdma_ep {
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -137,6 +146,40 @@ struct rpcrdma_rep {
};
/*
+ * struct rpcrdma_mw - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (ie, not pre-registered).
+ *
+ * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
+ * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
+ * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
+ * track of registration metadata while each RPC is pending.
+ * rpcrdma_deregister_external() uses this metadata to unmap and
+ * release these resources when an RPC is complete.
+ */
+enum rpcrdma_frmr_state {
+ FRMR_IS_INVALID, /* ready to be used */
+ FRMR_IS_VALID, /* in use */
+ FRMR_IS_STALE, /* failed completion */
+};
+
+struct rpcrdma_frmr {
+ struct ib_fast_reg_page_list *fr_pgl;
+ struct ib_mr *fr_mr;
+ enum rpcrdma_frmr_state fr_state;
+};
+
+struct rpcrdma_mw {
+ union {
+ struct ib_fmr *fmr;
+ struct rpcrdma_frmr frmr;
+ } r;
+ struct list_head mw_list;
+ struct list_head mw_all;
+};
+
+/*
* struct rpcrdma_req -- structure central to the request/reply sequence.
*
* N of these are associated with a transport instance, and stored in
@@ -163,17 +206,7 @@ struct rpcrdma_rep {
struct rpcrdma_mr_seg { /* chunk descriptors */
union { /* chunk memory handles */
struct ib_mr *rl_mr; /* if registered directly */
- struct rpcrdma_mw { /* if registered from region */
- union {
- struct ib_fmr *fmr;
- struct {
- struct ib_fast_reg_page_list *fr_pgl;
- struct ib_mr *fr_mr;
- enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
- } frmr;
- } r;
- struct list_head mw_list;
- } *rl_mw;
+ struct rpcrdma_mw *rl_mw; /* if registered from region */
} mr_chunk;
u64 mr_base; /* registration result */
u32 mr_rkey; /* registration result */
@@ -191,6 +224,7 @@ struct rpcrdma_req {
unsigned int rl_niovs; /* 0, 2 or 4 */
unsigned int rl_nchunks; /* non-zero if chunks */
unsigned int rl_connect_cookie; /* retry detection */
+ enum rpcrdma_chunktype rl_rtype, rl_wtype;
struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
@@ -214,6 +248,7 @@ struct rpcrdma_buffer {
atomic_t rb_credits; /* most recent server credits */
int rb_max_requests;/* client max requests */
struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
+ struct list_head rb_all;
int rb_send_index;
struct rpcrdma_req **rb_send_bufs;
int rb_recv_index;
@@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
struct rpcrdma_req *);
@@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
*/
+ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
int rpcrdma_marshal_req(struct rpc_rqst *);
+size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index be8bbd5d65e..43cd89eacfa 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -594,6 +594,7 @@ static int xs_local_send_request(struct rpc_task *task)
}
switch (status) {
+ case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -661,6 +662,7 @@ static int xs_udp_send_request(struct rpc_task *task)
dprintk("RPC: sendmsg returned unrecognized error %d\n",
-status);
case -ENETUNREACH:
+ case -ENOBUFS:
case -EPIPE:
case -ECONNREFUSED:
/* When the server has died, an ICMP port unreachable message
@@ -758,6 +760,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
status = -ENOTCONN;
/* Should we call xs_close() here? */
break;
+ case -ENOBUFS:
case -EAGAIN:
status = xs_nospace(task);
break;
@@ -1946,6 +1949,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
dprintk("RPC: xprt %p connected to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
xprt_set_connected(xprt);
+ case -ENOBUFS:
break;
case -ENOENT:
dprintk("RPC: xprt %p: socket %s does not exist\n",
@@ -2281,6 +2285,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
case -ECONNREFUSED:
case -ECONNRESET:
case -ENETUNREACH:
+ case -ENOBUFS:
/* retry with existing socket, after a delay */
goto out;
}
@@ -3054,12 +3059,12 @@ static int param_set_uint_minmax(const char *val,
const struct kernel_param *kp,
unsigned int min, unsigned int max)
{
- unsigned long num;
+ unsigned int num;
int ret;
if (!val)
return -EINVAL;
- ret = strict_strtoul(val, 0, &num);
+ ret = kstrtouint(val, 0, &num);
if (ret == -EINVAL || num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;