The main issue to be dealt with is a deadlock that can occur due to
an ABBA-type of situation between layoutget and layoutreturn.
Trond Myklebust (3):
pNFS: Don't clear the layout return info if there are segments to
return
pNFS: Fix a deadlock when coalescing writes and returning the layout
pNFS: Fix a typo in pnfs_generic_alloc_ds_commits
fs/nfs/pnfs.c | 10 +++++++---
fs/nfs/pnfs_nfs.c | 2 +-
2 files changed, 8 insertions(+), 4 deletions(-)
--
2.9.3
In pnfs_clear_layoutreturn_info, ensure that we don't clear the layout
return info if there are new segments queued for return due to, for
instance, a race between a LAYOUTRETURN and a failed I/O attempt.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 140ecd7d350f..cea1e838efae 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
+ struct pnfs_layout_segment *lseg;
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
}
static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
@@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg, *next;
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_clear_layoutreturn_info(lo);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
--
2.9.3
Add an argument to struct rpc_create_args that allows the specification
of how many transport connections you want to set up to the server.
Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/clnt.h | 1 +
net/sunrpc/clnt.c | 17 ++++++++++++++++-
net/sunrpc/xprtmultipath.c | 3 +--
3 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6095ecba0dde..8c3cb38a385b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -120,6 +120,7 @@ struct rpc_create_args {
u32 prognumber; /* overrides program->number */
u32 version;
rpc_authflavor_t authflavor;
+ u32 nconnect;
unsigned long flags;
char *client_name;
struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 673046c64e48..0ff97288b43f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -522,6 +522,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
.bc_xprt = args->bc_xprt,
};
char servername[48];
+ struct rpc_clnt *clnt;
+ int i;
if (args->bc_xprt) {
WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -584,7 +586,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
- return rpc_create_xprt(args, xprt);
+ clnt = rpc_create_xprt(args, xprt);
+ if (IS_ERR(clnt) || args->nconnect <= 1)
+ return clnt;
+
+ for (i = 0; i < args->nconnect - 1; i++) {
+ if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+ break;
+ }
+ return clnt;
}
EXPORT_SYMBOL_GPL(rpc_create);
@@ -2605,6 +2615,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
return -ENOMEM;
data->xps = xprt_switch_get(xps);
data->xprt = xprt_get(xprt);
+ if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+ rpc_cb_add_xprt_release(data);
+ goto success;
+ }
cred = authnull_ops.lookup_cred(NULL, NULL, 0);
task = rpc_call_null_helper(clnt, xprt, cred,
@@ -2614,6 +2628,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+success:
return 1;
}
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 95064d510ce6..486819d0c58b 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -51,8 +51,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
- !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
--
2.9.3
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/super.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7eb48934dc79..0e07a6684235 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -673,6 +673,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
seq_printf(m, ",proto=%s",
rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
rcu_read_unlock();
+ if (clp->cl_nconnect > 0)
+ seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
if (version == 4) {
if (nfss->port != NFS_PORT)
seq_printf(m, ",port=%u", nfss->port);
--
2.9.3
If the user specifies the -onconn=<number> mount option, and the transport
protocol is TCP, then set up <number> connections to the server. The
connections will all go to the same IP address.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/client.c | 2 ++
fs/nfs/internal.h | 1 +
fs/nfs/nfs4client.c | 10 ++++++++--
include/linux/nfs_fs_sb.h | 1 +
4 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e0302101e18a..c5b0f3e270a3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -180,6 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_rpcclient = ERR_PTR(-EINVAL);
clp->cl_proto = cl_init->proto;
+ clp->cl_nconnect = cl_init->nconnect;
clp->cl_net = get_net(cl_init->net);
cred = rpc_lookup_machine_cred("*");
@@ -488,6 +489,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
struct rpc_create_args args = {
.net = clp->cl_net,
.protocol = clp->cl_proto,
+ .nconnect = clp->cl_nconnect,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
.timeout = cl_init->timeparms,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31757a742e9b..abe5d3934eaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -77,6 +77,7 @@ struct nfs_client_initdata {
struct nfs_subversion *nfs_mod;
int proto;
u32 minorversion;
+ unsigned int nconnect;
struct net *net;
const struct rpc_timeout *timeparms;
};
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 692a7a8bfc7a..c9b10b7829f0 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -834,7 +834,8 @@ static int nfs4_set_client(struct nfs_server *server,
const size_t addrlen,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
- u32 minorversion, struct net *net)
+ u32 minorversion, unsigned int nconnect,
+ struct net *net)
{
struct nfs_client_initdata cl_init = {
.hostname = hostname,
@@ -849,6 +850,8 @@ static int nfs4_set_client(struct nfs_server *server,
};
struct nfs_client *clp;
+ if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = nconnect;
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
if (server->options & NFS_OPTION_MIGRATION)
@@ -1040,6 +1043,7 @@ static int nfs4_init_server(struct nfs_server *server,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
+ data->nfs_server.nconnect,
data->net);
if (error < 0)
return error;
@@ -1124,6 +1128,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
rpc_protocol(parent_server->client),
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
parent_client->cl_net);
if (error < 0)
goto error;
@@ -1215,7 +1220,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
clp->cl_proto, clnt->cl_timeout,
- clp->cl_minorversion, net);
+ clp->cl_minorversion,
+ clp->cl_nconnect, net);
nfs_put_client(clp);
if (error != 0) {
nfs_server_insert_lists(server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2a70f34dffe8..b7e6b94d1246 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -55,6 +55,7 @@ struct nfs_client {
struct nfs_subversion * cl_nfs_mod; /* pointer to nfs version module */
u32 cl_minorversion;/* NFSv4 minorversion */
+ unsigned int cl_nconnect; /* Number of connections */
struct rpc_cred *cl_machine_cred;
#if IS_ENABLED(CONFIG_NFS_V4)
--
2.9.3
Allow the user to specify that the client should use multiple connections
to the server. For the moment, this functionality will be limited to
TCP and to NFSv4.x (x>0).
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/internal.h | 1 +
fs/nfs/super.c | 10 ++++++++++
2 files changed, 11 insertions(+)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31b26cf1b476..31757a742e9b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -117,6 +117,7 @@ struct nfs_parsed_mount_data {
char *export_path;
int port;
unsigned short protocol;
+ unsigned short nconnect;
} nfs_server;
struct security_mnt_opts lsm_opts;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..7eb48934dc79 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -76,6 +76,8 @@
#define NFS_DEFAULT_VERSION 2
#endif
+#define NFS_MAX_CONNECTIONS 16
+
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
@@ -107,6 +109,7 @@ enum {
Opt_nfsvers,
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
+ Opt_nconnect,
Opt_lookupcache,
Opt_fscache_uniq,
Opt_local_lock,
@@ -179,6 +182,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mounthost, "mounthost=%s" },
{ Opt_mountaddr, "mountaddr=%s" },
+ { Opt_nconnect, "nconnect=%s" },
+
{ Opt_lookupcache, "lookupcache=%s" },
{ Opt_fscache_uniq, "fsc=%s" },
{ Opt_local_lock, "local_lock=%s" },
@@ -1544,6 +1549,11 @@ static int nfs_parse_mount_options(char *raw,
if (mnt->mount_server.addrlen == 0)
goto out_invalid_address;
break;
+ case Opt_nconnect:
+ if (nfs_get_option_ul_bound(args, &option, 1, NFS_MAX_CONNECTIONS))
+ goto out_invalid_value;
+ mnt->nfs_server.nconnect = option;
+ break;
case Opt_lookupcache:
string = match_strdup(args);
if (string == NULL)
--
2.9.3
If the user specifies -onconn=<number> mount option, and the transport
protocol is TCP, then set up <number> connections to the pNFS data server
as well. The connections will all go to the same IP address.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/nfs3client.c | 3 +++
fs/nfs/nfs4client.c | 3 +++
2 files changed, 6 insertions(+)
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 7879f2a0fcfd..8c624c74ddbe 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -100,6 +100,9 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = mds_clp->cl_nconnect;
+
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index c9b10b7829f0..bfea1b232dd2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -912,6 +912,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = mds_clp->cl_nconnect;
+
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
--
2.9.3
Consider the following deadlock:
Process P1 Process P2 Process P3
========== ========== ==========
lock_page(page)
lseg = pnfs_update_layout(inode)
lo = NFS_I(inode)->layout
pnfs_error_mark_layout_for_return(lo)
lock_page(page)
lseg = pnfs_update_layout(inode)
In this scenario,
- P1 has declared the layout to be in error, but P2 holds a reference to
a layout segment on that inode, so the layoutreturn is deferred.
- P2 is waiting for a page lock held by P3.
- P3 is asking for a new layout segment, but is blocked waiting
for the layoutreturn.
The fix is to ensure that pnfs_error_mark_layout_for_return() does
not set the NFS_LAYOUT_RETURN flag, which blocks P3. Instead, we allow
the latter to call LAYOUTGET so that it can make progress and unblock
P2.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cea1e838efae..adc6ec28d4b5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2063,8 +2063,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
return;
}
pnfs_set_plh_return_info(lo, range.iomode, 0);
- /* Block LAYOUTGET */
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
--
2.9.3
If the layout segment is invalid, we want to just resend the remaining
writes.
Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs_nfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7697ac0ff81a..ae600ab1a646 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -223,7 +223,7 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
*/
if (!pnfs_is_valid_lseg(bucket->clseg) &&
!test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags))
- continue;
+ break;
data = nfs_commitdata_alloc(false);
if (!data)
break;
--
2.9.3