2017-05-02 16:38:16

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 0/3] Fix up a couple of issues around layout handling

The main issue to be dealt with is a deadlock that can occur due to
an ABBA-type situation between LAYOUTGET and LAYOUTRETURN.

Trond Myklebust (3):
pNFS: Don't clear the layout return info if there are segments to
return
pNFS: Fix a deadlock when coalescing writes and returning the layout
pNFS: Fix a typo in pnfs_generic_alloc_ds_commits

fs/nfs/pnfs.c | 10 +++++++---
fs/nfs/pnfs_nfs.c | 2 +-
2 files changed, 8 insertions(+), 4 deletions(-)

--
2.9.3



2017-05-02 16:38:17

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 1/3] pNFS: Don't clear the layout return info if there are segments to return

In pnfs_clear_layoutreturn_info, ensure that we don't clear the layout
return info if there are new segments queued for return due to, for
instance, a race between a LAYOUTRETURN and a failed I/O attempt.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 140ecd7d350f..cea1e838efae 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
+ struct pnfs_layout_segment *lseg;
lo->plh_return_iomode = 0;
lo->plh_return_seq = 0;
clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
}

static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
@@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg, *next;

set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_clear_layoutreturn_info(lo);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
--
2.9.3


2017-05-02 16:38:16

by Trond Myklebust

[permalink] [raw]
Subject: [RFC PATCH 1/5] SUNRPC: Allow creation of RPC clients with multiple connections

Add an argument to struct rpc_create_args that allows the specification
of how many transport connections you want to set up to the server.

Signed-off-by: Trond Myklebust <[email protected]>
---
include/linux/sunrpc/clnt.h | 1 +
net/sunrpc/clnt.c | 17 ++++++++++++++++-
net/sunrpc/xprtmultipath.c | 3 +--
3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 6095ecba0dde..8c3cb38a385b 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -120,6 +120,7 @@ struct rpc_create_args {
u32 prognumber; /* overrides program->number */
u32 version;
rpc_authflavor_t authflavor;
+ u32 nconnect;
unsigned long flags;
char *client_name;
struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 673046c64e48..0ff97288b43f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -522,6 +522,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
.bc_xprt = args->bc_xprt,
};
char servername[48];
+ struct rpc_clnt *clnt;
+ int i;

if (args->bc_xprt) {
WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -584,7 +586,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;

- return rpc_create_xprt(args, xprt);
+ clnt = rpc_create_xprt(args, xprt);
+ if (IS_ERR(clnt) || args->nconnect <= 1)
+ return clnt;
+
+ for (i = 0; i < args->nconnect - 1; i++) {
+ if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+ break;
+ }
+ return clnt;
}
EXPORT_SYMBOL_GPL(rpc_create);

@@ -2605,6 +2615,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
return -ENOMEM;
data->xps = xprt_switch_get(xps);
data->xprt = xprt_get(xprt);
+ if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+ rpc_cb_add_xprt_release(data);
+ goto success;
+ }

cred = authnull_ops.lookup_cred(NULL, NULL, 0);
task = rpc_call_null_helper(clnt, xprt, cred,
@@ -2614,6 +2628,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+success:
return 1;
}
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 95064d510ce6..486819d0c58b 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -51,8 +51,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
- !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
--
2.9.3


2017-05-02 16:38:28

by Trond Myklebust

[permalink] [raw]
Subject: [RFC PATCH 5/5] NFS: Display the "nconnect" mount option if it is set.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/super.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7eb48934dc79..0e07a6684235 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -673,6 +673,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
seq_printf(m, ",proto=%s",
rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID));
rcu_read_unlock();
+ if (clp->cl_nconnect > 0)
+ seq_printf(m, ",nconnect=%u", clp->cl_nconnect);
if (version == 4) {
if (nfss->port != NFS_PORT)
seq_printf(m, ",port=%u", nfss->port);
--
2.9.3


2017-05-02 16:38:25

by Trond Myklebust

[permalink] [raw]
Subject: [RFC PATCH 3/5] NFSv4: Allow multiple connections to NFSv4.x (x>0) servers

If the user specifies the -o nconnect=<number> mount option, and the
transport protocol is TCP, then set up <number> connections to the server.
The connections will all go to the same IP address.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/client.c | 2 ++
fs/nfs/internal.h | 1 +
fs/nfs/nfs4client.c | 10 ++++++++--
include/linux/nfs_fs_sb.h | 1 +
4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e0302101e18a..c5b0f3e270a3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -180,6 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_rpcclient = ERR_PTR(-EINVAL);

clp->cl_proto = cl_init->proto;
+ clp->cl_nconnect = cl_init->nconnect;
clp->cl_net = get_net(cl_init->net);

cred = rpc_lookup_machine_cred("*");
@@ -488,6 +489,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
struct rpc_create_args args = {
.net = clp->cl_net,
.protocol = clp->cl_proto,
+ .nconnect = clp->cl_nconnect,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
.timeout = cl_init->timeparms,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31757a742e9b..abe5d3934eaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -77,6 +77,7 @@ struct nfs_client_initdata {
struct nfs_subversion *nfs_mod;
int proto;
u32 minorversion;
+ unsigned int nconnect;
struct net *net;
const struct rpc_timeout *timeparms;
};
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 692a7a8bfc7a..c9b10b7829f0 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -834,7 +834,8 @@ static int nfs4_set_client(struct nfs_server *server,
const size_t addrlen,
const char *ip_addr,
int proto, const struct rpc_timeout *timeparms,
- u32 minorversion, struct net *net)
+ u32 minorversion, unsigned int nconnect,
+ struct net *net)
{
struct nfs_client_initdata cl_init = {
.hostname = hostname,
@@ -849,6 +850,8 @@ static int nfs4_set_client(struct nfs_server *server,
};
struct nfs_client *clp;

+ if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = nconnect;
if (server->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
if (server->options & NFS_OPTION_MIGRATION)
@@ -1040,6 +1043,7 @@ static int nfs4_init_server(struct nfs_server *server,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
+ data->nfs_server.nconnect,
data->net);
if (error < 0)
return error;
@@ -1124,6 +1128,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
rpc_protocol(parent_server->client),
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
+ parent_client->cl_nconnect,
parent_client->cl_net);
if (error < 0)
goto error;
@@ -1215,7 +1220,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
clp->cl_proto, clnt->cl_timeout,
- clp->cl_minorversion, net);
+ clp->cl_minorversion,
+ clp->cl_nconnect, net);
nfs_put_client(clp);
if (error != 0) {
nfs_server_insert_lists(server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 2a70f34dffe8..b7e6b94d1246 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -55,6 +55,7 @@ struct nfs_client {
struct nfs_subversion * cl_nfs_mod; /* pointer to nfs version module */

u32 cl_minorversion;/* NFSv4 minorversion */
+ unsigned int cl_nconnect; /* Number of connections */
struct rpc_cred *cl_machine_cred;

#if IS_ENABLED(CONFIG_NFS_V4)
--
2.9.3


2017-05-02 16:38:18

by Trond Myklebust

[permalink] [raw]
Subject: [RFC PATCH 2/5] NFS: Add a mount option to specify number of TCP connections to use

Allow the user to specify that the client should use multiple connections
to the server. For the moment, this functionality will be limited to
TCP and to NFSv4.x (x>0).

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/internal.h | 1 +
fs/nfs/super.c | 10 ++++++++++
2 files changed, 11 insertions(+)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 31b26cf1b476..31757a742e9b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -117,6 +117,7 @@ struct nfs_parsed_mount_data {
char *export_path;
int port;
unsigned short protocol;
+ unsigned short nconnect;
} nfs_server;

struct security_mnt_opts lsm_opts;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..7eb48934dc79 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -76,6 +76,8 @@
#define NFS_DEFAULT_VERSION 2
#endif

+#define NFS_MAX_CONNECTIONS 16
+
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
@@ -107,6 +109,7 @@ enum {
Opt_nfsvers,
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
+ Opt_nconnect,
Opt_lookupcache,
Opt_fscache_uniq,
Opt_local_lock,
@@ -179,6 +182,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mounthost, "mounthost=%s" },
{ Opt_mountaddr, "mountaddr=%s" },

+ { Opt_nconnect, "nconnect=%s" },
+
{ Opt_lookupcache, "lookupcache=%s" },
{ Opt_fscache_uniq, "fsc=%s" },
{ Opt_local_lock, "local_lock=%s" },
@@ -1544,6 +1549,11 @@ static int nfs_parse_mount_options(char *raw,
if (mnt->mount_server.addrlen == 0)
goto out_invalid_address;
break;
+ case Opt_nconnect:
+ if (nfs_get_option_ul_bound(args, &option, 1, NFS_MAX_CONNECTIONS))
+ goto out_invalid_value;
+ mnt->nfs_server.nconnect = option;
+ break;
case Opt_lookupcache:
string = match_strdup(args);
if (string == NULL)
--
2.9.3


2017-05-02 16:38:22

by Trond Myklebust

[permalink] [raw]
Subject: [RFC PATCH 4/5] pNFS: Allow multiple connections to the DS

If the user specifies the -o nconnect=<number> mount option, and the
transport protocol is TCP, then set up <number> connections to the pNFS
data server as well. The connections will all go to the same IP address.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/nfs3client.c | 3 +++
fs/nfs/nfs4client.c | 3 +++
2 files changed, 6 insertions(+)

diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 7879f2a0fcfd..8c624c74ddbe 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -100,6 +100,9 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;

+ if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = mds_clp->cl_nconnect;
+
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index c9b10b7829f0..bfea1b232dd2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -912,6 +912,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;

+ if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP)
+ cl_init.nconnect = mds_clp->cl_nconnect;
+
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);

--
2.9.3


2017-05-02 16:38:19

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 2/3] pNFS: Fix a deadlock when coalescing writes and returning the layout

Consider the following deadlock:

Process P1                              Process P2                          Process P3
==========                              ==========                          ==========
                                                                            lock_page(page)

                                        lseg = pnfs_update_layout(inode)

lo = NFS_I(inode)->layout
pnfs_error_mark_layout_for_return(lo)

                                        lock_page(page)

                                                                            lseg = pnfs_update_layout(inode)

In this scenario,
- P1 has declared the layout to be in error, but P2 holds a reference to
a layout segment on that inode, so the layoutreturn is deferred.
- P2 is waiting for a page lock held by P3.
- P3 is asking for a new layout segment, but is blocked waiting
for the layoutreturn.

The fix is to ensure that pnfs_error_mark_layout_for_return() does
not set the NFS_LAYOUT_RETURN flag, which is what blocks P3. Instead,
we allow P3 to call LAYOUTGET so that it can make progress, release
the page lock, and thereby unblock P2.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cea1e838efae..adc6ec28d4b5 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2063,8 +2063,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
return;
}
pnfs_set_plh_return_info(lo, range.iomode, 0);
- /* Block LAYOUTGET */
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
--
2.9.3


2017-05-02 16:38:21

by Trond Myklebust

[permalink] [raw]
Subject: [PATCH 3/3] pNFS: Fix a typo in pnfs_generic_alloc_ds_commits

If the layout segment is invalid, we want to just resend the remaining
writes.

Signed-off-by: Trond Myklebust <[email protected]>
---
fs/nfs/pnfs_nfs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7697ac0ff81a..ae600ab1a646 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -223,7 +223,7 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
*/
if (!pnfs_is_valid_lseg(bucket->clseg) &&
!test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags))
- continue;
+ break;
data = nfs_commitdata_alloc(false);
if (!data)
break;
--
2.9.3