2013-08-15 02:36:19

by NeilBrown

Subject: [PATCH/RFC] Don't try to recover NFS locks when they are lost.



When an NFS client (NFSv4 specifically) loses contact with the server it can
lose any locks that it holds.
Currently, when it reconnects to the server it simply tries to reclaim
those locks. This might succeed even though some other client has held and
released a lock in the meantime. So the first client might think the file
is unchanged, but it isn't. This isn't good.

If, when recovery happens, the locks cannot be reclaimed because some other
client still holds the lock, then we get a message in the kernel logs, but
the client can still write. So two clients can both think they have a lock
and can both write at the same time. This is equally not good.

There was a patch a while ago
http://comments.gmane.org/gmane.linux.nfs/41917

which tried to address some of this, but it didn't seem to go anywhere.
That patch would also send a signal to the process. That might be useful
but I'm really just interested in failing the writes.
For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
write request, so we can fairly easily fail an IO if the lock is gone.

The patch below attempts to do this. Does it make sense?
Because this is a fairly big change I introduce a module parameter
"recover_locks" which defaults to true (the current behaviour) but can be set
to "false" to tell the client not to try to recover things that were lost.

Comments?

Thanks,
NeilBrown



diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f5c84c3..de0229b 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}

-static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
rpc_call_start(task);
+ return 0;
}

static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}

-static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
rpc_call_start(task);
+ return 0;
}

static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ee81e35..a468b345 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -135,6 +135,7 @@ struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
+#define NFS_LOCK_LOST 1
unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf11799..bcbcd07 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3907,15 +3907,19 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
}

-static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
task))
- return;
- nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_READ);
+ return 0;
+ if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_READ) == -EIO)
+ return -EIO;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ return -EIO;
+ return 0;
}

static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3990,15 +3994,19 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}

-static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
&data->res.seq_res,
task))
- return;
- nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_WRITE);
+ return 0;
+ if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
+ data->args.lock_context, FMODE_WRITE) == -EIO)
+ return -EIO;
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
+ return -EIO;
+ return 0;
}

static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -5380,6 +5388,11 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
return err;
}

+bool recover_locks = true;
+module_param(recover_locks, bool, 0644);
+MODULE_PARM_DESC(recover_locks,
+ "If the server reports that a lock might be lost, "
+ "try to recover it, risking corruption.");
static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
{
struct nfs_server *server = NFS_SERVER(state->inode);
@@ -5391,6 +5404,10 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
err = nfs4_set_lock_state(state, request);
if (err != 0)
return err;
+ if (!recover_locks) {
+ set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
+ return 0;
+ }
do {
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e22862f..4d103ff 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -998,7 +998,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
fl_pid = lockowner->l_pid;
spin_lock(&state->state_lock);
lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
- if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
+ if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+ ret = -EIO;
+ else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
nfs4_stateid_copy(dst, &lsp->ls_stateid);
ret = 0;
smp_rmb();
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c041c41..a8f57c7 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}

-static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
{
rpc_call_start(task);
+ return 0;
}

static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
@@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}

-static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
{
rpc_call_start(task);
+ return 0;
}

static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 70a26c6..31db5c3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata)
void nfs_read_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_read_data *data = calldata;
- NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- rpc_exit(task, -EIO);
+ int err;
+ err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
}

static const struct rpc_call_ops nfs_read_common_ops = {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f1bdb72..7816801 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1265,9 +1265,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
void nfs_write_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- rpc_exit(task, -EIO);
+ int err;
+ err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
}

void nfs_commit_prepare(struct rpc_task *task, void *calldata)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 8651574..c71e12b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1419,12 +1419,12 @@ struct nfs_rpc_ops {
void (*read_setup) (struct nfs_read_data *, struct rpc_message *);
void (*read_pageio_init)(struct nfs_pageio_descriptor *, struct inode *,
const struct nfs_pgio_completion_ops *);
- void (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
+ int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
int (*read_done) (struct rpc_task *, struct nfs_read_data *);
void (*write_setup) (struct nfs_write_data *, struct rpc_message *);
void (*write_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, int,
const struct nfs_pgio_completion_ops *);
- void (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
+ int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
int (*write_done) (struct rpc_task *, struct nfs_write_data *);
void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);



2013-08-16 10:38:38

by NeilBrown

Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

On Thu, 15 Aug 2013 08:47:06 -0400 Jeff Layton <[email protected]> wrote:

> On Thu, 15 Aug 2013 12:36:04 +1000
> NeilBrown <[email protected]> wrote:
>
> >
> >
> > When an NFS (V4 specifically) client loses contact with the server it can
> > lose any locks that it holds.
> > Currently when it reconnects to the server it simply tries to reclaim
> > those locks. This might succeed even though some other client has held and
> > released a lock in the mean time. So the first client might think the file
> > is unchanged, but it isn't. This isn't good.
> >
> > If, when recovery happens, the locks cannot be claimed because some other
> > client still holds the lock, then we get a message in the kernel logs, but
> > the client can still write. So two clients can both think they have a lock
> > and can both write at the same time. This is equally not good.
> >
> > There was a patch a while ago
> > http://comments.gmane.org/gmane.linux.nfs/41917
> >
> > which tried to address some of this, but it didn't seem to go anywhere.
> > That patch would also send a signal to the process. That might be useful
> > but I'm really just interested in failing the writes.
> > For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
> > write request so we can fairly easily fail an IO of the lock is gone.
> >
> > The patch below attempts to do this. Does it make sense?
> > Because this is a fairly big change I introduces a module parameter
> > "recover_locks" which defaults to true (the current behaviour) but can be set
> > to "false" to tell the client not to try to recover things that were lost.
> >
> > Comments?
> >
> > Thanks,
> > NeilBrown
> >
> >
>
> Failing a read or write when we can't recover a lock over the range
> seems reasonable to me. IIUC though, you're also saying that we
> shouldn't try to recover locks when the lease has expired? If so, then
> that seems wrong...
>
> Isn't it the responsibility of the server to not allow a lock to be
> reclaimed when there has been a conflicting lock in the interim? It's
> quite possible (and even advantageous) for a server to hold onto a lock
> for a client that has missed renewing its lease when no other client has
> made a conflicting lock request.

Hi Jeff,
I had thought that too. But when I looked I could find no evidence for it.
The only time a client can 'reclaim' a lock is during the grace period when
the server might have lost the lock due to a reboot.
The case I'm looking at is when neither host rebooted but there was a network
partition.
I think that if the server is to preserve the lock while no other client
contends it, it has to preserve the whole state and not return
NFS4ERR_EXPIRED.
Once the client gets NFS4ERR_EXPIRED it must assume that all related locks
may have been subject to conflicting locks from other clients.
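
To spell the distinction out, here is a rough sketch (hypothetical "reason"
values and reclaim_lock() helper, not the actual state-manager code) of the
two cases as I see them:

        /* Illustration only: how recovery could branch on *why* the
         * state was lost. */
        switch (reason) {
        case STATE_LOST_SERVER_REBOOT:
                /* Server is in its grace period; reclaiming is safe
                 * because it cannot have granted a conflicting lock in
                 * the interim. */
                reclaim_lock(lsp);
                break;
        case STATE_LOST_LEASE_EXPIRED:
                /* NFS4ERR_EXPIRED: the server purged our state, so
                 * another client may have held a conflicting lock.
                 * Mark the lock lost and let later reads/writes fail
                 * rather than pretend nothing happened. */
                set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
                break;
        }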

Thanks,
NeilBrown


>
> >
> > diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> > index f5c84c3..de0229b 100644
> > --- a/fs/nfs/nfs3proc.c
> > +++ b/fs/nfs/nfs3proc.c
> > @@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
> > msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
> > }
> >
> > -static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > +static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > {
> > rpc_call_start(task);
> > + return 0;
> > }
> >
> > static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
> > @@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
> > msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
> > }
> >
> > -static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > +static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > {
> > rpc_call_start(task);
> > + return 0;
> > }
> >
> > static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> > diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
> > index ee81e35..a468b345 100644
> > --- a/fs/nfs/nfs4_fs.h
> > +++ b/fs/nfs/nfs4_fs.h
> > @@ -135,6 +135,7 @@ struct nfs4_lock_state {
> > struct list_head ls_locks; /* Other lock stateids */
> > struct nfs4_state * ls_state; /* Pointer to open state */
> > #define NFS_LOCK_INITIALIZED 0
> > +#define NFS_LOCK_LOST 1
> > unsigned long ls_flags;
> > struct nfs_seqid_counter ls_seqid;
> > nfs4_stateid ls_stateid;
> > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> > index cf11799..bcbcd07 100644
> > --- a/fs/nfs/nfs4proc.c
> > +++ b/fs/nfs/nfs4proc.c
> > @@ -3907,15 +3907,19 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
> > nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
> > }
> >
> > -static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > +static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > {
> > if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> > &data->args.seq_args,
> > &data->res.seq_res,
> > task))
> > - return;
> > - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> > - data->args.lock_context, FMODE_READ);
> > + return 0;
> > + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> > + data->args.lock_context, FMODE_READ) == -EIO)
> > + return -EIO;
> > + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> > + return -EIO;
> > + return 0;
> > }
> >
> > static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
> > @@ -3990,15 +3994,19 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
> > nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
> > }
> >
> > -static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > +static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > {
> > if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> > &data->args.seq_args,
> > &data->res.seq_res,
> > task))
> > - return;
> > - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> > - data->args.lock_context, FMODE_WRITE);
> > + return 0;
> > + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> > + data->args.lock_context, FMODE_WRITE) == -EIO)
> > + return -EIO;
> > + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> > + return -EIO;
> > + return 0;
> > }
> >
> > static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> > @@ -5380,6 +5388,11 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
> > return err;
> > }
> >
> > +bool recover_locks = true;
> > +module_param(recover_locks, bool, 0644);
> > +MODULE_PARM_DESC(recovery_locks,
> > + "If the server reports that a lock might be lost, "
> > + "try to recovery it risking corruption.");
> > static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
> > {
> > struct nfs_server *server = NFS_SERVER(state->inode);
> > @@ -5391,6 +5404,10 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
> > err = nfs4_set_lock_state(state, request);
> > if (err != 0)
> > return err;
> > + if (!recover_locks) {
> > + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
> > + return 0;
> > + }
> > do {
> > if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
> > return 0;
> > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> > index e22862f..4d103ff 100644
> > --- a/fs/nfs/nfs4state.c
> > +++ b/fs/nfs/nfs4state.c
> > @@ -998,7 +998,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
> > fl_pid = lockowner->l_pid;
> > spin_lock(&state->state_lock);
> > lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
> > - if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> > + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
> > + ret = -EIO;
> > + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> > nfs4_stateid_copy(dst, &lsp->ls_stateid);
> > ret = 0;
> > smp_rmb();
> > diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
> > index c041c41..a8f57c7 100644
> > --- a/fs/nfs/proc.c
> > +++ b/fs/nfs/proc.c
> > @@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
> > msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
> > }
> >
> > -static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > +static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> > {
> > rpc_call_start(task);
> > + return 0;
> > }
> >
> > static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
> > @@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
> > msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
> > }
> >
> > -static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > +static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> > {
> > rpc_call_start(task);
> > + return 0;
> > }
> >
> > static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> > diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> > index 70a26c6..31db5c3 100644
> > --- a/fs/nfs/read.c
> > +++ b/fs/nfs/read.c
> > @@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata)
> > void nfs_read_prepare(struct rpc_task *task, void *calldata)
> > {
> > struct nfs_read_data *data = calldata;
> > - NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
> > - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> > - rpc_exit(task, -EIO);
> > + int err;
> > + err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
> > + if (err)
> > + rpc_exit(task, err);
> > }
> >
> > static const struct rpc_call_ops nfs_read_common_ops = {
> > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > index f1bdb72..7816801 100644
> > --- a/fs/nfs/write.c
> > +++ b/fs/nfs/write.c
> > @@ -1265,9 +1265,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
> > void nfs_write_prepare(struct rpc_task *task, void *calldata)
> > {
> > struct nfs_write_data *data = calldata;
> > - NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
> > - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> > - rpc_exit(task, -EIO);
> > + int err;
> > + err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
> > + if (err)
> > + rpc_exit(task, err);
> > }
> >
> > void nfs_commit_prepare(struct rpc_task *task, void *calldata)
> > diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> > index 8651574..c71e12b 100644
> > --- a/include/linux/nfs_xdr.h
> > +++ b/include/linux/nfs_xdr.h
> > @@ -1419,12 +1419,12 @@ struct nfs_rpc_ops {
> > void (*read_setup) (struct nfs_read_data *, struct rpc_message *);
> > void (*read_pageio_init)(struct nfs_pageio_descriptor *, struct inode *,
> > const struct nfs_pgio_completion_ops *);
> > - void (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
> > + int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
> > int (*read_done) (struct rpc_task *, struct nfs_read_data *);
> > void (*write_setup) (struct nfs_write_data *, struct rpc_message *);
> > void (*write_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, int,
> > const struct nfs_pgio_completion_ops *);
> > - void (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
> > + int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
> > int (*write_done) (struct rpc_task *, struct nfs_write_data *);
> > void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
> > void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);
>
>



2013-08-16 17:14:01

by Chuck Lever III

Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.


On Aug 16, 2013, at 9:30 AM, Jeff Layton <[email protected]> wrote:

> On Fri, 16 Aug 2013 20:38:21 +1000
> NeilBrown <[email protected]> wrote:
>
>> On Thu, 15 Aug 2013 08:47:06 -0400 Jeff Layton <[email protected]> wrote:
>>
>>> On Thu, 15 Aug 2013 12:36:04 +1000
>>> NeilBrown <[email protected]> wrote:
>>>
>>>>
>>>>
>>>> When an NFS (V4 specifically) client loses contact with the server it can
>>>> lose any locks that it holds.
>>>> Currently when it reconnects to the server it simply tries to reclaim
>>>> those locks. This might succeed even though some other client has held and
>>>> released a lock in the mean time. So the first client might think the file
>>>> is unchanged, but it isn't. This isn't good.
>>>>
>>>> If, when recovery happens, the locks cannot be claimed because some other
>>>> client still holds the lock, then we get a message in the kernel logs, but
>>>> the client can still write. So two clients can both think they have a lock
>>>> and can both write at the same time. This is equally not good.
>>>>
>>>> There was a patch a while ago
>>>> http://comments.gmane.org/gmane.linux.nfs/41917
>>>>
>>>> which tried to address some of this, but it didn't seem to go anywhere.
>>>> That patch would also send a signal to the process. That might be useful
>>>> but I'm really just interested in failing the writes.
>>>> For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
>>>> write request so we can fairly easily fail an IO of the lock is gone.
>>>>
>>>> The patch below attempts to do this. Does it make sense?
>>>> Because this is a fairly big change I introduces a module parameter
>>>> "recover_locks" which defaults to true (the current behaviour) but can be set
>>>> to "false" to tell the client not to try to recover things that were lost.
>>>>
>>>> Comments?
>>>>
>>>> Thanks,
>>>> NeilBrown
>>>>
>>>>
>>>
>>> Failing a read or write when we can't recover a lock over the range
>>> seems reasonable to me. IIUC though, you're also saying that we
>>> shouldn't try to recover locks when the lease has expired? If so, then
>>> that seems wrong...
>>>
>>> Isn't it the responsibility of the server to not allow a lock to be
>>> reclaimed when there has been a conflicting lock in the interim? It's
>>> quite possible (and even advantageous) for a server to hold onto a lock
>>> for a client that has missed renewing its lease when no other client has
>>> made a conflicting lock request.
>>
>> Hi Jeff,
>> I had thought that too. But when I looked I could find no evidence for it.
>> The only time a client can 'reclaim' a lock is during the grace period when
>> the server might have lost the lock due to a reboot.
>> The case I'm looking at is when neither host rebooted but there was a network
>> partition.
>> I think that if the server is to preserve the lock while no other client
>> contends it, it has to preserve the whole state and not return
>> NFS4ERR_EXPIRED.
>> Once the client gets NFS4ERR_EXPIRED it must assume that all related locks
>> may have been subject to conflicting locks from other clients.
>>
>> Thanks,
>> NeilBrown
>>
>>
>
> Ahh, a very good point. So I guess to reiterate, NFS4ERR_EXPIRED means
> that the state (if there was any) has been purged, and all bets are
> off. So yeah, ok...trying to reclaim locks at that point is probably
> wrong.
>
> That said...why is the server granting those lock reclaims in this case?
> Presumably the grace period has passed and it shouldn't be granting any
> reclaim requests, right?

After NFS4ERR_EXPIRED, our client uses a normal open (OPEN(CLAIM_NULL)). That is allowed any time the server is out of its grace period.




--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com





2013-08-16 13:29:36

by Jeff Layton

Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

On Fri, 16 Aug 2013 20:38:21 +1000
NeilBrown <[email protected]> wrote:

> On Thu, 15 Aug 2013 08:47:06 -0400 Jeff Layton <[email protected]> wrote:
>
> > On Thu, 15 Aug 2013 12:36:04 +1000
> > NeilBrown <[email protected]> wrote:
> >
> > >
> > >
> > > When an NFS (V4 specifically) client loses contact with the server it can
> > > lose any locks that it holds.
> > > Currently when it reconnects to the server it simply tries to reclaim
> > > those locks. This might succeed even though some other client has held and
> > > released a lock in the mean time. So the first client might think the file
> > > is unchanged, but it isn't. This isn't good.
> > >
> > > If, when recovery happens, the locks cannot be claimed because some other
> > > client still holds the lock, then we get a message in the kernel logs, but
> > > the client can still write. So two clients can both think they have a lock
> > > and can both write at the same time. This is equally not good.
> > >
> > > There was a patch a while ago
> > > http://comments.gmane.org/gmane.linux.nfs/41917
> > >
> > > which tried to address some of this, but it didn't seem to go anywhere.
> > > That patch would also send a signal to the process. That might be useful
> > > but I'm really just interested in failing the writes.
> > > For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
> > > write request so we can fairly easily fail an IO of the lock is gone.
> > >
> > > The patch below attempts to do this. Does it make sense?
> > > Because this is a fairly big change I introduces a module parameter
> > > "recover_locks" which defaults to true (the current behaviour) but can be set
> > > to "false" to tell the client not to try to recover things that were lost.
> > >
> > > Comments?
> > >
> > > Thanks,
> > > NeilBrown
> > >
> > >
> >
> > Failing a read or write when we can't recover a lock over the range
> > seems reasonable to me. IIUC though, you're also saying that we
> > shouldn't try to recover locks when the lease has expired? If so, then
> > that seems wrong...
> >
> > Isn't it the responsibility of the server to not allow a lock to be
> > reclaimed when there has been a conflicting lock in the interim? It's
> > quite possible (and even advantageous) for a server to hold onto a lock
> > for a client that has missed renewing its lease when no other client has
> > made a conflicting lock request.
>
> Hi Jeff,
> I had thought that too. But when I looked I could find no evidence for it.
> The only time a client can 'reclaim' a lock is during the grace period when
> the server might have lost the lock due to a reboot.
> The case I'm looking at is when neither host rebooted but there was a network
> partition.
> I think that if the server is to preserve the lock while no other client
> contends it, it has to preserve the whole state and not return
> NFS4ERR_EXPIRED.
> Once the client gets NFS4ERR_EXPIRED it must assume that all related locks
> may have been subject to conflicting locks from other clients.
>
> Thanks,
> NeilBrown
>
>

Ahh, a very good point. So I guess to reiterate, NFS4ERR_EXPIRED means
that the state (if there was any) has been purged, and all bets are
off. So yeah, ok...trying to reclaim locks at that point is probably
wrong.

That said...why is the server granting those lock reclaims in this case?
Presumably the grace period has passed and it shouldn't be granting any
reclaim requests, right?





--
Jeff Layton <[email protected]>



2013-08-15 12:40:09

by Malahal Naineni

Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

NeilBrown <neilb@...> writes:
> Because this is a fairly big change I introduces a module parameter
> "recover_locks" which defaults to true (the current behaviour) but can be set
> to "false" to tell the client not to try to recover things that were lost.

The current behaviour is broken, so why do we want to keep it? I would say
set it to 'false' by default, and if someone really wants the current broken
behaviour, they can set it to 'true'.

> -static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> +static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> {
> if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> &data->args.seq_args,
> &data->res.seq_res,
> task))
> - return;
> - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> - data->args.lock_context, FMODE_READ);
> + return 0;
> + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> + data->args.lock_context, FMODE_READ) == -EIO)
> + return -EIO;

Do we want to check only for an -EIO return and ignore other errors?
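
For instance (sketch only, assuming a local "ret"), the prepare routine could
simply propagate whatever nfs4_set_rw_stateid() returns:

        ret = nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
                                  data->args.lock_context, FMODE_READ);
        if (ret)        /* propagate any error, not just -EIO */
                return ret;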

> + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> + return -EIO;
> + return 0;
> }
>
> static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
> @@ -3990,15 +3994,19 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
> nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
> }
>
> -static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> +static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> {
> if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> &data->args.seq_args,
> &data->res.seq_res,
> task))
> - return;
> - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> - data->args.lock_context, FMODE_WRITE);
> + return 0;
> + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> + data->args.lock_context, FMODE_WRITE) == -EIO)
> + return -EIO;

Do we want to check only for an -EIO return and ignore other errors?

> + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> + return -EIO;
> + return 0;
> }
>
> static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> @@ -5380,6 +5388,11 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
> return err;
> }
>
> +bool recover_locks = true;
> +module_param(recover_locks, bool, 0644);
> +MODULE_PARM_DESC(recovery_locks,
> + "If the server reports that a lock might be lost, "
> + "try to recovery it risking corruption.");
> static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
> {
> struct nfs_server *server = NFS_SERVER(state->inode);
> @@ -5391,6 +5404,10 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
> err = nfs4_set_lock_state(state, request);
> if (err != 0)
> return err;
> + if (!recover_locks) {
> + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
> + return 0;
> + }
> do {
> if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
> return 0;
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index e22862f..4d103ff 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -998,7 +998,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
> fl_pid = lockowner->l_pid;
> spin_lock(&state->state_lock);
> lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
> - if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
> + ret = -EIO;
> + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> nfs4_stateid_copy(dst, &lsp->ls_stateid);
> ret = 0;
> smp_rmb();

(lsp) and (lsp != NULL) are the same, but it would be good to use the same
form for both checks. Also, Trond prefers returning EBADF rather than EIO in
this lock-loss case, so you may need to change EIO to EBADF.
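
For illustration, the two branches could be written consistently like this
(sketch only; substitute EBADF if that is preferred):

        if (lsp != NULL && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) {
                ret = -EIO;     /* or -EBADF */
        } else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
                /* copy the lock stateid as before */
                nfs4_stateid_copy(dst, &lsp->ls_stateid);
                ret = 0;
        }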

The patch looks good. Thank you.

Regards, Malahal.



2013-08-15 12:46:18

by Jeff Layton

Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

On Thu, 15 Aug 2013 12:36:04 +1000
NeilBrown <[email protected]> wrote:

>
>
> When an NFS (V4 specifically) client loses contact with the server it can
> lose any locks that it holds.
> Currently when it reconnects to the server it simply tries to reclaim
> those locks. This might succeed even though some other client has held and
> released a lock in the mean time. So the first client might think the file
> is unchanged, but it isn't. This isn't good.
>
> If, when recovery happens, the locks cannot be claimed because some other
> client still holds the lock, then we get a message in the kernel logs, but
> the client can still write. So two clients can both think they have a lock
> and can both write at the same time. This is equally not good.
>
> There was a patch a while ago
> http://comments.gmane.org/gmane.linux.nfs/41917
>
> which tried to address some of this, but it didn't seem to go anywhere.
> That patch would also send a signal to the process. That might be useful
> but I'm really just interested in failing the writes.
> For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
> write request so we can fairly easily fail an IO of the lock is gone.
>
> The patch below attempts to do this. Does it make sense?
> Because this is a fairly big change I introduces a module parameter
> "recover_locks" which defaults to true (the current behaviour) but can be set
> to "false" to tell the client not to try to recover things that were lost.
>
> Comments?
>
> Thanks,
> NeilBrown
>
>

Failing a read or write when we can't recover a lock over the range
seems reasonable to me. IIUC though, you're also saying that we
shouldn't try to recover locks when the lease has expired? If so, then
that seems wrong...

Isn't it the responsibility of the server to not allow a lock to be
reclaimed when there has been a conflicting lock in the interim? It's
quite possible (and even advantageous) for a server to hold onto a lock
for a client that has missed renewing its lease when no other client has
made a conflicting lock request.

>
> diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
> index f5c84c3..de0229b 100644
> --- a/fs/nfs/nfs3proc.c
> +++ b/fs/nfs/nfs3proc.c
> @@ -826,9 +826,10 @@ static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message
> msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
> }
>
> -static void nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> +static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> {
> rpc_call_start(task);
> + return 0;
> }
>
> static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
> @@ -847,9 +848,10 @@ static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
> msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
> }
>
> -static void nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> +static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> {
> rpc_call_start(task);
> + return 0;
> }
>
> static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
> index ee81e35..a468b345 100644
> --- a/fs/nfs/nfs4_fs.h
> +++ b/fs/nfs/nfs4_fs.h
> @@ -135,6 +135,7 @@ struct nfs4_lock_state {
> struct list_head ls_locks; /* Other lock stateids */
> struct nfs4_state * ls_state; /* Pointer to open state */
> #define NFS_LOCK_INITIALIZED 0
> +#define NFS_LOCK_LOST 1
> unsigned long ls_flags;
> struct nfs_seqid_counter ls_seqid;
> nfs4_stateid ls_stateid;
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index cf11799..bcbcd07 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -3907,15 +3907,19 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
> nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
> }
>
> -static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> +static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> {
> if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> &data->args.seq_args,
> &data->res.seq_res,
> task))
> - return;
> - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> - data->args.lock_context, FMODE_READ);
> + return 0;
> + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> + data->args.lock_context, FMODE_READ) == -EIO)
> + return -EIO;
> + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> + return -EIO;
> + return 0;
> }
>
> static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
> @@ -3990,15 +3994,19 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
> nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
> }
>
> -static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> +static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> {
> if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
> &data->args.seq_args,
> &data->res.seq_res,
> task))
> - return;
> - nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> - data->args.lock_context, FMODE_WRITE);
> + return 0;
> + if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
> + data->args.lock_context, FMODE_WRITE) == -EIO)
> + return -EIO;
> + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> + return -EIO;
> + return 0;
> }
>
> static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> @@ -5380,6 +5388,11 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
> return err;
> }
>
> +bool recover_locks = true;
> +module_param(recover_locks, bool, 0644);
> +MODULE_PARM_DESC(recover_locks,
> + "If the server reports that a lock might be lost, "
> + "try to recover it, risking corruption.");
> static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
> {
> struct nfs_server *server = NFS_SERVER(state->inode);
> @@ -5391,6 +5404,10 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
> err = nfs4_set_lock_state(state, request);
> if (err != 0)
> return err;
> + if (!recover_locks) {
> + set_bit(NFS_LOCK_LOST, &request->fl_u.nfs4_fl.owner->ls_flags);
> + return 0;
> + }
> do {
> if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
> return 0;
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index e22862f..4d103ff 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -998,7 +998,9 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
> fl_pid = lockowner->l_pid;
> spin_lock(&state->state_lock);
> lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
> - if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> + if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
> + ret = -EIO;
> + else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
> nfs4_stateid_copy(dst, &lsp->ls_stateid);
> ret = 0;
> smp_rmb();
> diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
> index c041c41..a8f57c7 100644
> --- a/fs/nfs/proc.c
> +++ b/fs/nfs/proc.c
> @@ -623,9 +623,10 @@ static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *
> msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
> }
>
> -static void nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> +static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
> {
> rpc_call_start(task);
> + return 0;
> }
>
> static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
> @@ -644,9 +645,10 @@ static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message
> msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
> }
>
> -static void nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> +static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
> {
> rpc_call_start(task);
> + return 0;
> }
>
> static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
> index 70a26c6..31db5c3 100644
> --- a/fs/nfs/read.c
> +++ b/fs/nfs/read.c
> @@ -513,9 +513,10 @@ static void nfs_readpage_release_common(void *calldata)
> void nfs_read_prepare(struct rpc_task *task, void *calldata)
> {
> struct nfs_read_data *data = calldata;
> - NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
> - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> - rpc_exit(task, -EIO);
> + int err;
> + err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
> + if (err)
> + rpc_exit(task, err);
> }
>
> static const struct rpc_call_ops nfs_read_common_ops = {
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index f1bdb72..7816801 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -1265,9 +1265,10 @@ EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
> void nfs_write_prepare(struct rpc_task *task, void *calldata)
> {
> struct nfs_write_data *data = calldata;
> - NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
> - if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
> - rpc_exit(task, -EIO);
> + int err;
> + err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
> + if (err)
> + rpc_exit(task, err);
> }
>
> void nfs_commit_prepare(struct rpc_task *task, void *calldata)
> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
> index 8651574..c71e12b 100644
> --- a/include/linux/nfs_xdr.h
> +++ b/include/linux/nfs_xdr.h
> @@ -1419,12 +1419,12 @@ struct nfs_rpc_ops {
> void (*read_setup) (struct nfs_read_data *, struct rpc_message *);
> void (*read_pageio_init)(struct nfs_pageio_descriptor *, struct inode *,
> const struct nfs_pgio_completion_ops *);
> - void (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
> + int (*read_rpc_prepare)(struct rpc_task *, struct nfs_read_data *);
> int (*read_done) (struct rpc_task *, struct nfs_read_data *);
> void (*write_setup) (struct nfs_write_data *, struct rpc_message *);
> void (*write_pageio_init)(struct nfs_pageio_descriptor *, struct inode *, int,
> const struct nfs_pgio_completion_ops *);
> - void (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
> + int (*write_rpc_prepare)(struct rpc_task *, struct nfs_write_data *);
> int (*write_done) (struct rpc_task *, struct nfs_write_data *);
> void (*commit_setup) (struct nfs_commit_data *, struct rpc_message *);
> void (*commit_rpc_prepare)(struct rpc_task *, struct nfs_commit_data *);


--
Jeff Layton <[email protected]>

2013-09-04 00:49:59

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

On Tue, 3 Sep 2013 18:43:23 +0000 "Myklebust, Trond"
<[email protected]> wrote:

> On Thu, 2013-08-15 at 12:36 +1000, NeilBrown wrote:
> >
> > When an NFS (V4 specifically) client loses contact with the server it can
> > lose any locks that it holds.
> > Currently when it reconnects to the server it simply tries to reclaim
> > those locks. This might succeed even though some other client has held and
> > released a lock in the mean time. So the first client might think the file
> > is unchanged, but it isn't. This isn't good.
> >
> > If, when recovery happens, the locks cannot be claimed because some other
> > client still holds the lock, then we get a message in the kernel logs, but
> > the client can still write. So two clients can both think they have a lock
> > and can both write at the same time. This is equally not good.
> >
> > There was a patch a while ago
> > http://comments.gmane.org/gmane.linux.nfs/41917
> >
> > which tried to address some of this, but it didn't seem to go anywhere.
> > That patch would also send a signal to the process. That might be useful
> > but I'm really just interested in failing the writes.
> > For NFSv4 (unlike v2/v3) there is a strong link between the lock and the
> > write request so we can fairly easily fail an IO if the lock is gone.
> >
> > The patch below attempts to do this. Does it make sense?
> > Because this is a fairly big change I introduce a module parameter
> > "recover_locks" which defaults to true (the current behaviour) but can be set
> > to "false" to tell the client not to try to recover things that were lost.
> >
> > Comments?
>
> I think this patch is close to being usable. A couple of questions,
> though:
>
> 1. What happens if another process' open() causes us to receive a
> delegation after NFS_LOCK_LOST has been set on our lock stateid,
> but before we call nfs4_set_rw_stateid()?

Good point. I think we need to check for NFS_LOCK_LOST before checking for a
delegation. Does the incremental patch below look OK?
It takes a spinlock, in the case where we have a delegation and also hold
some locks, that it didn't have to take before. Is that a concern?


> 2. Shouldn't we clear NFS_LOCK_LOST at some point? It looks to me
> as if a process which sees the EIO, and decides to recover by
> calling close(), reopen()ing the file and then locking it again,
> might find NFS_LOCK_LOST still being set.


NFS_LOCK_LOST is per nfs4_lock_state, which should be freed by
nfs4_fl_release_lock().
So when the file is closed, the locks are dropped, and the structure holding
the NFS_LOCK_LOST flag goes away.
Or did I miss something?
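
To illustrate that lifetime argument, here is a tiny stand-alone C sketch; the
structure and helpers are invented stand-ins (not nfs4_lock_state or
nfs4_fl_release_lock()), and only the allocate-on-lock / free-on-release
behaviour is modelled:

/*
 * Stand-alone sketch of the lifetime argument above.  The structure and
 * helpers are invented stand-ins; the point is only that the LOST flag
 * lives in a per-lock structure that is freed on unlock/close, so a new
 * lock starts with a fresh, clear flag.
 */
#include <stdio.h>
#include <stdlib.h>

#define FLAG_LOST (1u << 1)

struct lock_state_model {
	unsigned long flags;		/* would hold NFS_LOCK_LOST */
};

static struct lock_state_model *take_lock(void)
{
	/* flags start clear in a freshly allocated structure */
	return calloc(1, sizeof(struct lock_state_model));
}

static void release_lock(struct lock_state_model *ls)
{
	free(ls);			/* flag goes away with the structure */
}

int main(void)
{
	struct lock_state_model *ls = take_lock();

	if (!ls)
		return 1;
	ls->flags |= FLAG_LOST;		/* server reported the lock lost */
	printf("old lock lost? %d\n", !!(ls->flags & FLAG_LOST));

	release_lock(ls);		/* close()/unlock drops the lock state */
	ls = take_lock();		/* reopen + re-lock: fresh structure */
	if (!ls)
		return 1;
	printf("new lock lost? %d\n", !!(ls->flags & FLAG_LOST));

	release_lock(ls);
	return 0;
}

Under that assumption a re-lock after close() starts from a freshly allocated
structure, so the LOST bit cannot leak into the new lock.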

Thanks,
NeilBrown


diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 4d103ff..bb1fd5d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1040,10 +1040,11 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
fmode_t fmode, const struct nfs_lockowner *lockowner)
{
- int ret = 0;
+ int ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ if (ret == -EIO)
+ goto out;
if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
goto out;
- ret = nfs4_copy_lock_stateid(dst, state, lockowner);
if (ret != -ENOENT)
goto out;
ret = nfs4_copy_open_stateid(dst, state);


Attachments:
signature.asc (828.00 B)

2013-09-04 03:17:43

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

Hi Neil,

That looks better, but we still want to send the delegation stateid in the case where we have both a lock and a delegation.

Cheers
Trond

Sent from my tablet.


2013-09-04 03:25:32

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

On Wed, 4 Sep 2013 03:17:41 +0000 "Myklebust, Trond"
<[email protected]> wrote:

> Hi Neil,
>
> That looks better, but we still want to send the delegation stateid in the case where we have both a lock and a delegation.

That is exactly what that code does.
First it checks for locks and aborts if a lost lock is found.
Then it checks for delegations and returns one if found.
Then (if there is no delegation) it returns the lock stateid if one was found.
Finally it falls back to the open stateid.
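
To make that ordering concrete, here is a small stand-alone C model; the types
and helpers are invented stand-ins, not the kernel's nfs4_select_rw_stateid(),
and only the order of the four checks follows the incremental patch above:

/*
 * Stand-alone sketch of the stateid selection order described above.
 * The types and helper results are stand-ins; only the ordering of the
 * checks is modelled.
 */
#include <stdio.h>
#include <errno.h>

enum lock_status { LOCK_OK = 0, LOCK_NONE = -ENOENT, LOCK_LOST = -EIO };

struct rw_state {			/* stand-in for per-file client state */
	enum lock_status lock;		/* result of looking up a lock stateid */
	int has_delegation;		/* non-zero if a delegation is held */
};

/* Returns the name of the stateid that would be sent, or sets *err = -EIO. */
static const char *select_stateid(const struct rw_state *s, int *err)
{
	*err = 0;
	if (s->lock == LOCK_LOST) {	/* 1. lost lock: fail the I/O */
		*err = -EIO;
		return "(none)";
	}
	if (s->has_delegation)		/* 2. a delegation, if held, is sent */
		return "delegation stateid";
	if (s->lock == LOCK_OK)		/* 3. otherwise a valid lock stateid */
		return "lock stateid";
	return "open stateid";		/* 4. fall back to the open stateid */
}

int main(void)
{
	const struct rw_state cases[] = {
		{ LOCK_LOST, 1 },	/* lost lock beats a delegation */
		{ LOCK_OK,   1 },	/* delegation beats a healthy lock */
		{ LOCK_OK,   0 },
		{ LOCK_NONE, 0 },
	};
	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		int err;
		const char *which = select_stateid(&cases[i], &err);

		printf("case %u: %s (err=%d)\n", i, which, err);
	}
	return 0;
}

With that ordering, a lost lock fails the I/O with -EIO even if a delegation
has since been handed out, while a delegation (when the lock is healthy or
absent) is still the stateid that gets sent.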

I'll combine it all into one patch and submit properly.

Thanks,
NeilBrown



Attachments:
signature.asc (828.00 B)

2013-09-03 18:43:25

by Myklebust, Trond

[permalink] [raw]
Subject: Re: [PATCH/RFC] Don't try to recover NFS locks when they are lost.

I think this patch is close to being usable. A couple of questions,
though:

     1. What happens if another process' open() causes us to receive a
        delegation after NFS_LOCK_LOST has been set on our lock stateid,
        but before we call nfs4_set_rw_stateid()?
     2. Shouldn't we clear NFS_LOCK_LOST at some point? It looks to me
        as if a process which sees the EIO, and decides to recover by
        calling close(), reopen()ing the file and then locking it again,
        might find NFS_LOCK_LOST still being set.

Cheers
  Trond
--
Trond Myklebust
Linux NFS client maintainer

NetApp
[email protected]
www.netapp.com