Hi Peter,
The following patchset takes up the theme from the NLM patch that you
sent me a couple of weeks ago, and re-implements your fix in terms
that are closer to the existing NFSv4 implementation (which you
said was fine).
Cheers
Trond
---
Trond Myklebust
Linux NFS client maintainer
NetApp
[email protected]
http://www.netapp.com
--- linux-2.6.24.i686/fs/lockd/clntproc.c.org
+++ linux-2.6.24.i686/fs/lockd/clntproc.c
@@ -155,8 +155,6 @@ static void nlmclnt_release_lockargs(str
int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
{
struct nlm_rqst *call;
- sigset_t oldset;
- unsigned long flags;
int status;
nlm_get_host(host);
@@ -168,22 +166,6 @@ int nlmclnt_proc(struct nlm_host *host,
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
- /* Keep the old signal mask */
- spin_lock_irqsave(&current->sighand->siglock, flags);
- oldset = current->blocked;
-
- /* If we're cleaning up locks because the process is exiting,
- * perform the RPC call asynchronously. */
- if ((IS_SETLK(cmd) || IS_SETLKW(cmd))
- && fl->fl_type == F_UNLCK
- && (current->flags & PF_EXITING)) {
- sigfillset(&current->blocked); /* Mask all signals */
- recalc_sigpending();
-
- call->a_flags = RPC_TASK_ASYNC;
- }
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-
if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
if (fl->fl_type != F_UNLCK) {
call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
@@ -192,17 +174,14 @@ int nlmclnt_proc(struct nlm_host *host,
status = nlmclnt_unlock(call, fl);
} else if (IS_GETLK(cmd))
status = nlmclnt_test(call, fl);
- else
+ else {
+ nlm_release_call(call);
status = -EINVAL;
+ }
fl->fl_ops->fl_release_private(fl);
fl->fl_ops = NULL;
- spin_lock_irqsave(&current->sighand->siglock, flags);
- current->blocked = oldset;
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-
dprintk("lockd: clnt proc returns %d\n", status);
return status;
}
@@ -596,9 +575,34 @@ nlmclnt_reclaim(struct nlm_host *host, s
static int
nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
{
- struct nlm_host *host = req->a_host;
- struct nlm_res *resp = &req->a_res;
+ struct nlm_host *host = req->a_host;
+ sigset_t oldset;
+ unsigned long flags;
int status = 0;
+ struct rpc_message msg = {
+ .rpc_argp = &req->a_args,
+ .rpc_resp = &req->a_res,
+ };
+ struct rpc_clnt *clnt;
+ struct rpc_task *task;
+ struct rpc_task_setup task_setup_data = {
+ .rpc_message = &msg,
+ .callback_ops = &nlmclnt_unlock_ops,
+ .callback_data = req,
+ .flags = RPC_TASK_ASYNC,
+ };
+
+ /* Keep the old signal mask */
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ oldset = current->blocked;
+
+ /* If we're cleaning up locks because the process is exiting,
+ * perform the RPC call asynchronously. */
+ if (current->flags & PF_EXITING) {
+ sigfillset(&current->blocked); /* Mask all signals */
+ recalc_sigpending();
+ }
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
/*
* Note: the server is supposed to either grant us the unlock
@@ -609,27 +613,38 @@ nlmclnt_unlock(struct nlm_rqst *req, str
down_read(&host->h_rwsem);
if (do_vfs_lock(fl) == -ENOENT) {
up_read(&host->h_rwsem);
- goto out;
+ goto err;
}
up_read(&host->h_rwsem);
- if (req->a_flags & RPC_TASK_ASYNC)
- return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+ /* If we have no RPC client yet, create one. */
+ clnt = nlm_bind_host(host);
+ if (clnt == NULL)
+ goto err;
- status = nlmclnt_call(req, NLMPROC_UNLOCK);
- if (status < 0)
- goto out;
+ msg.rpc_proc = &clnt->cl_procinfo[NLMPROC_UNLOCK];
+
+ task_setup_data.rpc_client = clnt;
- if (resp->status == nlm_granted)
+ task = rpc_run_task(&task_setup_data);
+ status = PTR_ERR(task);
+ if (IS_ERR(task))
goto out;
- if (resp->status != nlm_lck_denied_nolocks)
- printk("lockd: unexpected unlock status: %d\n", resp->status);
- /* What to do now? I'm out of my depth... */
- status = -ENOLCK;
+ status = rpc_wait_for_completion_task(task);
+ rpc_put_task(task);
+
out:
- nlm_release_call(req);
+ spin_lock_irqsave(&current->sighand->siglock, flags);
+ current->blocked = oldset;
+ recalc_sigpending();
+ spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
return status;
+
+err:
+ nlm_release_call(req);
+ goto out;
}
static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
On Fri, 2008-03-28 at 17:37 -0400, Peter Staubach wrote:
> However, I think that nlmclnt_unlock() needs to wait until
> the RPC is completed.
It should do that now. See the call to rpc_wait_for_completion_task() in
nlm_async_call()
> The original problem was test12() in
> the Connectathon testsuite, which would occasionally fail.
> It would fail because the parent would kill the child process
> (actually the child of the child) and immediately attempt to
> grab the lock. This would fail because the child hadn't
> completed releasing the lock yet. There were some timing
> dependencies in test12() itself, which I eliminated, but then
> discovered that this wouldn't solve the entire problem. (I
> can send you the new version of test12(), if you wish.)
So, at least in 2.6.25, the call to rpc_wait_for_completion_task() will
exit only on a fatal signal. The problem in test12() is that there is a
'pre-existing condition', in that the parent signalled us with a SIGINT,
and so the signal is set upon entry to the function.
IOW: we might have to perform a similar trick to what do_coredump()
does, and clear the TIF_SIGPENDING flag. I'm not sure if that is
sufficient, but given that we're eliminating the calls to
recalc_sigpending(), and that there should be no such calls left in the
RPC code, I think we're OK.
--
Trond Myklebust
Linux NFS client maintainer
NetApp
[email protected]
http://www.netapp.com
Trond Myklebust wrote:
> On Fri, 2008-03-28 at 17:37 -0400, Peter Staubach wrote:
>
>
>> However, I think that nlmclnt_unlock() needs to wait until
>> the RPC is completed.
>>
>
> It should do that now. See the call to rpc_wait_for_completion_task() in
> nlm_async_call()
>
>
Ahh, yes, sorry, was misreading the patch.
>> The original problem was test12() in
>> the Connectathon testsuite, which would occasionally fail.
>> It would fail because the parent would kill the child process
>> (actually the child of the child) and immediately attempt to
>> grab the lock. This would fail because the child hadn't
>> completed releasing the lock yet. There were some timing
>> dependencies in test12() itself, which I eliminated, but then
>> discovered that this wouldn't solve the entire problem. (I
>> can send you the new version of test12(), if you wish.)
>>
>
> So, at least in 2.6.25, the call to rpc_wait_for_completion_task() will
> exit only on a fatal signal. The problem in test12() is that there is a
> 'pre-existing condition', in that the parent signalled us with a SIGINT,
> and so the signal is set upon entry to the function.
>
> IOW: we might have to perform a similar trick to what do_coredump()
> does, and clear the TIF_SIGPENDING flag. I'm not sure if that is
> sufficient, but given that we're eliminating the calls to
> recalc_sigpending(), and that there should be no such calls left in the
> RPC code, I think we're OK.
I suspect that we are okay too, but I will try this and then allow
the RHTS folks to play with it as well. They are the ones seeing
the failures, so hopefully this will make them happy.
Thanx!
ps