Hi Trond,
This set is basically unchanged from the last one, aside from a bit
more cleanup of unneeded arguments in patch #1.
I know that you basically NAKed this set earlier this week. The issue
you saw was that the generic locking codepaths never set the fl_owner
value for flock locks. That's true, but nfs_flock does set this for any
file_lock request that comes through it, so patch #1 is safe to apply
now if you see no other issue with it.
I have a patch queued for v3.16 that makes the generic flock codepaths
set the fl_owner, but that's just cleanup and won't really affect how
this works.
The main problem that I think we need to fix soon, though, is the one that
patch #2 addresses: an unprivileged user can trigger that BUG(), and if
panic_on_oops is set, that's an unprivileged DoS at the very least.
Jeff Layton (3):
nfs4: treat lock owners as opaque values
nfs4: queue free_lock_state job submission to nfsiod
nfs4: turn free_lock_state into a void return operation
fs/nfs/nfs4_fs.h | 26 +++++++-------------
fs/nfs/nfs4proc.c | 14 +++++------
fs/nfs/nfs4state.c | 69 +++++++++++++++++++++---------------------------------
3 files changed, 42 insertions(+), 67 deletions(-)
--
1.9.0
Nothing checks its return value.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 14 +++++++-------
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 75aadf8c0429..f495ff306c4f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
struct nfs_fsinfo *);
- int (*free_lock_state)(struct nfs_server *,
+ void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cc6c3fe3e060..06c1e9e777ab 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5915,7 +5915,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
.rpc_release = nfs4_release_lockowner_release,
};
-static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct nfs_release_lockowner_data *data;
struct rpc_message msg = {
@@ -5923,11 +5924,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
};
if (server->nfs_client->cl_mvops->minor_version != 0)
- return -EINVAL;
+ return;
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
- return -ENOMEM;
+ return;
data->lsp = lsp;
data->server = server;
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5938,7 +5939,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
msg.rpc_resp = &data->res;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
- return 0;
}
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8225,7 +8225,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
return ret;
}
-static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
+static void
+nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
{
struct rpc_task *task;
struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8233,9 +8234,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
nfs4_free_lock_state(server, lsp);
if (IS_ERR(task))
- return PTR_ERR(task);
+ return;
rpc_put_task(task);
- return 0;
}
static bool nfs41_match_stateid(const nfs4_stateid *s1,
--
1.9.0
Do the following set of ops with a file on an NFSv4 mount:
exec 3>>/file/on/nfsv4
flock -x 3
exec 3>&-
You'll see the LOCK request go across the wire, but no LOCKU when the
file is closed.
What happens is that the fd is passed across a fork, and the final close
is done in a different process than the opener. That makes
__nfs4_find_lock_state fail to find the correct lock state, since it
uses the fl_pid as a search key. A new one is created, and the locking
code treats it as a delegation stateid (because NFS_LOCK_INITIALIZED
isn't set).
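To see the same failure mode without the shell, here's a minimal
userspace sketch (illustrative only, not from the original report):
the child takes the flock and exits, and the parent does the final
close. The closing pid never matches the locking pid, but the struct
file is the same in both, which is why keying the owner on fl_owner
works:

#include <fcntl.h>
#include <stdlib.h>
#include <sys/file.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/file/on/nfsv4", O_WRONLY | O_APPEND | O_CREAT, 0644);

	if (fd < 0)
		exit(1);
	if (fork() == 0) {
		/* child: takes the lock -> LOCK goes across the wire */
		flock(fd, LOCK_EX);
		/* lock survives the exit, since the parent still holds
		 * the open file description */
		_exit(0);
	}
	wait(NULL);
	/* final close happens in a different pid than the locker: with
	 * pid-keyed owners there's no matching lock state, so no LOCKU */
	close(fd);
	return 0;
}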
The root cause of this breakage seems to be commit 77041ed9b49a9e
(NFSv4: Ensure the lockowners are labelled using the fl_owner and/or
fl_pid).
That changed it so that flock lockowners are allocated based on the
fl_pid. I think this is incorrect. flock locks should be "owned" by the
struct file, and that is already accounted for in the fl_owner field of
the lock request when it comes through nfs_flock.
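For reference, the relevant bit of nfs_flock() looks roughly like this
(quoted from memory of the v3.15-era code, so treat it as a sketch
rather than an exact citation):

static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	...
	/* We're simulating flock() locks using posix locks on the
	 * server, so key ownership to the struct file, not the pid */
	fl->fl_owner = (fl_owner_t)filp;

	if (fl->fl_type == F_UNLCK)
		return do_unlk(filp, cmd, fl, is_local);
	return do_setlk(filp, cmd, fl, is_local);
}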
This patch basically reverts the above commit. With it applied, a LOCKU
is sent in the above reproducer.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4_fs.h | 13 +------------
fs/nfs/nfs4state.c | 45 +++++++++------------------------------------
2 files changed, 10 insertions(+), 48 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index e1d1badbe53c..3888dd0b43a1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -129,17 +129,6 @@ enum {
* LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
*/
-struct nfs4_lock_owner {
- unsigned int lo_type;
-#define NFS4_ANY_LOCK_TYPE (0U)
-#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
-#define NFS4_POSIX_LOCK_TYPE (1U << 1)
- union {
- fl_owner_t posix_owner;
- pid_t flock_owner;
- } lo_u;
-};
-
struct nfs4_lock_state {
struct list_head ls_locks; /* Other lock stateids */
struct nfs4_state * ls_state; /* Pointer to open state */
@@ -149,7 +138,7 @@ struct nfs4_lock_state {
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
atomic_t ls_count;
- struct nfs4_lock_owner ls_owner;
+ fl_owner_t ls_owner;
};
/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2349518eef2c..f43e5016343e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,21 +787,12 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
* that is compatible with current->files
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *pos;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+ if (pos->ls_owner != fl_owner)
continue;
- switch (pos->ls_owner.lo_type) {
- case NFS4_POSIX_LOCK_TYPE:
- if (pos->ls_owner.lo_u.posix_owner != fl_owner)
- continue;
- break;
- case NFS4_FLOCK_LOCK_TYPE:
- if (pos->ls_owner.lo_u.flock_owner != fl_pid)
- continue;
- }
atomic_inc(&pos->ls_count);
return pos;
}
@@ -813,7 +804,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
{
struct nfs4_lock_state *lsp;
struct nfs_server *server = state->owner->so_server;
@@ -824,17 +815,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
nfs4_init_seqid_counter(&lsp->ls_seqid);
atomic_set(&lsp->ls_count, 1);
lsp->ls_state = state;
- lsp->ls_owner.lo_type = type;
- switch (lsp->ls_owner.lo_type) {
- case NFS4_FLOCK_LOCK_TYPE:
- lsp->ls_owner.lo_u.flock_owner = fl_pid;
- break;
- case NFS4_POSIX_LOCK_TYPE:
- lsp->ls_owner.lo_u.posix_owner = fl_owner;
- break;
- default:
- goto out_free;
- }
+ lsp->ls_owner = fl_owner;
lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
@@ -857,13 +838,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
* exists, return an uninitialized one.
*
*/
-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
struct nfs4_lock_state *lsp, *new = NULL;
for(;;) {
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, owner, pid, type);
+ lsp = __nfs4_find_lock_state(state, owner);
if (lsp != NULL)
break;
if (new != NULL) {
@@ -874,7 +855,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
break;
}
spin_unlock(&state->state_lock);
- new = nfs4_alloc_lock_state(state, owner, pid, type);
+ new = nfs4_alloc_lock_state(state, owner);
if (new == NULL)
return NULL;
}
@@ -935,13 +916,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
if (fl->fl_ops != NULL)
return 0;
- if (fl->fl_flags & FL_POSIX)
- lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
- else if (fl->fl_flags & FL_FLOCK)
- lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
- NFS4_FLOCK_LOCK_TYPE);
- else
- return -EINVAL;
+ lsp = nfs4_get_lock_state(state, fl->fl_owner);
if (lsp == NULL)
return -ENOMEM;
fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +930,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
{
struct nfs4_lock_state *lsp;
fl_owner_t fl_owner;
- pid_t fl_pid;
int ret = -ENOENT;
@@ -966,9 +940,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
goto out;
fl_owner = lockowner->l_owner;
- fl_pid = lockowner->l_pid;
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+ lsp = __nfs4_find_lock_state(state, fl_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
--
1.9.0
We got a report of the following warning in Fedora:
BUG: sleeping function called from invalid context at mm/slub.c:969
in_atomic(): 1, irqs_disabled(): 0, pid: 533, name: bash
3 locks held by bash/533:
#0: (&sp->so_delegreturn_mutex){+.+...}, at: [<ffffffffa033da62>] nfs4_proc_lock+0x262/0x910 [nfsv4]
#1: (&nfsi->rwsem){.+.+.+}, at: [<ffffffffa033da6a>] nfs4_proc_lock+0x26a/0x910 [nfsv4]
#2: (&sb->s_type->i_lock_key#23){+.+...}, at: [<ffffffff812998dc>] flock_lock_file_wait+0x8c/0x3a0
CPU: 0 PID: 533 Comm: bash Not tainted 3.15.0-0.rc1.git1.1.fc21.x86_64 #1
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
0000000000000000 00000000d664ff3c ffff880078b69a70 ffffffff817e82e0
0000000000000000 ffff880078b69a98 ffffffff810cf1a4 0000000000000050
0000000000000050 ffff88007cc01a00 ffff880078b69ad8 ffffffff8121449e
Call Trace:
[<ffffffff817e82e0>] dump_stack+0x4d/0x66
[<ffffffff810cf1a4>] __might_sleep+0x184/0x240
[<ffffffff8121449e>] kmem_cache_alloc_trace+0x4e/0x330
[<ffffffffa0331124>] ? nfs4_release_lockowner+0x74/0x110 [nfsv4]
[<ffffffffa0331124>] nfs4_release_lockowner+0x74/0x110 [nfsv4]
[<ffffffffa0352340>] nfs4_put_lock_state+0x90/0xb0 [nfsv4]
[<ffffffffa0352375>] nfs4_fl_release_lock+0x15/0x20 [nfsv4]
[<ffffffff81297515>] locks_free_lock+0x45/0x90
[<ffffffff8129996c>] flock_lock_file_wait+0x11c/0x3a0
[<ffffffffa033da6a>] ? nfs4_proc_lock+0x26a/0x910 [nfsv4]
[<ffffffffa033301e>] do_vfs_lock+0x1e/0x30 [nfsv4]
[<ffffffffa033da79>] nfs4_proc_lock+0x279/0x910 [nfsv4]
[<ffffffff810dbb26>] ? local_clock+0x16/0x30
[<ffffffff810f5a3f>] ? lock_release_holdtime.part.28+0xf/0x200
[<ffffffffa02f820c>] do_unlk+0x8c/0xc0 [nfs]
[<ffffffffa02f85c5>] nfs_flock+0xa5/0xf0 [nfs]
[<ffffffff8129a6f6>] locks_remove_file+0xb6/0x1e0
[<ffffffff812159d8>] ? kfree+0xd8/0x2d0
[<ffffffff8123bc63>] __fput+0xd3/0x210
[<ffffffff8123bdee>] ____fput+0xe/0x10
[<ffffffff810bfb6d>] task_work_run+0xcd/0xf0
[<ffffffff81019cd1>] do_notify_resume+0x61/0x90
[<ffffffff817fbea2>] int_signal+0x12/0x17
The problem is that NFSv4 is trying to do an allocation from
fl_release_private (in order to send a RELEASE_LOCKOWNER call). That
function can be called while holding the inode->i_lock, and it's
currently set up to do __GFP_WAIT allocations. The v4.1 code has a
similar problem.
This patch adds a work_struct to the nfs4_lock_state and has the code
queue the free_lock_state operation to nfsiod.
Reported-by: Josh Stone <[email protected]>
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4_fs.h | 13 +++++++------
fs/nfs/nfs4state.c | 24 ++++++++++++++++++------
2 files changed, 25 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3888dd0b43a1..75aadf8c0429 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,15 +130,16 @@ enum {
*/
struct nfs4_lock_state {
- struct list_head ls_locks; /* Other lock stateids */
- struct nfs4_state * ls_state; /* Pointer to open state */
+ struct list_head ls_locks; /* Other lock stateids */
+ struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
- unsigned long ls_flags;
+ unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
- nfs4_stateid ls_stateid;
- atomic_t ls_count;
- fl_owner_t ls_owner;
+ nfs4_stateid ls_stateid;
+ atomic_t ls_count;
+ fl_owner_t ls_owner;
+ struct work_struct ls_release;
};
/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f43e5016343e..1919483bcf17 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,6 +799,18 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
return NULL;
}
+static void
+free_lock_state_work(struct work_struct *work)
+{
+ struct nfs4_lock_state *lsp = container_of(work,
+ struct nfs4_lock_state, ls_release);
+ struct nfs4_state *state = lsp->ls_state;
+ struct nfs_server *server = state->owner->so_server;
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+}
+
/*
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
@@ -820,6 +832,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
+ INIT_WORK(&lsp->ls_release, free_lock_state_work);
return lsp;
out_free:
kfree(lsp);
@@ -883,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- server = state->owner->so_server;
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
- struct nfs_client *clp = server->nfs_client;
-
- clp->cl_mvops->free_lock_state(server, lsp);
- } else
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+ queue_work(nfsiod_workqueue, &lsp->ls_release);
+ else {
+ server = state->owner->so_server;
nfs4_free_lock_state(server, lsp);
+ }
}
static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
--
1.9.0
On Thu, 1 May 2014 06:28:44 -0400
Jeff Layton <[email protected]> wrote:
> Hi Trond,
> This set is basically unchanged from the last one, aside from a bit
> more cleanup of unneeded arguments in patch #1.
>
> I know that you basically NAKed this set earlier this week. The issue
> you saw was that the generic locking codepaths never set the fl_owner
> value for flock locks. That's true, but nfs_flock does set this for any
> file_lock request that comes through it, so patch #1 is safe to apply
> now if you see no other issue with it.
>
> I have a patch queued for v3.16 that makes the generic flock codepaths
> set the fl_owner, but that's just cleanup and won't really affect how
> this works.
>
> The main problem that I think we need to fix soon, though, is the one that
> patch #2 addresses: an unprivileged user can trigger that BUG(), and if
> panic_on_oops is set, that's an unprivileged DoS at the very least.
>
> Jeff Layton (3):
> nfs4: treat lock owners as opaque values
> nfs4: queue free_lock_state job submission to nfsiod
> nfs4: turn free_lock_state into a void return operation
>
> fs/nfs/nfs4_fs.h | 26 +++++++-------------
> fs/nfs/nfs4proc.c | 14 +++++------
> fs/nfs/nfs4state.c | 69 +++++++++++++++++++++---------------------------------
> 3 files changed, 42 insertions(+), 67 deletions(-)
>
Hi Trond,
The patch that moves the setting of fl_owner for flock locks into the
generic locking code was merged for v3.16. Are you willing to rescind
your earlier NACK now that that's done?
Thanks,
--
Jeff Layton <[email protected]>
This seems to introduce a fairly easily reproducible oops, which I hit
when running xfstests against a Linux NFSv4.1 server:
generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1]
SMP
[ 2187.042899] Modules linked in:
[ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
[ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 2187.044287] Workqueue: nfsiod free_lock_state_work
[ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
[ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
[ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
[ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
[ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
[ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
[ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
[ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
[ 2187.044287] Stack:
[ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
[ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
[ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
[ 2187.044287] Call Trace:
[ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
[ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP <ffff88007a4efd58>
[ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
Seems like this happens when trying to dereference the state owner in
the work queue:
(gdb) l *(free_lock_state_work+0x16)
0xffffffff81361ca6 is in free_lock_state_work (fs/nfs/nfs4state.c:808).
803 free_lock_state_work(struct work_struct *work)
804 {
805 struct nfs4_lock_state *lsp = container_of(work,
806                         struct nfs4_lock_state, ls_release);
807 struct nfs4_state *state = lsp->ls_state;
808 struct nfs_server *server = state->owner->so_server;
809 struct nfs_client *clp = server->nfs_client;
810
811 clp->cl_mvops->free_lock_state(server, lsp);
812 }
> the openowner. We don't have a pointer directly to that, so we're
> probably best off just holding a reference to the open stateid, and
> pinning the superblock too.
>
> Maybe something like this instead? I'm also running xfstests on a VM
> now to see if I can reproduce this and verify the fix:
This version has survived several xfstests runs so far, so it seems
good. I'll keep running it a bit longer.
I managed to hit a similar but different issue with your patch applied (note
the slab poisoning pattern in rax):
generic/089 409s ...[ 399.057379] general protection fault: 0000 [#1]
SMP
[ 399.059137] Modules linked in:
[ 399.060089] CPU: 2 PID: 4367 Comm: kworker/2:2 Not tainted 3.16.0-rc6+ #1153
[ 399.060617] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 399.060617] Workqueue: nfsiod free_lock_state_work
[ 399.060617] task: ffff88007ab68810 ti: ffff88007c3b4000 task.ti: ffff88007c3b4000
[ 399.060617] RIP: 0010:[<ffffffff8134e5bb>] [<ffffffff8134e5bb>] nfs41_free_lock_state+0x2b/0x70
[ 399.060617] RSP: 0018:ffff88007c3b7d18 EFLAGS: 00010286
[ 399.060617] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007cdd3800 RCX: 0000000000000000
[ 399.060617] RDX: ffffffff81e04c60 RSI: ffff88007cdd39a0 RDI: ffff880079e5a000
[ 399.060617] RBP: ffff88007c3b7d38 R08: ffffffff832df6d0 R09: 000001c90f100000
[ 399.060617] R10: 0000000000000000 R11: 00000000000656f0 R12: ffff880079e5a000
[ 399.060617] R13: ffff88007fd18b00 R14: ffff88007cdd39c0 R15: 0000000000000000
[ 399.060617] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
[ 399.060617] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 399.060617] CR2: 00007f5ac2f56800 CR3: 000000007a95b000 CR4: 00000000000006e0
[ 399.060617] Stack:
[ 399.060617] 000000007fd13240 ffff88007a8f7800 ffff88007fd13240 ffff88007fd18b00
[ 399.060617] ffff88007c3b7d58 ffffffff813621ae ffff88007cdd39c0 ffff88007a4d0c40
[ 399.060617] ffff88007c3b7dd8 ffffffff810cc877 ffffffff810cc80d ffff88007fd13258
[ 399.060617] Call Trace:
[ 399.060617] [<ffffffff813621ae>] free_lock_state_work+0x2e/0x40
[ 399.060617] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 399.060617] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 399.060617] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 399.060617] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 399.060617] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 399.060617] [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 399.060617] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
(gdb) l *(nfs41_free_lock_state+0x2b)
0xffffffff8134e5bb is in nfs41_free_lock_state (fs/nfs/nfs4proc.c:8313).
8308 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
8309 {
8310 struct rpc_task *task;
8311 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
8312
8313                 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
8314 nfs4_free_lock_state(server, lsp);
8315 if (IS_ERR(task))
8316 return;
8317 rpc_put_task(task);
On Mon, Aug 11, 2014 at 1:35 PM, Jeff Layton
<[email protected]> wrote:
> On Mon, 11 Aug 2014 12:47:48 -0400
> Trond Myklebust <[email protected]> wrote:
>
>> On Mon, Aug 11, 2014 at 11:35 AM, Jeff Layton
>> <[email protected]> wrote:
>> > On Mon, 11 Aug 2014 08:09:24 -0700
>> > Christoph Hellwig <[email protected]> wrote:
>> >
>> >>
>> >> I managed to hit a similar but different issue with your patch applied (note
>> >> the slab poisoning pattern in rax):
>> >>
>> >> generic/089 409s ...[ 399.057379] general protection fault: 0000 [#1]
>> >> SMP
>> >> [ 399.059137] Modules linked in:
>> >> [ 399.060089] CPU: 2 PID: 4367 Comm: kworker/2:2 Not tainted 3.16.0-rc6+ #1153
>> >> [ 399.060617] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
>> >> [ 399.060617] Workqueue: nfsiod free_lock_state_work
>> >> [ 399.060617] task: ffff88007ab68810 ti: ffff88007c3b4000 task.ti: ffff88007c3b4000
>> >> [ 399.060617] RIP: 0010:[<ffffffff8134e5bb>] [<ffffffff8134e5bb>] nfs41_free_lock_state+0x2b/0x70
>> >> [ 399.060617] RSP: 0018:ffff88007c3b7d18 EFLAGS: 00010286
>> >> [ 399.060617] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007cdd3800 RCX: 0000000000000000
>> >> [ 399.060617] RDX: ffffffff81e04c60 RSI: ffff88007cdd39a0 RDI: ffff880079e5a000
>> >> [ 399.060617] RBP: ffff88007c3b7d38 R08: ffffffff832df6d0 R09: 000001c90f100000
>> >> [ 399.060617] R10: 0000000000000000 R11: 00000000000656f0 R12: ffff880079e5a000
>> >> [ 399.060617] R13: ffff88007fd18b00 R14: ffff88007cdd39c0 R15: 0000000000000000
>> >> [ 399.060617] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
>> >> [ 399.060617] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> >> [ 399.060617] CR2: 00007f5ac2f56800 CR3: 000000007a95b000 CR4: 00000000000006e0
>> >> [ 399.060617] Stack:
>> >> [ 399.060617] 000000007fd13240 ffff88007a8f7800 ffff88007fd13240 ffff88007fd18b00
>> >> [ 399.060617] ffff88007c3b7d58 ffffffff813621ae ffff88007cdd39c0 ffff88007a4d0c40
>> >> [ 399.060617] ffff88007c3b7dd8 ffffffff810cc877 ffffffff810cc80d ffff88007fd13258
>> >> [ 399.060617] Call Trace:
>> >> [ 399.060617] [<ffffffff813621ae>] free_lock_state_work+0x2e/0x40
>> >> [ 399.060617] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
>> >> [ 399.060617] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
>> >> [ 399.060617] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
>> >> [ 399.060617] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
>> >> [ 399.060617] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
>> >> [ 399.060617] [<ffffffff810d3c6f>] kthread+0xdf/0x100
>> >> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>> >> [ 399.060617] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
>> >> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>> >>
>> >> (gdb) l *(nfs41_free_lock_state+0x2b)
>> >> 0xffffffff8134e5bb is in nfs41_free_lock_state (fs/nfs/nfs4proc.c:8313).
>> >> 8308 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
>> >> 8309 {
>> >> 8310 struct rpc_task *task;
>> >> 8311 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
>> >> 8312
>> >> 8313                 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
>> >> 8314 nfs4_free_lock_state(server, lsp);
>> >> 8315 if (IS_ERR(task))
>> >> 8316 return;
>> >> 8317 rpc_put_task(task);
>> >>
>> >
>> > Oof -- right. Same problem, just in a different spot. So there, we need
>> > the openowner. We don't have a pointer directly to that, so we're
>> > probably best off just holding a reference to the open stateid, and
>> > pinning the superblock too.
>> >
>> > Maybe something like this instead? I'm also running xfstests on a VM
>> > now to see if I can reproduce this and verify the fix:
>> >
>> > ---------------------[snip]-----------------------
>> >
>> > [PATCH] nfs: pin superblock and open state when running free_lock_state operations
>> >
>> > Christoph reported seeing this oops when running xfstests on a v4.1 client:
>> >
>> > generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
>> > [ 2187.042899] Modules linked in:
>> > [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
>> > [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
>> > [ 2187.044287] Workqueue: nfsiod free_lock_state_work
>> > [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
>> > [ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
>> > [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
>> > [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
>> > [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
>> > [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
>> > [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
>> > [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
>> > [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
>> > [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> > [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
>> > [ 2187.044287] Stack:
>> > [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
>> > [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
>> > [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
>> > [ 2187.044287] Call Trace:
>> > [ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
>> > [ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
>> > [ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
>> > [ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
>> > [ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
>> > [ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
>> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>> > [ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
>> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>> > [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
>> > [ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
>> > [ 2187.044287] RSP <ffff88007a4efd58>
>> > [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
>> >
>> > It appears that the lock state outlived the open state with which it
>> > was associated.
>> >
>> > Fix this by pinning down the open stateid and the superblock prior to
>> > queueing the workqueue job, and then putting those references
>> > once the RPC task has been queued.
>> >
>> > Reported-by: Christoph Hellwig <[email protected]>
>> > Signed-off-by: Jeff Layton <[email protected]>
>> > ---
>> > fs/nfs/nfs4state.c | 13 +++++++++----
>> > 1 file changed, 9 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
>> > index a770c8e469a7..fb29c7cbe8e3 100644
>> > --- a/fs/nfs/nfs4state.c
>> > +++ b/fs/nfs/nfs4state.c
>> > @@ -804,11 +804,14 @@ free_lock_state_work(struct work_struct *work)
>> > {
>> > struct nfs4_lock_state *lsp = container_of(work,
>> > struct nfs4_lock_state, ls_release);
>> > - struct nfs4_state *state = lsp->ls_state;
>> > - struct nfs_server *server = state->owner->so_server;
>> > + struct nfs4_state *osp = lsp->ls_state;
>> > + struct super_block *sb = osp->inode->i_sb;
>> > + struct nfs_server *server = NFS_SB(sb);
>> > struct nfs_client *clp = server->nfs_client;
>> >
>> > clp->cl_mvops->free_lock_state(server, lsp);
>> > + nfs4_put_open_state(osp);
>> > + nfs_sb_deactive(sb);
>> > }
>> >
>> > /*
>> > @@ -896,9 +899,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
>> > if (list_empty(&state->lock_states))
>> > clear_bit(LK_STATE_IN_USE, &state->flags);
>> > spin_unlock(&state->state_lock);
>> > - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
>> > + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
>> > + atomic_inc(&lsp->ls_state->count);
>> > + nfs_sb_active(lsp->ls_state->inode->i_sb);
>> > queue_work(nfsiod_workqueue, &lsp->ls_release);
>> > - else {
>> > + } else {
>> > server = state->owner->so_server;
>> > nfs4_free_lock_state(server, lsp);
>> > }
>> > --
>> > 1.9.3
>> >
>>
>> Can we step back a little? It looks to me as if we're working on a
>> whole new range of symptoms that are a consequence of a set of locking
>> rule changes for fl_release_private that were not well thought
>> through.
>> Prior to commit 72f98e72551fa, the locking rules for
>> fl_release_private stated that it _does_ allow blocking.
>> Nothing in commit 72f98e72551fa was done to change the code that did
>> block; instead, it just decreed that fl_release_private no longer
>> allows blocking, as if that magically makes things work.
>>
>> This whole thing of doing an asynchronous call started because
>> locks_delete_lock() is calling locks_free_lock() instead of just
>> unlinking and deferring the actual freeing of the locks until
>> we've dropped the spinlocks.
>>
>> It should be trivial to change locks_delete_lock() so that after
>> calling locks_unlink_lock(), it adds to a private list that can then
>> be drained at the end of flock_lock_file(), __posix_lock_file(), and
>> locks_remove_file().
>> The lease code looks like it may need to be special cased
>> (lease_modify()), but that can just keep doing what it is currently
>> doing until someone fixes it.
>>
>> Pretty please....
>>
>
> Ok. It's not quite that simple but it should be doable...
>
> locks_release_private is also called by locks_copy_lock, and I don't
> see a good way to defer that call. We might be able to just get rid of
> it though, and just require a "pristine" lock be passed to
> locks_copy_lock. It looks like that's already done in most cases anyway
> (maybe all cases).
It is definitely the case that all existing callers are using pristine locks.
> I've already been making some motions toward doing a lock pushdown in
> the lease code anyway, so this may just give me a real reason to finish
> that up...
Ack; however, please note that neither NFS nor AFS support leases, and
since they are the only filesystems that set
fl_ops->fl_release_private, you only need to consider the lock manager
fl_lmops->fl_release_private callers for now.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Mon, Aug 11, 2014 at 11:35 AM, Jeff Layton
<[email protected]> wrote:
> On Mon, 11 Aug 2014 08:09:24 -0700
> Christoph Hellwig <[email protected]> wrote:
>
>>
>> I managed to hit a similar but different issue with your patch applied (note
>> the slab poisoning pattern in rax):
>>
>> generic/089 409s ...[ 399.057379] general protection fault: 0000 [#1]
>> SMP
>> [ 399.059137] Modules linked in:
>> [ 399.060089] CPU: 2 PID: 4367 Comm: kworker/2:2 Not tainted 3.16.0-rc6+ #1153
>> [ 399.060617] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
>> [ 399.060617] Workqueue: nfsiod free_lock_state_work
>> [ 399.060617] task: ffff88007ab68810 ti: ffff88007c3b4000 task.ti: ffff88007c3b4000
>> [ 399.060617] RIP: 0010:[<ffffffff8134e5bb>] [<ffffffff8134e5bb>] nfs41_free_lock_state+0x2b/0x70
>> [ 399.060617] RSP: 0018:ffff88007c3b7d18 EFLAGS: 00010286
>> [ 399.060617] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007cdd3800 RCX: 0000000000000000
>> [ 399.060617] RDX: ffffffff81e04c60 RSI: ffff88007cdd39a0 RDI: ffff880079e5a000
>> [ 399.060617] RBP: ffff88007c3b7d38 R08: ffffffff832df6d0 R09: 000001c90f100000
>> [ 399.060617] R10: 0000000000000000 R11: 00000000000656f0 R12: ffff880079e5a000
>> [ 399.060617] R13: ffff88007fd18b00 R14: ffff88007cdd39c0 R15: 0000000000000000
>> [ 399.060617] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
>> [ 399.060617] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>> [ 399.060617] CR2: 00007f5ac2f56800 CR3: 000000007a95b000 CR4: 00000000000006e0
>> [ 399.060617] Stack:
>> [ 399.060617] 000000007fd13240 ffff88007a8f7800 ffff88007fd13240 ffff88007fd18b00
>> [ 399.060617] ffff88007c3b7d58 ffffffff813621ae ffff88007cdd39c0 ffff88007a4d0c40
>> [ 399.060617] ffff88007c3b7dd8 ffffffff810cc877 ffffffff810cc80d ffff88007fd13258
>> [ 399.060617] Call Trace:
>> [ 399.060617] [<ffffffff813621ae>] free_lock_state_work+0x2e/0x40
>> [ 399.060617] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
>> [ 399.060617] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
>> [ 399.060617] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
>> [ 399.060617] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
>> [ 399.060617] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
>> [ 399.060617] [<ffffffff810d3c6f>] kthread+0xdf/0x100
>> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>> [ 399.060617] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
>> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>>
>> (gdb) l *(nfs41_free_lock_state+0x2b)
>> 0xffffffff8134e5bb is in nfs41_free_lock_state (fs/nfs/nfs4proc.c:8313).
>> 8308 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
>> 8309 {
>> 8310 struct rpc_task *task;
>> 8311 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
>> 8312
>> 8313                 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
>> 8314 nfs4_free_lock_state(server, lsp);
>> 8315 if (IS_ERR(task))
>> 8316 return;
>> 8317 rpc_put_task(task);
>>
>
> Oof -- right. Same problem, just in a different spot. So there, we need
> the openowner. We don't have a pointer directly to that, so we're
> probably best off just holding a reference to the open stateid, and
> pinning the superblock too.
>
> Maybe something like this instead? I'm also running xfstests on a VM
> now to see if I can reproduce this and verify the fix:
>
> ---------------------[snip]-----------------------
>
> [PATCH] nfs: pin superblock and open state when running free_lock_state operations
>
> Christoph reported seeing this oops when running xfstests on a v4.1 client:
>
> generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
> [ 2187.042899] Modules linked in:
> [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
> [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> [ 2187.044287] Workqueue: nfsiod free_lock_state_work
> [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
> [ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
> [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
> [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
> [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
> [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
> [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
> [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
> [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
> [ 2187.044287] Stack:
> [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
> [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
> [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
> [ 2187.044287] Call Trace:
> [ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> [ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> [ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> [ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> [ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> [ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> [ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
> [ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> [ 2187.044287] RSP <ffff88007a4efd58>
> [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
>
> It appears that the lock state outlived the open state with which it
> was associated.
>
> Fix this by pinning down the open stateid and the superblock prior to
> queueing the workqueue job, and then putting those references
> once the RPC task has been queued.
>
> Reported-by: Christoph Hellwig <[email protected]>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfs/nfs4state.c | 13 +++++++++----
> 1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> index a770c8e469a7..fb29c7cbe8e3 100644
> --- a/fs/nfs/nfs4state.c
> +++ b/fs/nfs/nfs4state.c
> @@ -804,11 +804,14 @@ free_lock_state_work(struct work_struct *work)
> {
> struct nfs4_lock_state *lsp = container_of(work,
> struct nfs4_lock_state, ls_release);
> - struct nfs4_state *state = lsp->ls_state;
> - struct nfs_server *server = state->owner->so_server;
> + struct nfs4_state *osp = lsp->ls_state;
> + struct super_block *sb = osp->inode->i_sb;
> + struct nfs_server *server = NFS_SB(sb);
> struct nfs_client *clp = server->nfs_client;
>
> clp->cl_mvops->free_lock_state(server, lsp);
> + nfs4_put_open_state(osp);
> + nfs_sb_deactive(sb);
> }
>
> /*
> @@ -896,9 +899,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
> if (list_empty(&state->lock_states))
> clear_bit(LK_STATE_IN_USE, &state->flags);
> spin_unlock(&state->state_lock);
> - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
> + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
> + atomic_inc(&lsp->ls_state->count);
> + nfs_sb_active(lsp->ls_state->inode->i_sb);
> queue_work(nfsiod_workqueue, &lsp->ls_release);
> - else {
> + } else {
> server = state->owner->so_server;
> nfs4_free_lock_state(server, lsp);
> }
> --
> 1.9.3
>
Can we step back a little? It looks to me as if we're working on a
whole new range of symptoms that are a consequence of a set of locking
rule changes for fl_release_private that were not well thought
through.
Prior to commit 72f98e72551fa, the locking rules for
fl_release_private stated that it _does_ allow blocking.
Nothing in commit 72f98e72551fa was done to change the code that did
block; instead, it just decreed that fl_release_private no longer
allows blocking, as if that magically makes things work.
This whole thing of doing an asynchronous call started because
locks_delete_lock() is calling locks_free_lock() instead of just
unlinking and deferring the actual freeing of the locks until
we've dropped the spinlocks.
It should be trivial to change locks_delete_lock() so that after
calling locks_unlink_lock(), it adds to a private list that can then
be drained at the end of flock_lock_file(), __posix_lock_file(), and
locks_remove_file().
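Roughly this shape, perhaps (untested sketch, just to illustrate the
idea): unlink under the spinlock, and defer the freeing to a private
list that the caller drains once the lock has been dropped. fl_block is
reused as the list anchor here on the assumption that the lock is off
all other lists once it's unlinked:

static void locks_delete_lock(struct file_lock **thisfl_p,
			      struct list_head *dispose)
{
	struct file_lock *fl = *thisfl_p;

	locks_unlink_lock(thisfl_p);
	if (dispose)
		list_add(&fl->fl_block, dispose);
	else
		locks_free_lock(fl);
}

/* drained by flock_lock_file() et al. after dropping the i_lock */
static void locks_dispose_list(struct list_head *dispose)
{
	struct file_lock *fl;

	while (!list_empty(dispose)) {
		fl = list_first_entry(dispose, struct file_lock,
				      fl_block);
		list_del_init(&fl->fl_block);
		/* fl_release_private can now block safely */
		locks_free_lock(fl);
	}
}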
The lease code looks like it may need to be special cased
(lease_modify()), but that can just keep doing what it is currently
doing until someone fixes it.
Pretty please....
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Mon, 11 Aug 2014 12:47:48 -0400
Trond Myklebust <[email protected]> wrote:
> On Mon, Aug 11, 2014 at 11:35 AM, Jeff Layton
> <[email protected]> wrote:
> > On Mon, 11 Aug 2014 08:09:24 -0700
> > Christoph Hellwig <[email protected]> wrote:
> >
> >>
> >> I managed to hit a similar but different issue with your patch applied (note
> >> the slab poisoning pattern in rax):
> >>
> >> generic/089 409s ...[ 399.057379] general protection fault: 0000 [#1]
> >> SMP
> >> [ 399.059137] Modules linked in:
> >> [ 399.060089] CPU: 2 PID: 4367 Comm: kworker/2:2 Not tainted 3.16.0-rc6+ #1153
> >> [ 399.060617] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> >> [ 399.060617] Workqueue: nfsiod free_lock_state_work
> >> [ 399.060617] task: ffff88007ab68810 ti: ffff88007c3b4000 task.ti: ffff88007c3b4000
> >> [ 399.060617] RIP: 0010:[<ffffffff8134e5bb>] [<ffffffff8134e5bb>] nfs41_free_lock_state+0x2b/0x70
> >> [ 399.060617] RSP: 0018:ffff88007c3b7d18 EFLAGS: 00010286
> >> [ 399.060617] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007cdd3800 RCX: 0000000000000000
> >> [ 399.060617] RDX: ffffffff81e04c60 RSI: ffff88007cdd39a0 RDI: ffff880079e5a000
> >> [ 399.060617] RBP: ffff88007c3b7d38 R08: ffffffff832df6d0 R09: 000001c90f100000
> >> [ 399.060617] R10: 0000000000000000 R11: 00000000000656f0 R12: ffff880079e5a000
> >> [ 399.060617] R13: ffff88007fd18b00 R14: ffff88007cdd39c0 R15: 0000000000000000
> >> [ 399.060617] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
> >> [ 399.060617] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> >> [ 399.060617] CR2: 00007f5ac2f56800 CR3: 000000007a95b000 CR4: 00000000000006e0
> >> [ 399.060617] Stack:
> >> [ 399.060617] 000000007fd13240 ffff88007a8f7800 ffff88007fd13240 ffff88007fd18b00
> >> [ 399.060617] ffff88007c3b7d58 ffffffff813621ae ffff88007cdd39c0 ffff88007a4d0c40
> >> [ 399.060617] ffff88007c3b7dd8 ffffffff810cc877 ffffffff810cc80d ffff88007fd13258
> >> [ 399.060617] Call Trace:
> >> [ 399.060617] [<ffffffff813621ae>] free_lock_state_work+0x2e/0x40
> >> [ 399.060617] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> >> [ 399.060617] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> >> [ 399.060617] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> >> [ 399.060617] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> >> [ 399.060617] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> >> [ 399.060617] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> >> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> >> [ 399.060617] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> >> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> >>
> >> (gdb) l *(nfs41_free_lock_state+0x2b)
> >> 0xffffffff8134e5bb is in nfs41_free_lock_state (fs/nfs/nfs4proc.c:8313).
> >> 8308 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
> >> 8309 {
> >> 8310 struct rpc_task *task;
> >> 8311 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
> >> 8312
> >> 8313                 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
> >> 8314 nfs4_free_lock_state(server, lsp);
> >> 8315 if (IS_ERR(task))
> >> 8316 return;
> >> 8317 rpc_put_task(task);
> >>
> >
> > Oof -- right. Same problem, just in a different spot. So there, we need
> > the openowner. We don't have a pointer directly to that, so we're
> > probably best off just holding a reference to the open stateid, and
> > pinning the superblock too.
> >
> > Maybe something like this instead? I'm also running xfstests on a VM
> > now to see if I can reproduce this and verify the fix:
> >
> > ---------------------[snip]-----------------------
> >
> > [PATCH] nfs: pin superblock and open state when running free_lock_state operations
> >
> > Christoph reported seeing this oops when running xfstests on a v4.1 client:
> >
> > generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
> > [ 2187.042899] Modules linked in:
> > [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
> > [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> > [ 2187.044287] Workqueue: nfsiod free_lock_state_work
> > [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
> > [ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> > [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
> > [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
> > [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
> > [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
> > [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
> > [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
> > [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
> > [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
> > [ 2187.044287] Stack:
> > [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
> > [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
> > [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
> > [ 2187.044287] Call Trace:
> > [ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> > [ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> > [ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> > [ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> > [ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> > [ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> > [ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> > [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
> > [ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> > [ 2187.044287] RSP <ffff88007a4efd58>
> > [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
> >
> > It appears that the lock state outlived the open state with which it
> > was associated.
> >
> > Fix this by pinning down the open stateid and the superblock prior to
> > queueing the workqueue job, and then putting those references
> > once the RPC task has been queued.
> >
> > Reported-by: Christoph Hellwig <[email protected]>
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> > fs/nfs/nfs4state.c | 13 +++++++++----
> > 1 file changed, 9 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
> > index a770c8e469a7..fb29c7cbe8e3 100644
> > --- a/fs/nfs/nfs4state.c
> > +++ b/fs/nfs/nfs4state.c
> > @@ -804,11 +804,14 @@ free_lock_state_work(struct work_struct *work)
> > {
> > struct nfs4_lock_state *lsp = container_of(work,
> > struct nfs4_lock_state, ls_release);
> > - struct nfs4_state *state = lsp->ls_state;
> > - struct nfs_server *server = state->owner->so_server;
> > + struct nfs4_state *osp = lsp->ls_state;
> > + struct super_block *sb = osp->inode->i_sb;
> > + struct nfs_server *server = NFS_SB(sb);
> > struct nfs_client *clp = server->nfs_client;
> >
> > clp->cl_mvops->free_lock_state(server, lsp);
> > + nfs4_put_open_state(osp);
> > + nfs_sb_deactive(sb);
> > }
> >
> > /*
> > @@ -896,9 +899,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
> > if (list_empty(&state->lock_states))
> > clear_bit(LK_STATE_IN_USE, &state->flags);
> > spin_unlock(&state->state_lock);
> > - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
> > + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
> > + atomic_inc(&lsp->ls_state->count);
> > + nfs_sb_active(lsp->ls_state->inode->i_sb);
> > queue_work(nfsiod_workqueue, &lsp->ls_release);
> > - else {
> > + } else {
> > server = state->owner->so_server;
> > nfs4_free_lock_state(server, lsp);
> > }
> > --
> > 1.9.3
> >
>
> Can we step back a little? It looks to me as if we're working on a
> whole new range of symptoms that are a consequence of a set of locking
> rule changes for fl_release_private that were not well thought
> through.
> Prior to commit 72f98e72551fa, the locking rules for
> fl_release_private stated that it _does_ allow blocking.
> Nothing in commit 72f98e72551fa was done to change the code that did
> block; instead, it just decreed that fl_release_private no longer
> allows blocking, as if that magically makes things work.
>
> This whole thing of doing an asynchronous call started because
> locks_delete_lock() is calling locks_free_lock() instead of just
> unlinking and deferring the actual freeing of the locks until
> we've dropped the spinlocks.
>
> It should be trivial to change locks_delete_lock() so that after
> calling locks_unlink_lock(), it adds to a private list that can then
> be drained at the end of flock_lock_file(), __posix_lock_file(), and
> locks_remove_file().
> The lease code looks like it may need to be special cased
> (lease_modify()), but that can just keep doing what it is currently
> doing until someone fixes it.
>
> Pretty please....
>
Ok. It's not quite that simple but it should be doable...
locks_release_private is also called by locks_copy_lock, and I don't
see a good way to defer that call. We might be able to just get rid of
it though, and just require a "pristine" lock be passed to
locks_copy_lock. It looks like that's already done in most cases anyway
(maybe all cases).
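If we can require that, then locks_copy_lock() could stop calling
locks_release_private() and just assert that the destination is
pristine. Something like this (sketch only, assuming all callers really
do pass pristine locks):

void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
	/* was: locks_release_private(new); -- a pristine lock has
	 * no private state to release */
	WARN_ON_ONCE(new->fl_ops != NULL);
	WARN_ON_ONCE(new->fl_lmops != NULL);

	/* ...then copy the fields over as we do today... */
}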
I've already been making some motions toward doing a lock pushdown in
the lease code anyway, so this may just give me a real reason to finish
that up...
--
Jeff Layton <[email protected]>
On Mon, 11 Aug 2014 07:50:13 -0400
Jeff Layton <[email protected]> wrote:
> On Mon, 11 Aug 2014 03:42:53 -0700
> Christoph Hellwig <[email protected]> wrote:
>
> > This seems to introduce a fairly easily reproducible oops, which I hit
> > when running xfstests against a Linux NFSv4.1 server:
> >
> > generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1]
> > SMP
> > [ 2187.042899] Modules linked in:
> > [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
> > [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> > [ 2187.044287] Workqueue: nfsiod free_lock_state_work
> > [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
> > [ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> > [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
> > [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
> > [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
> > [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
> > [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
> > [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
> > [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
> > [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> > [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
> > [ 2187.044287] Stack:
> > [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
> > [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
> > [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
> > [ 2187.044287] Call Trace:
> > [ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> > [ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> > [ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> > [ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> > [ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> > [ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> > [ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> > [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> > [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
> > [ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> > [ 2187.044287] RSP <ffff88007a4efd58>
> > [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
> >
> > Seems like this happens when trying to dereference the state owner in
> > the work queue:
> >
> > (gdb) l *(free_lock_state_work+0x16)
> > 0xffffffff81361ca6 is in free_lock_state_work (fs/nfs/nfs4state.c:808).
> > 803 free_lock_state_work(struct work_struct *work)
> > 804 {
> > 805 struct nfs4_lock_state *lsp = container_of(work,
> > 806                         struct nfs4_lock_state, ls_release);
> > 807 struct nfs4_state *state = lsp->ls_state;
> > 808 struct nfs_server *server = state->owner->so_server;
> > 809 struct nfs_client *clp = server->nfs_client;
> > 810
> > 811 clp->cl_mvops->free_lock_state(server, lsp);
> > 812 }
> >
>
> Thanks for the report, Christoph.
>
> I suspect what's happening is that the lock state is outliving the open
> state that it's associated with. That does look to be possible now that
> we're queueing ->free_lock_state to a workqueue.
>
> It looks like the simplest fix would be to have a lock state hold a
> reference to the open state, but that could be problematic too. The
> open stateid holds an inode reference, but not necessarily one to the
> superblock. I think you could end up closing a file and unmounting
> before the work runs.
>
> Mostly we just need the server pointer which implies that we need to
> pin the superblock somehow until the workqueue job is done. Should we
> just add a sb pointer to nfs4_lock_state, and call nfs_sb_active before
> queueing the work and then call nfs_sb_deactive on completion?
>
Christoph,
Does this patch fix it? Note that I haven't yet tested it, but it does
build. I'll see if I can reproduce this later today and test it out.
I'm also not sure whether it's really the _right_ solution, but I
believe it should fix the problem.
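Since the fix below leans on nfs_sb_active/nfs_sb_deactive, here's
roughly what those helpers do -- a simplified sketch from memory of
fs/nfs/super.c, so treat the details as approximate:

	/* Take an active reference on the superblock so the nfs_server
	 * can't be torn down while the work item is outstanding. */
	void nfs_sb_active(struct super_block *sb)
	{
		struct nfs_server *server = NFS_SB(sb);

		if (atomic_inc_return(&server->active) == 1)
			atomic_inc(&sb->s_active);
	}

	/* Drop it; the last put releases the superblock pin and lets
	 * an unmount proceed. */
	void nfs_sb_deactive(struct super_block *sb)
	{
		struct nfs_server *server = NFS_SB(sb);

		if (atomic_dec_and_test(&server->active))
			deactivate_super(sb);
	}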
----------------------[snip]------------------------
[PATCH] nfs: pin superblock when running free_lock_state operations
Christoph reported seeing this oops when running xfstests on a v4.1 client:
generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
[ 2187.042899] Modules linked in:
[ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
[ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 2187.044287] Workqueue: nfsiod free_lock_state_work
[ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
[ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
[ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
[ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
[ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
[ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
[ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
[ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
[ 2187.044287] Stack:
[ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
[ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
[ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
[ 2187.044287] Call Trace:
[ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
[ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP <ffff88007a4efd58>
[ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
It appears that the lock state outlived the open state with which it
was associated. We don't actually need to get to the open state -- it
was just a convenient way to get to the server pointer. We do, however,
need to ensure that the nfs_server doesn't go away until the workqueue
job has run.
Fix this by pinning down the sb via nfs_sb_active prior to queueing the
workqueue job, and then calling nfs_sb_deactive after the
free_lock_state operation has completed. Once the rpc_task is queued,
we no longer need to hold the sb reference.
Reported-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4_fs.h | 1 +
fs/nfs/nfs4state.c | 11 +++++++----
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..6eee1fd6b4f1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -135,6 +135,7 @@ struct nfs4_lock_state {
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
unsigned long ls_flags;
+ struct super_block *ls_sb;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
atomic_t ls_count;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a770c8e469a7..0a5da4bedbb2 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -804,11 +804,12 @@ free_lock_state_work(struct work_struct *work)
{
struct nfs4_lock_state *lsp = container_of(work,
struct nfs4_lock_state, ls_release);
- struct nfs4_state *state = lsp->ls_state;
- struct nfs_server *server = state->owner->so_server;
+ struct super_block *sb = lsp->ls_sb;
+ struct nfs_server *server = NFS_SB(sb);
struct nfs_client *clp = server->nfs_client;
clp->cl_mvops->free_lock_state(server, lsp);
+ nfs_sb_deactive(sb);
}
/*
@@ -896,9 +897,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ lsp->ls_sb = lsp->ls_state->inode->i_sb;
+ nfs_sb_active(lsp->ls_sb);
queue_work(nfsiod_workqueue, &lsp->ls_release);
- else {
+ } else {
server = state->owner->so_server;
nfs4_free_lock_state(server, lsp);
}
--
1.9.3
--
Jeff Layton <[email protected]>
On Mon, 11 Aug 2014 03:42:53 -0700
Christoph Hellwig <[email protected]> wrote:
> This seems to introduce a fairly easily reproducible oops, which I can
> reproduce when running xfstests against a Linux NFSv4.1 server:
>
> generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
> [ 2187.042899] Modules linked in:
> [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
> [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> [ 2187.044287] Workqueue: nfsiod free_lock_state_work
> [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
> [ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
> [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
> [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
> [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
> [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
> [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
> [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
> [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
> [ 2187.044287] Stack:
> [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
> [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
> [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
> [ 2187.044287] Call Trace:
> [ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> [ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> [ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> [ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> [ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> [ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> [ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> [ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
> [ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
> [ 2187.044287] RSP <ffff88007a4efd58>
> [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
>
> seems like this happens when trying to dereference the state owner in
> the work queue:
>
> (gdb) l *(free_lock_state_work+0x16)
> 0xffffffff81361ca6 is in free_lock_state_work (fs/nfs/nfs4state.c:808).
> 803 free_lock_state_work(struct work_struct *work)
> 804 {
> 805 struct nfs4_lock_state *lsp = container_of(work,
> 806 struct nfs4_lock_state, ls_release);
> 807 struct nfs4_state *state = lsp->ls_state;
> 808 struct nfs_server *server = state->owner->so_server;
> 809 struct nfs_client *clp = server->nfs_client;
> 810
> 811 clp->cl_mvops->free_lock_state(server, lsp);
> 812 }
>
Thanks for the report, Christoph.
I suspect what's happening is that the lock state is outliving the open
state that it's associated with. That does look to be possible now that
we're queueing ->free_lock_state to a workqueue.
It looks like the simplest fix would be to have a lock state hold a
reference to the open state, but that could be problematic too. The
open stateid holds an inode reference, but not necessarily one to the
superblock. I think you could end up closing a file and unmounting
before the work runs.
Mostly we just need the server pointer, which implies that we need to
pin the superblock somehow until the workqueue job is done. Should we
just add a sb pointer to nfs4_lock_state, and call nfs_sb_active before
queueing the work and then call nfs_sb_deactive on completion?
--
Jeff Layton <[email protected]>
On Mon, 11 Aug 2014 08:09:24 -0700
Christoph Hellwig <[email protected]> wrote:
>
> I managed to hit a similar but different issue with your patch applied (note
> the slab poisoning pattern in rax):
>
> generic/089 409s ...[ 399.057379] general protection fault: 0000 [#1] SMP
> [ 399.059137] Modules linked in:
> [ 399.060089] CPU: 2 PID: 4367 Comm: kworker/2:2 Not tainted 3.16.0-rc6+ #1153
> [ 399.060617] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
> [ 399.060617] Workqueue: nfsiod free_lock_state_work
> [ 399.060617] task: ffff88007ab68810 ti: ffff88007c3b4000 task.ti: ffff88007c3b4000
> [ 399.060617] RIP: 0010:[<ffffffff8134e5bb>] [<ffffffff8134e5bb>] nfs41_free_lock_state+0x2b/0x70
> [ 399.060617] RSP: 0018:ffff88007c3b7d18 EFLAGS: 00010286
> [ 399.060617] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007cdd3800 RCX: 0000000000000000
> [ 399.060617] RDX: ffffffff81e04c60 RSI: ffff88007cdd39a0 RDI: ffff880079e5a000
> [ 399.060617] RBP: ffff88007c3b7d38 R08: ffffffff832df6d0 R09: 000001c90f100000
> [ 399.060617] R10: 0000000000000000 R11: 00000000000656f0 R12: ffff880079e5a000
> [ 399.060617] R13: ffff88007fd18b00 R14: ffff88007cdd39c0 R15: 0000000000000000
> [ 399.060617] FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000
> [ 399.060617] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [ 399.060617] CR2: 00007f5ac2f56800 CR3: 000000007a95b000 CR4: 00000000000006e0
> [ 399.060617] Stack:
> [ 399.060617] 000000007fd13240 ffff88007a8f7800 ffff88007fd13240 ffff88007fd18b00
> [ 399.060617] ffff88007c3b7d58 ffffffff813621ae ffff88007cdd39c0 ffff88007a4d0c40
> [ 399.060617] ffff88007c3b7dd8 ffffffff810cc877 ffffffff810cc80d ffff88007fd13258
> [ 399.060617] Call Trace:
> [ 399.060617] [<ffffffff813621ae>] free_lock_state_work+0x2e/0x40
> [ 399.060617] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
> [ 399.060617] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
> [ 399.060617] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
> [ 399.060617] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
> [ 399.060617] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
> [ 399.060617] [<ffffffff810d3c6f>] kthread+0xdf/0x100
> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
> [ 399.060617] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
> [ 399.060617] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
>
> (gdb) l *(nfs41_free_lock_state+0x2b)
> 0xffffffff8134e5bb is in nfs41_free_lock_state (fs/nfs/nfs4proc.c:8313).
> 8308 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
> 8309 {
> 8310 struct rpc_task *task;
> 8311 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
> 8312
> 8313 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
> 8314 nfs4_free_lock_state(server, lsp);
> 8315 if (IS_ERR(task))
> 8316 return;
> 8317 rpc_put_task(task);
>
Oof -- right. Same problem, just in a different spot. So there, we need
the open owner. We don't have a pointer directly to that, so we're
probably best off just holding a reference to the open stateid, and
pinning the superblock too.
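To spell out the chain that blows up (the 0x6b poison in RAX above means
the memory behind lsp->ls_state had already been recycled by the time
the work item ran):

	/* nfs41_free_lock_state(), running from the nfsiod workqueue: */
	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
	/* ls_state may already have been freed by the final
	 * nfs4_put_open_state(), so ->owner points into poisoned
	 * slab memory */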
Maybe something like this instead? I'm also running xfstests on a VM
now to see if I can reproduce this and verify the fix:
---------------------[snip]-----------------------
[PATCH] nfs: pin superblock and open state when running free_lock_state operations
Christoph reported seeing this oops when running xfstests on a v4.1 client:
generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
[ 2187.042899] Modules linked in:
[ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
[ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 2187.044287] Workqueue: nfsiod free_lock_state_work
[ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
[ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
[ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
[ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
[ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
[ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
[ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
[ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
[ 2187.044287] Stack:
[ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
[ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
[ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
[ 2187.044287] Call Trace:
[ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
[ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP <ffff88007a4efd58>
[ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
It appears that the lock state outlived the open state with which it
was associated.
Fix this by pinning down the open stateid and the superblock prior to
queueing the workqueue job, and then putting those references once the
RPC task has been queued.
Reported-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4state.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a770c8e469a7..fb29c7cbe8e3 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -804,11 +804,14 @@ free_lock_state_work(struct work_struct *work)
{
struct nfs4_lock_state *lsp = container_of(work,
struct nfs4_lock_state, ls_release);
- struct nfs4_state *state = lsp->ls_state;
- struct nfs_server *server = state->owner->so_server;
+ struct nfs4_state *osp = lsp->ls_state;
+ struct super_block *sb = osp->inode->i_sb;
+ struct nfs_server *server = NFS_SB(sb);
struct nfs_client *clp = server->nfs_client;
clp->cl_mvops->free_lock_state(server, lsp);
+ nfs4_put_open_state(osp);
+ nfs_sb_deactive(sb);
}
/*
@@ -896,9 +899,11 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ atomic_inc(&lsp->ls_state->count);
+ nfs_sb_active(lsp->ls_state->inode->i_sb);
queue_work(nfsiod_workqueue, &lsp->ls_release);
- else {
+ } else {
server = state->owner->so_server;
nfs4_free_lock_state(server, lsp);
}
--
1.9.3
This one is still reliably reproducible in Linus' tree. While Jeff's
patch to allow sleeping in the lock methods has gone in, the workqueue
call in nfs hasn't been reverted yet, so this is expected.
This reverts commit 49a4bda22e186c4d0eb07f4a36b5b1a378f9398d.
Christoph reported an oops due to the above commit:
generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP
[ 2187.042899] Modules linked in:
[ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
[ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 2187.044287] Workqueue: nfsiod free_lock_state_work
[ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
[ 2187.044287] RIP: 0010:[<ffffffff81361ca6>] [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296
[ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
[ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
[ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
[ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
[ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
[ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
[ 2187.044287] Stack:
[ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
[ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
[ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
[ 2187.044287] Call Trace:
[ 2187.044287] [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 2187.044287] [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 2187.044287] [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 2187.044287] [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 2187.044287] [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 2187.044287] [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 2187.044287] [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
[ 2187.044287] RIP [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP <ffff88007a4efd58>
[ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---
The original reason for this patch was that the fl_release_private
operation couldn't sleep. With commit ed9814d85810 (locks: defer freeing
locks in locks_delete_lock until after i_lock has been dropped), this is
no longer a problem, so we can revert this patch.
Reported-by: Christoph Hellwig <[email protected]>
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/nfs4_fs.h | 13 ++++++-------
fs/nfs/nfs4state.c | 24 ++++++------------------
2 files changed, 12 insertions(+), 25 deletions(-)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..a8b855ab4e22 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,16 +130,15 @@ enum {
*/
struct nfs4_lock_state {
- struct list_head ls_locks; /* Other lock stateids */
- struct nfs4_state * ls_state; /* Pointer to open state */
+ struct list_head ls_locks; /* Other lock stateids */
+ struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
- unsigned long ls_flags;
+ unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
- nfs4_stateid ls_stateid;
- atomic_t ls_count;
- fl_owner_t ls_owner;
- struct work_struct ls_release;
+ nfs4_stateid ls_stateid;
+ atomic_t ls_count;
+ fl_owner_t ls_owner;
};
/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a043f618cd5a..22fe35104c0c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
return NULL;
}
-static void
-free_lock_state_work(struct work_struct *work)
-{
- struct nfs4_lock_state *lsp = container_of(work,
- struct nfs4_lock_state, ls_release);
- struct nfs4_state *state = lsp->ls_state;
- struct nfs_server *server = state->owner->so_server;
- struct nfs_client *clp = server->nfs_client;
-
- clp->cl_mvops->free_lock_state(server, lsp);
-}
-
/*
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
@@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
if (lsp->ls_seqid.owner_id < 0)
goto out_free;
INIT_LIST_HEAD(&lsp->ls_locks);
- INIT_WORK(&lsp->ls_release, free_lock_state_work);
return lsp;
out_free:
kfree(lsp);
@@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
if (list_empty(&state->lock_states))
clear_bit(LK_STATE_IN_USE, &state->flags);
spin_unlock(&state->state_lock);
- if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
- queue_work(nfsiod_workqueue, &lsp->ls_release);
- else {
- server = state->owner->so_server;
+ server = state->owner->so_server;
+ if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+ struct nfs_client *clp = server->nfs_client;
+
+ clp->cl_mvops->free_lock_state(server, lsp);
+ } else
nfs4_free_lock_state(server, lsp);
- }
}
static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
--
1.9.3
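For context on why this revert is safe now: ed9814d85810 makes
locks_delete_lock() unlink the lock while holding the spinlock but defer
the actual free -- and with it the ->fl_release_private call -- until
the lock has been dropped. A rough sketch of that pattern (not the exact
upstream code):

	/* called with inode->i_lock held: unlink now, free later */
	static void locks_delete_lock(struct file_lock **thisfl_p,
				      struct list_head *dispose)
	{
		struct file_lock *fl = *thisfl_p;

		locks_unlink_lock(thisfl_p);
		if (dispose)
			list_add(&fl->fl_block, dispose);
		else
			locks_free_lock(fl);
	}

	/* called after inode->i_lock has been dropped: freeing may
	 * now sleep, so an fl_release_private op that sleeps is fine */
	static void locks_dispose_list(struct list_head *dispose)
	{
		struct file_lock *fl;

		while (!list_empty(dispose)) {
			fl = list_first_entry(dispose,
					struct file_lock, fl_block);
			list_del_init(&fl->fl_block);
			locks_free_lock(fl);
		}
	}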
On Mon, Sep 08, 2014 at 08:26:01AM -0400, Jeff Layton wrote:
> The original reason for this patch was that the fl_release_private
> operation couldn't sleep. With commit ed9814d85810 (locks: defer freeing
> locks in locks_delete_lock until after i_lock has been dropped), this is
> no longer a problem, so we can revert this patch.
>
> Reported-by: Christoph Hellwig <[email protected]>
> Signed-off-by: Jeff Layton <[email protected]>
Looks good. I tested an equivalent revert for about half a dozen runs
of xfstests yesterday and couldn't reproduce this bug with it.
Reviewed-by: Christoph Hellwig <[email protected]>
Tested-by: Christoph Hellwig <[email protected]>