Return-Path: Received: from mail-pa0-f45.google.com ([209.85.220.45]:33530 "EHLO mail-pa0-f45.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752952AbbK2EIA (ORCPT ); Sat, 28 Nov 2015 23:08:00 -0500 Received: by pabfh17 with SMTP id fh17so153902297pab.0 for ; Sat, 28 Nov 2015 20:08:00 -0800 (PST) Subject: Re: [PATCH RFC] nfsd: serialize layout stateid morphing operations To: Jeff Layton , bfields@fieldses.org References: <1442491104-30080-1-git-send-email-jeff.layton@primarydata.com> Cc: Christoph Hellwig , linux-nfs@vger.kernel.org From: Kinglong Mee Message-ID: <565A7A14.4010902@gmail.com> Date: Sun, 29 Nov 2015 12:07:48 +0800 MIME-Version: 1.0 In-Reply-To: <1442491104-30080-1-git-send-email-jeff.layton@primarydata.com> Content-Type: text/plain; charset=windows-1252 Sender: linux-nfs-owner@vger.kernel.org List-ID: I meet two problems with this patch, 1. a dump_stack messages printed in process_one_work() at kernel/workqueue.c. BUG: workqueue leaked lock or atomic: kworker/u2:4/0x00000000/98#012 last function: nfsd4_run_cb_work [nfsd] 1 lock held by kworker/u2:4/98: #0: (&ls->ls_mutex){......}, at: [] nfsd4_cb_layout_prepare+0x24/0x40 [nfsd] CPU: 0 PID: 98 Comm: kworker/u2:4 Not tainted 4.4.0-rc2+ #333 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 Workqueue: nfsd4_callbacks nfsd4_run_cb_work [nfsd] ffff8800362b9e40 000000007fe9394f ffff880036353d58 ffffffff8136dc64 ffff880036353dd8 ffffffff810a3f12 ffffffff810a3cbd 000000000000000a ffffffffa0261d78 ffffffff82902e20 0000000000000000 ffffffffa0259241 Call Trace: [] dump_stack+0x19/0x25 [] process_one_work+0x3c2/0x4c0 [] ? process_one_work+0x16d/0x4c0 [] worker_thread+0x4a/0x440 [] ? process_one_work+0x4c0/0x4c0 [] ? process_one_work+0x4c0/0x4c0 [] kthread+0xf5/0x110 [] ? kthread_create_on_node+0x240/0x240 [] ret_from_fork+0x3f/0x70 [] ? kthread_create_on_node+0x240/0x240 2. a mutex race between layoutrecall and layoutcommit, callback thread, nfsd4_cb_layout_prepare --->mutex_lock(&ls->ls_mutex); layoutcommit thread, nfsd4_layoutcommit ---> nfsd4_preprocess_layout_stateid --> mutex_lock(&ls->ls_mutex); <---------------- hang [ 600.645142] INFO: task nfsd:11623 blocked for more than 120 seconds. [ 600.646337] Not tainted 4.4.0-rc2+ #332 [ 600.647404] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 600.648546] nfsd D ffff880064277b80 0 11623 2 0x00000000 [ 600.649803] ffff880064277b80 ffff880064278000 00000000ffffffff ffff88005dd241a8 [ 600.651021] ffffffffa025e77c 0000000000000246 ffff880064277b98 ffffffff81733103 [ 600.652255] ffff880063d7e100 ffff880064277ba8 ffffffff8173330e ffff880064277c28 [ 600.653512] Call Trace: [ 600.654765] [] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd] [ 600.656084] [] schedule+0x33/0x80 [ 600.657405] [] schedule_preempt_disabled+0xe/0x10 [ 600.658741] [] mutex_lock_nested+0x145/0x330 [ 600.660094] [] ? nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd] [ 600.661696] [] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd] [ 600.663129] [] ? nfsd4_preprocess_layout_stateid+0x5/0x400 [nfsd] [ 600.664558] [] ? printk+0x56/0x72 [ 600.665990] [] nfsd4_layoutcommit+0x13c/0x200 [nfsd] [ 600.667365] [] nfsd4_proc_compound+0x388/0x660 [nfsd] [ 600.668835] [] nfsd_dispatch+0xb8/0x200 [nfsd] [ 600.670323] [] svc_process_common+0x409/0x650 [sunrpc] [ 600.671836] [] svc_process+0xf4/0x190 [sunrpc] [ 600.673328] [] nfsd+0x135/0x1a0 [nfsd] [ 600.674825] [] ? nfsd+0x5/0x1a0 [nfsd] [ 600.676388] [] ? nfsd_destroy+0xb0/0xb0 [nfsd] [ 600.677884] [] kthread+0xf5/0x110 [ 600.679373] [] ? kthread_create_on_node+0x240/0x240 [ 600.680874] [] ret_from_fork+0x3f/0x70 [ 600.682398] [] ? kthread_create_on_node+0x240/0x240 [ 600.683893] 1 lock held by nfsd/11623: [ 600.685449] #0: (&ls->ls_mutex){......}, at: [] nfsd4_preprocess_layout_stateid+0x37c/0x400 [nfsd] [ 600.688778] Sending NMI to all CPUs: [ 600.690854] NMI backtrace for cpu 0 [ 600.691909] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332 [ 600.692523] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 600.693821] task: ffff88007b900000 ti: ffff88007b8fc000 task.ti: ffff88007b8fc000 [ 600.694496] RIP: 0010:[] [] native_write_msr_safe+0xa/0x10 [ 600.695185] RSP: 0018:ffff88007b8ffd70 EFLAGS: 00000046 [ 600.695861] RAX: 0000000000000400 RBX: 0000000000000286 RCX: 0000000000000830 [ 600.696539] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000830 [ 600.697204] RBP: ffff88007b8ffd70 R08: 0000000000000400 R09: 0000000000000000 [ 600.697862] R10: 00000000000000e4 R11: 0000000000000001 R12: ffff880063d7e100 [ 600.698513] R13: 00000000003fff3c R14: ffff880063d7e308 R15: 0000000000000004 [ 600.699156] FS: 0000000000000000(0000) GS:ffffffff81c27000(0000) knlGS:0000000000000000 [ 600.699823] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 600.700459] CR2: 00007f366afd4000 CR3: 000000005dd56000 CR4: 00000000001406f0 [ 600.701106] Stack: [ 600.701745] ffff88007b8ffd88 ffffffff8104a860 ffffffff81047340 ffff88007b8ffd98 [ 600.702404] ffffffff8104a885 ffff88007b8ffda8 ffffffff8104735b ffff88007b8ffdd8 [ 600.703058] ffffffff813723ad 0000000000000078 ffff880063d7e100 00000000003fff3c [ 600.703712] Call Trace: [ 600.704355] [] __x2apic_send_IPI_mask.isra.2+0x60/0x70 [ 600.705017] [] ? setup_vector_irq+0x130/0x130 [ 600.705676] [] x2apic_send_IPI_mask+0x15/0x20 [ 600.706335] [] nmi_raise_cpu_backtrace+0x1b/0x20 [ 600.706989] [] nmi_trigger_all_cpu_backtrace+0x14d/0x1c0 [ 600.707693] [] arch_trigger_all_cpu_backtrace+0x19/0x20 [ 600.708362] [] watchdog+0x32f/0x370 [ 600.709031] [] ? watchdog+0x81/0x370 [ 600.709725] [] ? reset_hung_task_detector+0x20/0x20 [ 600.710398] [] kthread+0xf5/0x110 [ 600.711067] [] ? kthread_create_on_node+0x240/0x240 [ 600.711739] [] ret_from_fork+0x3f/0x70 [ 600.712405] [] ? kthread_create_on_node+0x240/0x240 [ 600.713073] Code: 00 55 89 f9 48 89 e5 0f 32 45 31 c0 48 c1 e2 20 44 89 06 48 09 d0 5d c3 66 0f 1f 84 00 00 00 00 00 55 89 f0 89 f9 48 89 e5 0f 30 <31> c0 5d c3 66 90 55 89 f9 48 89 e5 0f 33 48 c1 e2 20 48 09 d0 [ 600.715196] Kernel panic - not syncing: hung_task: blocked tasks [ 600.715889] CPU: 0 PID: 11 Comm: khungtaskd Not tainted 4.4.0-rc2+ #332 [ 600.716540] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 600.717910] ffffffff81a4d19c 00000000480750be ffff88007b8ffd60 ffffffff8136dbf4 [ 600.718610] ffff88007b8ffde8 ffffffff81173559 ffff880000000008 ffff88007b8ffdf8 [ 600.719302] ffff88007b8ffd90 00000000480750be 0000000000000001 0000000000000001 [ 600.719984] Call Trace: [ 600.720646] [] dump_stack+0x19/0x25 [ 600.721330] [] panic+0xd3/0x212 [ 600.722009] [] watchdog+0x33b/0x370 [ 600.722686] [] ? watchdog+0x81/0x370 [ 600.723213] [] ? reset_hung_task_detector+0x20/0x20 [ 600.723674] [] kthread+0xf5/0x110 [ 600.724107] [] ? kthread_create_on_node+0x240/0x240 [ 600.724509] [] ret_from_fork+0x3f/0x70 [ 600.724903] [] ? kthread_create_on_node+0x240/0x240 thanks, Kinglong Mee On 9/17/2015 19:58, Jeff Layton wrote: > In order to allow the client to make a sane determination of what > happened with racing LAYOUTGET/LAYOUTRETURN/CB_LAYOUTRECALL calls, we > must ensure that the seqids return accurately represent the order of > operations. The simplest way to do that is to ensure that operations on > a single stateid are serialized. > > This patch adds a mutex to the layout stateid, and locks it when > checking the layout stateid's seqid. The mutex is held over the entire > operation and released after the seqid is bumped. > > Note that in the case of CB_LAYOUTRECALL we must move the increment of > the seqid and setting into a new cb "prepare" operation. The lease > infrastructure will call the lm_break callback with a spinlock held, so > and we can't take the mutex in that codepath. > > Cc: Christoph Hellwig > Signed-off-by: Jeff Layton > --- > fs/nfsd/nfs4layouts.c | 25 +++++++++++++++++++++---- > fs/nfsd/nfs4proc.c | 4 ++++ > fs/nfsd/state.h | 1 + > 3 files changed, 26 insertions(+), 4 deletions(-) > > diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c > index ebf90e487c75..4a68ab901b4b 100644 > --- a/fs/nfsd/nfs4layouts.c > +++ b/fs/nfsd/nfs4layouts.c > @@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, > INIT_LIST_HEAD(&ls->ls_perfile); > spin_lock_init(&ls->ls_lock); > INIT_LIST_HEAD(&ls->ls_layouts); > + mutex_init(&ls->ls_mutex); > ls->ls_layout_type = layout_type; > nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, > NFSPROC4_CLNT_CB_LAYOUT); > @@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, > status = nfserr_jukebox; > if (!ls) > goto out; > + mutex_lock(&ls->ls_mutex); > } else { > ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); > > status = nfserr_bad_stateid; > + mutex_lock(&ls->ls_mutex); > if (stateid->si_generation > stid->sc_stateid.si_generation) > - goto out_put_stid; > + goto out_unlock_stid; > if (layout_type != ls->ls_layout_type) > - goto out_put_stid; > + goto out_unlock_stid; > } > > *lsp = ls; > return 0; > > +out_unlock_stid: > + mutex_unlock(&ls->ls_mutex); > out_put_stid: > nfs4_put_stid(stid); > out: > @@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) > trace_layout_recall(&ls->ls_stid.sc_stateid); > > atomic_inc(&ls->ls_stid.sc_count); > - update_stateid(&ls->ls_stid.sc_stateid); > - memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); > nfsd4_run_cb(&ls->ls_recall); > > out_unlock: > @@ -494,6 +497,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, > } > spin_unlock(&ls->ls_lock); > > + mutex_unlock(&ls->ls_mutex); > nfs4_put_stid(&ls->ls_stid); > nfsd4_free_layouts(&reaplist); > return nfs_ok; > @@ -608,6 +612,17 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) > } > } > > +static void > +nfsd4_cb_layout_prepare(struct nfsd4_callback *cb) > +{ > + struct nfs4_layout_stateid *ls = > + container_of(cb, struct nfs4_layout_stateid, ls_recall); > + > + mutex_lock(&ls->ls_mutex); > + update_stateid(&ls->ls_stid.sc_stateid); > + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); > +} > + > static int > nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) > { > @@ -649,12 +664,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) > > trace_layout_recall_release(&ls->ls_stid.sc_stateid); > > + mutex_unlock(&ls->ls_mutex); > nfsd4_return_all_layouts(ls, &reaplist); > nfsd4_free_layouts(&reaplist); > nfs4_put_stid(&ls->ls_stid); > } > > static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { > + .prepare = nfsd4_cb_layout_prepare, > .done = nfsd4_cb_layout_done, > .release = nfsd4_cb_layout_release, > }; > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index 4ce6b97b31ad..a9f096c7e99f 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, > nfserr = nfsd4_insert_layout(lgp, ls); > > out_put_stid: > + mutex_unlock(&ls->ls_mutex); > nfs4_put_stid(&ls->ls_stid); > out: > return nfserr; > @@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, > goto out; > } > > + /* LAYOUTCOMMIT does not require any serialization */ > + mutex_unlock(&ls->ls_mutex); > + > if (new_size > i_size_read(inode)) { > lcp->lc_size_chg = 1; > lcp->lc_newsize = new_size; > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index 31bde12feefe..1fa0f3848d4e 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -562,6 +562,7 @@ struct nfs4_layout_stateid { > struct nfsd4_callback ls_recall; > stateid_t ls_recall_sid; > bool ls_recalled; > + struct mutex ls_mutex; > }; > > static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) >