Return-Path: Received: from daytona.panasas.com ([67.152.220.89]:34849 "EHLO daytona.panasas.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752167Ab1BAQDX (ORCPT ); Tue, 1 Feb 2011 11:03:23 -0500 Message-ID: <4D482EC8.2030105@panasas.com> Date: Tue, 01 Feb 2011 18:03:20 +0200 From: Benny Halevy To: Fred Isaman CC: linux-nfs@vger.kernel.org, Trond Myklebust Subject: Re: [PATCH 3/3] pnfs: fix pnfs lock inversion of i_lock and cl_lock References: <1296487633-21057-1-git-send-email-iisaman@netapp.com> <1296487633-21057-4-git-send-email-iisaman@netapp.com> <4D48299F.4070800@panasas.com> <80A4262E-EA90-44A4-96A6-23464177BAA6@netapp.com> In-Reply-To: <80A4262E-EA90-44A4-96A6-23464177BAA6@netapp.com> Content-Type: text/plain; charset=ISO-8859-1 Sender: linux-nfs-owner@vger.kernel.org List-ID: MIME-Version: 1.0 On 2011-02-01 17:54, Fred Isaman wrote: > > On Feb 1, 2011, at 10:41 AM, Benny Halevy wrote: > >> I'm actually seeing that with the pnfs tree over 2.6.38-rc3 >> See signature below. >> Will re-test with this patch. >> > > In your retest, note that this patch depends on patch 1 of the series to work correctly. Yeah, I noticed :-/ Benny > > Fred > >> Benny >> >> [ INFO: possible circular locking dependency detected ] >> 2.6.38-rc3-pnfs-00391-g13858b7 #5 >> ------------------------------------------------------- >> test5/2174 is trying to acquire lock: >> (&sb->s_type->i_lock_key#24){+.+...}, at: [] nfs_inode_set_] >> >> but task is already holding lock: >> (&(&clp->cl_lock)->rlock){+.+...}, at: [] nfs_inode_set_del] >> >> which lock already depends on the new lock. 
>> >> >> the existing dependency chain (in reverse order) is: >> >> -> #1 (&(&clp->cl_lock)->rlock){+.+...}: >> [] lock_acquire+0xd3/0x100 >> [] _raw_spin_lock+0x36/0x69 >> [] pnfs_update_layout+0x2f5/0x4d9 [nfs] >> [] nfs_write_begin+0x90/0x257 [nfs] >> [] generic_file_buffered_write+0x106/0x267 >> [] __generic_file_aio_write+0x245/0x27a >> [] generic_file_aio_write+0x5c/0xaa >> [] nfs_file_write+0xdf/0x177 [nfs] >> [] do_sync_write+0xcb/0x108 >> [] vfs_write+0xb1/0x10d >> [] sys_write+0x4d/0x77 >> [] system_call_fastpath+0x16/0x1b >> >> -> #0 (&sb->s_type->i_lock_key#24){+.+...}: >> [] __lock_acquire+0xa45/0xd3a >> [] lock_acquire+0xd3/0x100 >> [] _raw_spin_lock+0x36/0x69 >> [] nfs_inode_set_delegation+0x1f4/0x250 [nfs] >> [] nfs4_opendata_to_nfs4_state+0x26c/0x2c9 [nfs] >> [] nfs4_do_open+0x364/0x37c [nfs] >> [] nfs4_atomic_open+0x28/0x45 [nfs] >> [] nfs_open_revalidate+0xf8/0x1a1 [nfs] >> [] d_revalidate+0x21/0x56 >> [] do_revalidate+0x1d/0x7a >> [] do_lookup+0x1c4/0x1f8 >> [] link_path_walk+0x260/0x485 >> [] do_path_lookup+0x50/0x10d >> [] do_filp_open+0x137/0x65e >> [] do_sys_open+0x60/0xf2 >> [] sys_open+0x20/0x22 >> [] system_call_fastpath+0x16/0x1b >> >> >> On 2011-01-31 17:27, Fred Isaman wrote: >>> The pnfs code was using throughout the lock order i_lock, cl_lock. >>> This conflicts with the nfs delegation code. Rework the pnfs code >>> to avoid taking both locks simultaneously. >>> >>> Currently the code takes the double lock to add/remove the layout to a >>> nfs_client list, while atomically checking that the list of lsegs is >>> empty. To avoid this, we rely on existing serializations. When a >>> layout is initialized with lseg count equal zero, LAYOUTGET's >>> openstateid serialization is in effect, making it safe to assume it >>> stays zero unless we change it. And once a layout's lseg count drops >>> to zero, it is set as DESTROYED and so will stay at zero. 
>>> >>> Signed-off-by: Fred Isaman >>> --- >>> fs/nfs/callback_proc.c | 2 +- >>> fs/nfs/pnfs.c | 50 +++++++++++++++++++++++++++-------------------- >>> 2 files changed, 30 insertions(+), 22 deletions(-) >>> >>> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c >>> index 8958757..2f41dcce 100644 >>> --- a/fs/nfs/callback_proc.c >>> +++ b/fs/nfs/callback_proc.c >>> @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, >>> rv = NFS4ERR_DELAY; >>> list_del_init(&lo->plh_bulk_recall); >>> spin_unlock(&ino->i_lock); >>> + pnfs_free_lseg_list(&free_me_list); >>> put_layout_hdr(lo); >>> iput(ino); >>> } >>> - pnfs_free_lseg_list(&free_me_list); >>> return rv; >>> } >>> >>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c >>> index f89c3bb..ee6c69a 100644 >>> --- a/fs/nfs/pnfs.c >>> +++ b/fs/nfs/pnfs.c >>> @@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg, >>> BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); >>> list_del(&lseg->pls_list); >>> if (list_empty(&lseg->pls_layout->plh_segs)) { >>> - struct nfs_client *clp; >>> - >>> - clp = NFS_SERVER(ino)->nfs_client; >>> - spin_lock(&clp->cl_lock); >>> - /* List does not take a reference, so no need for put here */ >>> - list_del_init(&lseg->pls_layout->plh_layouts); >>> - spin_unlock(&clp->cl_lock); >>> set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); >>> /* Matched by initial refcount set in alloc_init_layout_hdr */ >>> put_layout_hdr_locked(lseg->pls_layout); >>> @@ -319,14 +312,30 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, >>> return invalid - removed; >>> } >>> >>> +/* note free_me must contain lsegs from a single layout_hdr */ >>> void >>> pnfs_free_lseg_list(struct list_head *free_me) >>> { >>> - struct pnfs_layout_segment *lseg, *tmp; >>> + if (!list_empty(free_me)) { >>> + struct pnfs_layout_segment *lseg, *tmp; >>> + struct pnfs_layout_hdr *lo; >>> >>> - list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { >>> - 
list_del(&lseg->pls_list); >>> - free_lseg(lseg); >>> + lo = list_first_entry(free_me, >>> + struct pnfs_layout_segment, >>> + pls_list)->pls_layout; >>> + >>> + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { >>> + struct nfs_client *clp; >>> + >>> + clp = NFS_SERVER(lo->plh_inode)->nfs_client; >>> + spin_lock(&clp->cl_lock); >>> + list_del_init(&lo->plh_layouts); >>> + spin_unlock(&clp->cl_lock); >>> + } >>> + list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { >>> + list_del(&lseg->pls_list); >>> + free_lseg(lseg); >>> + } >>> } >>> } >>> >>> @@ -704,6 +713,7 @@ pnfs_update_layout(struct inode *ino, >>> struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; >>> struct pnfs_layout_hdr *lo; >>> struct pnfs_layout_segment *lseg = NULL; >>> + bool first = false; >>> >>> if (!pnfs_enabled_sb(NFS_SERVER(ino))) >>> return NULL; >>> @@ -734,7 +744,10 @@ pnfs_update_layout(struct inode *ino, >>> atomic_inc(&lo->plh_outstanding); >>> >>> get_layout_hdr(lo); >>> - if (list_empty(&lo->plh_segs)) { >>> + if (list_empty(&lo->plh_segs)) >>> + first = true; >>> + spin_unlock(&ino->i_lock); >>> + if (first) { >>> /* The lo must be on the clp list if there is any >>> * chance of a CB_LAYOUTRECALL(FILE) coming in. >>> */ >>> @@ -743,17 +756,12 @@ pnfs_update_layout(struct inode *ino, >>> list_add_tail(&lo->plh_layouts, &clp->cl_layouts); >>> spin_unlock(&clp->cl_lock); >>> } >>> - spin_unlock(&ino->i_lock); >>> >>> lseg = send_layoutget(lo, ctx, iomode); >>> - if (!lseg) { >>> - spin_lock(&ino->i_lock); >>> - if (list_empty(&lo->plh_segs)) { >>> - spin_lock(&clp->cl_lock); >>> - list_del_init(&lo->plh_layouts); >>> - spin_unlock(&clp->cl_lock); >>> - } >>> - spin_unlock(&ino->i_lock); >>> + if (!lseg && first) { >>> + spin_lock(&clp->cl_lock); >>> + list_del_init(&lo->plh_layouts); >>> + spin_unlock(&clp->cl_lock); >>> } >>> atomic_dec(&lo->plh_outstanding); >>> put_layout_hdr(lo); >