Return-Path: Received: from mail-ua1-f67.google.com ([209.85.222.67]:38007 "EHLO mail-ua1-f67.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726738AbeI1W7r (ORCPT ); Fri, 28 Sep 2018 18:59:47 -0400 Received: by mail-ua1-f67.google.com with SMTP id j17-v6so2528113uan.5 for ; Fri, 28 Sep 2018 09:35:12 -0700 (PDT) MIME-Version: 1.0 References: <20180905192400.107485-1-trond.myklebust@hammerspace.com> <20180905192400.107485-2-trond.myklebust@hammerspace.com> <20180905192400.107485-3-trond.myklebust@hammerspace.com> <20180905192400.107485-4-trond.myklebust@hammerspace.com> <20180905192400.107485-5-trond.myklebust@hammerspace.com> <20180905192400.107485-6-trond.myklebust@hammerspace.com> In-Reply-To: <20180905192400.107485-6-trond.myklebust@hammerspace.com> From: Olga Kornievskaia Date: Fri, 28 Sep 2018 12:34:59 -0400 Message-ID: Subject: Re: [PATCH 5/7] NFS: Convert lookups of the open context to RCU To: Trond Myklebust Cc: linux-nfs Content-Type: text/plain; charset="UTF-8" Sender: linux-nfs-owner@vger.kernel.org List-ID: Hi Trond, This patch ends up fixing a problem related to recall of delegations. This problem has existed in the kernel for quite awhile (early RHEL 3.10-based release show the problem, haven't tried 2.6-based). If you agree with what I present, then can the commit message for this patch be changed to reflect that it solves this problem? Problem can be seen by running "nfstest_delegation --runtests recall26". Jorge can probably describe the test better but I believe the main client machine opens a file for read and gets a read delegation to it and locks it, then from a different process in the same machine there is an appending open (open for write is sent and gets a write delegation). From a different machine another client opens the same file which triggers a CB_RECALL. The main client, un-patched, ends up simply returning the delegation and never recovers the lock. This is pretty bad as it's possible data corruption case. In the nfs_delegation_claim_locks() this condition -- if (nfs_file_open_context(fl->fl_file) != ctx) -- is true when it should have been false. There seems to be a disagreement between what's in nfs_file_open_context(fl->fl_file) and what get_nfs_open_context(inode's ctx) returns but I don't think how this patch fixes it. But it does. On Wed, Sep 5, 2018 at 3:27 PM Trond Myklebust wrote: > > Reduce contention on the inode->i_lock by ensuring that we use RCU > when looking up the NFS open context. > > Signed-off-by: Trond Myklebust > --- > fs/nfs/delegation.c | 11 ++++++----- > fs/nfs/inode.c | 35 +++++++++++++++-------------------- > fs/nfs/nfs4proc.c | 30 ++++++++++++++++++++++++------ > fs/nfs/nfs4state.c | 12 ++++++------ > fs/nfs/pnfs.c | 5 ++++- > include/linux/nfs_fs.h | 1 + > 6 files changed, 56 insertions(+), 38 deletions(-) > > diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c > index f033f3a69a3b..76d205d1c7bc 100644 > --- a/fs/nfs/delegation.c > +++ b/fs/nfs/delegation.c > @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode, > int err; > > again: > - spin_lock(&inode->i_lock); > - list_for_each_entry(ctx, &nfsi->open_files, list) { > + rcu_read_lock(); > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > state = ctx->state; > if (state == NULL) > continue; > @@ -147,8 +147,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, > continue; > if (!nfs4_stateid_match(&state->stateid, stateid)) > continue; > - get_nfs_open_context(ctx); > - spin_unlock(&inode->i_lock); > + if (!get_nfs_open_context(ctx)) > + continue; > + rcu_read_unlock(); > sp = state->owner; > /* Block nfs4_proc_unlck */ > mutex_lock(&sp->so_delegreturn_mutex); > @@ -164,7 +165,7 @@ static int nfs_delegation_claim_opens(struct inode *inode, > return err; > goto again; > } > - spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > return 0; > } > > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index 052db41a7f80..5b1eee4952b7 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -977,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); > > struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) > { > - if (ctx != NULL) > - refcount_inc(&ctx->lock_context.count); > - return ctx; > + if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) > + return ctx; > + return NULL; > } > EXPORT_SYMBOL_GPL(get_nfs_open_context); > > @@ -988,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) > struct inode *inode = d_inode(ctx->dentry); > struct super_block *sb = ctx->dentry->d_sb; > > + if (!refcount_dec_and_test(&ctx->lock_context.count)) > + return; > if (!list_empty(&ctx->list)) { > - if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) > - return; > - list_del(&ctx->list); > + spin_lock(&inode->i_lock); > + list_del_rcu(&ctx->list); > spin_unlock(&inode->i_lock); > - } else if (!refcount_dec_and_test(&ctx->lock_context.count)) > - return; > + } > if (inode != NULL) > NFS_PROTO(inode)->close_context(ctx, is_sync); > if (ctx->cred != NULL) > @@ -1002,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) > dput(ctx->dentry); > nfs_sb_deactive(sb); > kfree(ctx->mdsthreshold); > - kfree(ctx); > + kfree_rcu(ctx, rcu_head); > } > > void put_nfs_open_context(struct nfs_open_context *ctx) > @@ -1026,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) > struct nfs_inode *nfsi = NFS_I(inode); > > spin_lock(&inode->i_lock); > - if (ctx->mode & FMODE_WRITE) > - list_add(&ctx->list, &nfsi->open_files); > - else > - list_add_tail(&ctx->list, &nfsi->open_files); > + list_add_tail_rcu(&ctx->list, &nfsi->open_files); > spin_unlock(&inode->i_lock); > } > EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); > @@ -1050,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c > struct nfs_inode *nfsi = NFS_I(inode); > struct nfs_open_context *pos, *ctx = NULL; > > - spin_lock(&inode->i_lock); > - list_for_each_entry(pos, &nfsi->open_files, list) { > + rcu_read_lock(); > + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { > if (cred != NULL && pos->cred != cred) > continue; > if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) > continue; > ctx = get_nfs_open_context(pos); > - break; > + if (ctx) > + break; > } > - spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > return ctx; > } > > @@ -1077,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp) > if (ctx->error < 0) > invalidate_inode_pages2(inode->i_mapping); > filp->private_data = NULL; > - spin_lock(&inode->i_lock); > - list_move_tail(&ctx->list, &NFS_I(inode)->open_files); > - spin_unlock(&inode->i_lock); > put_nfs_open_context_sync(ctx); > } > } > diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c > index 8220a168282e..10c20a5b075d 100644 > --- a/fs/nfs/nfs4proc.c > +++ b/fs/nfs/nfs4proc.c > @@ -1933,23 +1933,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) > return ret; > } > > -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) > +static struct nfs_open_context * > +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode) > { > struct nfs_inode *nfsi = NFS_I(state->inode); > struct nfs_open_context *ctx; > > - spin_lock(&state->inode->i_lock); > - list_for_each_entry(ctx, &nfsi->open_files, list) { > + rcu_read_lock(); > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > if (ctx->state != state) > continue; > - get_nfs_open_context(ctx); > - spin_unlock(&state->inode->i_lock); > + if ((ctx->mode & mode) != mode) > + continue; > + if (!get_nfs_open_context(ctx)) > + continue; > + rcu_read_unlock(); > return ctx; > } > - spin_unlock(&state->inode->i_lock); > + rcu_read_unlock(); > return ERR_PTR(-ENOENT); > } > > +static struct nfs_open_context * > +nfs4_state_find_open_context(struct nfs4_state *state) > +{ > + struct nfs_open_context *ctx; > + > + ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE); > + if (!IS_ERR(ctx)) > + return ctx; > + ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE); > + if (!IS_ERR(ctx)) > + return ctx; > + return nfs4_state_find_open_context_mode(state, FMODE_READ); > +} > + > static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, > struct nfs4_state *state, enum open_claim_type4 claim) > { > diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c > index 40a08cd483f0..be92ce4259e9 100644 > --- a/fs/nfs/nfs4state.c > +++ b/fs/nfs/nfs4state.c > @@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode, > struct nfs4_state *state; > bool found = false; > > - spin_lock(&inode->i_lock); > - list_for_each_entry(ctx, &nfsi->open_files, list) { > + rcu_read_lock(); > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > state = ctx->state; > if (state == NULL) > continue; > @@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode, > nfs4_state_mark_reclaim_nograce(clp, state)) > found = true; > } > - spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > > nfs_inode_find_delegation_state_and_recover(inode, stateid); > if (found) > @@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) > struct nfs_inode *nfsi = NFS_I(inode); > struct nfs_open_context *ctx; > > - spin_lock(&inode->i_lock); > - list_for_each_entry(ctx, &nfsi->open_files, list) { > + rcu_read_lock(); > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > if (ctx->state != state) > continue; > set_bit(NFS_CONTEXT_BAD, &ctx->flags); > } > - spin_unlock(&inode->i_lock); > + rcu_read_unlock(); > } > > static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) > diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c > index c5672c02afd6..06cb90e9bc6e 100644 > --- a/fs/nfs/pnfs.c > +++ b/fs/nfs/pnfs.c > @@ -1339,6 +1339,7 @@ bool pnfs_roc(struct inode *ino, > if (!nfs_have_layout(ino)) > return false; > retry: > + rcu_read_lock(); > spin_lock(&ino->i_lock); > lo = nfsi->layout; > if (!lo || !pnfs_layout_is_valid(lo) || > @@ -1349,6 +1350,7 @@ bool pnfs_roc(struct inode *ino, > pnfs_get_layout_hdr(lo); > if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { > spin_unlock(&ino->i_lock); > + rcu_read_unlock(); > wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, > TASK_UNINTERRUPTIBLE); > pnfs_put_layout_hdr(lo); > @@ -1362,7 +1364,7 @@ bool pnfs_roc(struct inode *ino, > skip_read = true; > } > > - list_for_each_entry(ctx, &nfsi->open_files, list) { > + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { > state = ctx->state; > if (state == NULL) > continue; > @@ -1410,6 +1412,7 @@ bool pnfs_roc(struct inode *ino, > > out_noroc: > spin_unlock(&ino->i_lock); > + rcu_read_unlock(); > pnfs_layoutcommit_inode(ino, true); > if (roc) { > struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; > diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h > index d2f4f88a0e66..6e0417c02279 100644 > --- a/include/linux/nfs_fs.h > +++ b/include/linux/nfs_fs.h > @@ -83,6 +83,7 @@ struct nfs_open_context { > > struct list_head list; > struct nfs4_threshold *mdsthreshold; > + struct rcu_head rcu_head; > }; > > struct nfs_open_dir_context { > -- > 2.17.1 >