Figuring out the root cause of the REMOVE/CLOSE race and
suggesting the solution was done by Neil Brown.
Currently what happens is that direct IO calls hold a reference
on the open context which is decremented as an asynchronous task
in nfs_direct_complete(). Before the reference is decremented,
control is returned to the application which is free to close the
file. When close is being processed, it decrements its reference
on the open_context but since directIO still holds one, it doesn't
send a close on the wire. It returns control to the application
which is free to do other operations. For instance, it can delete a
file. Direct IO then finally releases its reference, triggering
an asynchronous close, which races with the REMOVE. On the server,
REMOVE can be processed before the CLOSE, failing the REMOVE with
EACCES as the file is still open.
Signed-off-by: Olga Kornievskaia <[email protected]>
Suggested-by: Neil Brown <[email protected]>
CC: [email protected]
---
fs/nfs/direct.c | 13 +++++++++----
fs/nfs/file.c | 1 +
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 1b79dd5..3d113cf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -267,8 +267,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
- inode_dio_end(inode);
-
if (dreq->iocb) {
long res = (long) dreq->error;
if (dreq->count != 0) {
@@ -280,7 +278,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
complete(&dreq->completion);
+ igrab(inode);
nfs_direct_req_release(dreq);
+ inode_dio_end(inode);
+ iput(inode);
}
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
@@ -410,8 +411,10 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- inode_dio_end(inode);
+ igrab(inode);
nfs_direct_req_release(dreq);
+ inode_dio_end(inode);
+ iput(inode);
return result < 0 ? result : -EIO;
}
@@ -864,8 +867,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- inode_dio_end(inode);
+ igrab(inode);
nfs_direct_req_release(dreq);
+ inode_dio_end(inode);
+ iput(inode);
return result < 0 ? result : -EIO;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f96367a..ccd6c16 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -83,6 +83,7 @@ int nfs_check_flags(int flags)
dprintk("NFS: release(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
+ inode_dio_wait(inode);
nfs_file_clear_open_context(filp);
return 0;
}
--
1.8.3.1
As a recent patch highlighted, inode_dio_end() must be called after
nfs_direct_req_release() is called.
It would make the code more robust if nfs_direct_req_release() did that
call itself, placing it after put_nfs_open_context().
To achieve this:
- move the inode_dio_begin() calls to the moment when a
'struct nfs_direct_req' is allocated,
- move the inode_dio_end() calls to just before the
'struct nfs_direct_req' is freed,
- use igrab to make dreq->inode a counted reference so that
it can be used after put_nfs_open_context() (which calls
dput(), thereby releasing the only reference we previously held
on the inode).
This patch doesn't change behaviour at all, it just simplifies the code
a little.
Signed-off-by: NeilBrown <[email protected]>
---
fs/nfs/direct.c | 19 ++++++-------------
1 file changed, 6 insertions(+), 13 deletions(-)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3d113cf8908a..ab32b23639d3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -221,6 +221,8 @@ static void nfs_direct_req_free(struct kref *kref)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
put_nfs_open_context(dreq->ctx);
+ inode_dio_end(dreq->inode);
+ iput(dreq->inode);
kmem_cache_free(nfs_direct_cachep, dreq);
}
@@ -278,10 +280,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
complete(&dreq->completion);
- igrab(inode);
nfs_direct_req_release(dreq);
- inode_dio_end(inode);
- iput(inode);
}
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
@@ -359,7 +358,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
&nfs_direct_read_completion_ops);
get_dreq(dreq);
desc.pg_dreq = dreq;
- inode_dio_begin(inode);
while (iov_iter_count(iter)) {
struct page **pagevec;
@@ -411,10 +409,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- igrab(inode);
nfs_direct_req_release(dreq);
- inode_dio_end(inode);
- iput(inode);
return result < 0 ? result : -EIO;
}
@@ -467,7 +462,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (dreq == NULL)
goto out;
- dreq->inode = inode;
+ dreq->inode = igrab(inode);
+ inode_dio_begin(inode);
dreq->bytes_left = dreq->max_count = count;
dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
@@ -807,7 +803,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
- inode_dio_begin(inode);
NFS_I(inode)->write_io += iov_iter_count(iter);
while (iov_iter_count(iter)) {
@@ -867,10 +862,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- igrab(inode);
nfs_direct_req_release(dreq);
- inode_dio_end(inode);
- iput(inode);
return result < 0 ? result : -EIO;
}
@@ -929,7 +921,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!dreq)
goto out;
- dreq->inode = inode;
+ dreq->inode = igrab(inode);
+ inode_dio_begin(inode);
dreq->bytes_left = dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
--
2.26.2