There is a possible race in how the nfs_invalidate_mapping function is
handled. Currently, we go and invalidate the pages in the file and then
clear NFS_INO_INVALID_DATA.
The problem is that it's possible for a stale page to creep into the
mapping after the page was invalidated (i.e., via readahead). If another
writer comes along and sets the flag after that happens but before
invalidate_inode_pages2 returns then we could clear the flag
without the cache having been properly invalidated.
So, we must clear the flag first and then invalidate the pages. Doing
this, however, opens another race:
It's possible to have two concurrent read() calls that end up in
nfs_revalidate_mapping at the same time. The first one clears the
NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.
Just before calling that though, the other task races in, checks the
flag and finds it cleared. At that point, it trusts that the mapping is
good and gets the lock on the page, allowing the read() to be satisfied
from the cache even though the data is no longer valid.
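As an illustration, a rough interleaving of that second race might look
like this (hypothetical task names; not taken from an actual trace):

reader 1                                reader 2
---------------------------------------------------------------------
enters nfs_revalidate_mapping
sees NFS_INO_INVALID_DATA set
clears NFS_INO_INVALID_DATA
                                        enters nfs_revalidate_mapping
                                        sees NFS_INO_INVALID_DATA clear
                                        trusts the mapping; read() is
                                        satisfied from stale pagecache
calls nfs_invalidate_mapping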
These effects are easily manifested by running diotest3 from the LTP
test suite on NFS. That program does a series of DIO writes and buffered
reads. The operations are serialized and page-aligned but the existing
code fails the test since it occasionally allows a read to come out of
the cache incorrectly. While mixing direct and buffered I/O isn't
recommended, I believe it's possible to hit this in other ways that just
use buffered I/O, though that situation is much harder to reproduce.
The problem is that the checking/clearing of that flag and the
invalidation of the mapping really need to be atomic. Fix this by
serializing concurrent invalidations with a bitlock.
At the same time, we also need to allow other places that check
NFS_INO_INVALID_DATA to check whether we might be in the middle of
invalidating the file, so fix up a couple of places that do that
to look for the new NFS_INO_INVALIDATING flag.
Doing this requires us to be careful not to set the bitlock
unnecessarily, so this code only does that if it believes it will
be doing an invalidation.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/dir.c | 3 ++-
fs/nfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++++----
fs/nfs/nfstrace.h | 1 +
fs/nfs/write.c | 6 +++++-
include/linux/nfs_fs.h | 1 +
5 files changed, 47 insertions(+), 6 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b266f73..b39a046 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
new_pos = desc->current_index + i;
if (ctx->attr_gencount != nfsi->attr_gencount
- || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+ || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
} else if (new_pos < desc->ctx->pos) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c63e152..0a972ee 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
if (ret < 0)
return ret;
}
- spin_lock(&inode->i_lock);
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
+ }
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
nfs_fscache_wait_on_invalidate(inode);
@@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
int ret = 0;
/* swapfiles are not supposed to be shared. */
@@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if (ret < 0)
goto out;
}
+
+ /*
+ * We must clear NFS_INO_INVALID_DATA first to ensure that
+ * invalidations that come in while we're shooting down the mappings
+ * are respected. But, that leaves a race window where one revalidator
+ * can clear the flag, and then another checks it before the mapping
+ * gets invalidated. Fix that by serializing access to this part of
+ * the function.
+ *
+ * At the same time, we need to allow other tasks to see whether we
+ * might be in the middle of invalidating the pages, so we only set
+ * the bit lock here if it looks like we're going to be doing that.
+ */
+ for (;;) {
+ ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ goto out;
+ if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ goto out;
+ if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
+ break;
+ }
+
+ spin_lock(&inode->i_lock);
if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
+ } else {
+ /* something raced in and cleared the flag */
+ spin_unlock(&inode->i_lock);
}
+ clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_INVALIDATING);
out:
return ret;
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741..59f838c 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,6 +36,7 @@
__print_flags(v, "|", \
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
+ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a44a872..5511a42 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -909,9 +909,13 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
*/
static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
if (nfs_have_delegated_attributes(inode))
goto out;
- if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+ if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+ return false;
+ if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
return false;
out:
return PageUptodate(page) != 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4899737..18fb16f 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -215,6 +215,7 @@ struct nfs_inode {
#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
+#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
--
1.8.5.3
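For reference, the flow that this patch gives nfs_revalidate_mapping() boils
down to roughly the following. This is a condensed sketch of the hunks above,
not the complete function: the swapfile check, the attribute revalidation,
the tracepoints and the directory cookieverf handling are left out.

	int nfs_revalidate_mapping(struct inode *inode,
				   struct address_space *mapping)
	{
		struct nfs_inode *nfsi = NFS_I(inode);
		unsigned long *bitlock = &nfsi->flags;
		int ret = 0;

		/*
		 * Wait until nobody else holds NFS_INO_INVALIDATING, then
		 * decide whether we need to take it ourselves.
		 */
		for (;;) {
			ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
					  nfs_wait_bit_killable, TASK_KILLABLE);
			if (ret)
				goto out;
			/* Nothing to invalidate? Don't take the bitlock. */
			if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
				goto out;
			if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
				break;	/* we now own the bitlock */
		}

		spin_lock(&inode->i_lock);
		if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
			/*
			 * Clear the flag first, then shoot down the mapping,
			 * so invalidations that arrive while we work are
			 * respected.
			 */
			nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
			spin_unlock(&inode->i_lock);
			ret = nfs_invalidate_mapping(inode, mapping);
		} else {
			/* something raced in and cleared the flag */
			spin_unlock(&inode->i_lock);
		}

		clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
		smp_mb__after_clear_bit();
		wake_up_bit(bitlock, NFS_INO_INVALIDATING);
	out:
		return ret;
	}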
On Jan 28, 2014, at 8:38, Jeff Layton <[email protected]> wrote:
> On Mon, 27 Jan 2014 13:46:15 -0500
> Jeff Layton <[email protected]> wrote:
>
>> There is a possible race in how the nfs_invalidate_mapping function is
>> handled. Currently, we go and invalidate the pages in the file and then
>> clear NFS_INO_INVALID_DATA.
>>
>> The problem is that it's possible for a stale page to creep into the
>> mapping after the page was invalidated (i.e., via readahead). If another
>> writer comes along and sets the flag after that happens but before
>> invalidate_inode_pages2 returns then we could clear the flag
>> without the cache having been properly invalidated.
>>
>> So, we must clear the flag first and then invalidate the pages. Doing
>> this however, opens another race:
>>
>> It's possible to have two concurrent read() calls that end up in
>> nfs_revalidate_mapping at the same time. The first one clears the
>> NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.
>>
>> Just before calling that though, the other task races in, checks the
>> flag and finds it cleared. At that point, it trusts that the mapping is
>> good and gets the lock on the page, allowing the read() to be satisfied
>> from the cache even though the data is no longer valid.
>>
>> These effects are easily manifested by running diotest3 from the LTP
>> test suite on NFS. That program does a series of DIO writes and buffered
>> reads. The operations are serialized and page-aligned but the existing
>> code fails the test since it occasionally allows a read to come out of
>> the cache incorrectly. While mixing direct and buffered I/O isn't
>> recommended, I believe it's possible to hit this in other ways that just
>> use buffered I/O, though that situation is much harder to reproduce.
>>
>> The problem is that the checking/clearing of that flag and the
>> invalidation of the mapping really need to be atomic. Fix this by
>> serializing concurrent invalidations with a bitlock.
>>
>> At the same time, we also need to allow other places that check
>> NFS_INO_INVALID_DATA to check whether we might be in the middle of
>> invalidating the file, so fix up a couple of places that do that
>> to look for the new NFS_INO_INVALIDATING flag.
>>
>> Doing this requires us to be careful not to set the bitlock
>> unnecessarily, so this code only does that if it believes it will
>> be doing an invalidation.
>>
>> Signed-off-by: Jeff Layton <[email protected]>
>> ---
>> fs/nfs/dir.c | 3 ++-
>> fs/nfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++++----
>> fs/nfs/nfstrace.h | 1 +
>> fs/nfs/write.c | 6 +++++-
>> include/linux/nfs_fs.h | 1 +
>> 5 files changed, 47 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
>> index b266f73..b39a046 100644
>> --- a/fs/nfs/dir.c
>> +++ b/fs/nfs/dir.c
>> @@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
>>
>> new_pos = desc->current_index + i;
>> if (ctx->attr_gencount != nfsi->attr_gencount
>> - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
>> + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
>> + || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
>> ctx->duped = 0;
>> ctx->attr_gencount = nfsi->attr_gencount;
>> } else if (new_pos < desc->ctx->pos) {
>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>> index c63e152..0a972ee 100644
>> --- a/fs/nfs/inode.c
>> +++ b/fs/nfs/inode.c
>> @@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
>> if (ret < 0)
>> return ret;
>> }
>> - spin_lock(&inode->i_lock);
>> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
>> - if (S_ISDIR(inode->i_mode))
>> + if (S_ISDIR(inode->i_mode)) {
>> + spin_lock(&inode->i_lock);
>> memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
>> - spin_unlock(&inode->i_lock);
>> + spin_unlock(&inode->i_lock);
>> + }
>> nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
>> nfs_fscache_wait_on_invalidate(inode);
>>
>> @@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
>> int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
>> {
>> struct nfs_inode *nfsi = NFS_I(inode);
>> + unsigned long *bitlock = &nfsi->flags;
>> int ret = 0;
>>
>> /* swapfiles are not supposed to be shared. */
>> @@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
>> if (ret < 0)
>> goto out;
>> }
>> +
>> + /*
>> + * We must clear NFS_INO_INVALID_DATA first to ensure that
>> + * invalidations that come in while we're shooting down the mappings
>> + * are respected. But, that leaves a race window where one revalidator
>> + * can clear the flag, and then another checks it before the mapping
>> + * gets invalidated. Fix that by serializing access to this part of
>> + * the function.
>> + *
>> + * At the same time, we need to allow other tasks to see whether we
>> + * might be in the middle of invalidating the pages, so we only set
>> + * the bit lock here if it looks like we're going to be doing that.
>> + */
>> + for (;;) {
>> + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
>> + nfs_wait_bit_killable, TASK_KILLABLE);
>> + if (ret)
>> + goto out;
>> + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
>> + goto out;
>> + if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
>> + break;
>> + }
>> +
>
> So, I should mention that while the testcase does pass with this
> patchset, I think there are still races in here.
>
> I think it's possible for two tasks to enter this code nearly
> simultaneously such that the bitlock is clear. One gets the lock and
> then clears NFS_INO_INVALID_DATA, and the other then checks the flag,
> finds it clear and exits the function before the pages were invalidated.
So what? Something can always come up later to cause the NFS client to invalidate the cache again. That's pretty much per the definition of weak cache consistency.
All we should need to care about in nfs_revalidate_mapping is that the function respects any and all NFS_INO_INVALID_DATA settings that were in effect _before_ entering the above code.
> I wonder if we'd be better served with a new cache_validity flag
> NFS_INO_INVALIDATING_DATA or something, and not have other functions
> trying to peek at the bitlock.
The one thing that I'm not sure of above is whether or not we need an ACCESS_ONCE() around the read of nfsi->cache_validity. I think not, since wait_on_bit and test_and_set_bit should both contain barriers, but I'm not an ultimate authority in the matter.
--
Trond Myklebust
Linux NFS client maintainer
Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
a potential race, since it doesn't test the value of nfsi->cache_validity
and set the bitlock in nfsi->flags atomically.
Signed-off-by: Trond Myklebust <[email protected]>
Cc: Jeff Layton <[email protected]>
---
fs/nfs/inode.c | 22 ++++++++++------------
1 file changed, 10 insertions(+), 12 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0a972ee9ccc1..8a5bcb6040ac 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1038,24 +1038,22 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
goto out;
- if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ spin_lock(&inode->i_lock);
+ if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) {
+ spin_unlock(&inode->i_lock);
goto out;
+ }
if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
break;
- }
-
- spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
- trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- trace_nfs_invalidate_mapping_exit(inode, ret);
- } else {
- /* something raced in and cleared the flag */
spin_unlock(&inode->i_lock);
}
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
smp_mb__after_clear_bit();
wake_up_bit(bitlock, NFS_INO_INVALIDATING);
--
1.8.5.3
On Tue, 28 Jan 2014 09:49:10 -0500
Trond Myklebust <[email protected]> wrote:
> Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
> of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
> a potential race, since it doesn't test the value of nfsi->cache_validity
> and set the bitlock in nfsi->flags atomically.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> Cc: Jeff Layton <[email protected]>
> ---
> fs/nfs/inode.c | 22 ++++++++++------------
> 1 file changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 0a972ee9ccc1..8a5bcb6040ac 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -1038,24 +1038,22 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> nfs_wait_bit_killable, TASK_KILLABLE);
> if (ret)
> goto out;
> - if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
> + spin_lock(&inode->i_lock);
> + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) {
> + spin_unlock(&inode->i_lock);
> goto out;
> + }
> if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
> break;
> - }
> -
I don't think that patch will help.
Consider the case where we have two tasks that enter
nfs_revalidate_mapping simultaneously. NFS_INO_INVALIDATING is
initially clear and NFS_INO_INVALID_DATA is set:
Task 1                                  Task 2
---------------------------------------------------------------------
wait_on_bit(NFS_INO_INVALIDATING)       wait_on_bit(NFS_INO_INVALIDATING)
gets i_lock                             spins on i_lock
checks NFS_INO_INVALID_DATA
sets NFS_INO_INVALIDATING
clears NFS_INO_INVALID_DATA
drops i_lock                            gets i_lock, finds that
                                        NFS_INO_INVALID_DATA is clear,
                                        and exits the function
calls nfs_invalidate_mapping
...so task2 just returned from nfs_revalidate_mapping before the pages
got invalidated.
> - spin_lock(&inode->i_lock);
> - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> - spin_unlock(&inode->i_lock);
> - trace_nfs_invalidate_mapping_enter(inode);
> - ret = nfs_invalidate_mapping(inode, mapping);
> - trace_nfs_invalidate_mapping_exit(inode, ret);
> - } else {
> - /* something raced in and cleared the flag */
> spin_unlock(&inode->i_lock);
> }
>
> + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> + spin_unlock(&inode->i_lock);
> + trace_nfs_invalidate_mapping_enter(inode);
> + ret = nfs_invalidate_mapping(inode, mapping);
> + trace_nfs_invalidate_mapping_exit(inode, ret);
> +
> clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
> smp_mb__after_clear_bit();
> wake_up_bit(bitlock, NFS_INO_INVALIDATING);
--
Jeff Layton <[email protected]>
On Tue, 28 Jan 2014 10:50:22 -0500
Trond Myklebust <[email protected]> wrote:
> Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
> of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
> a potential race, since it doesn't test the value of nfsi->cache_validity
> and set the bitlock in nfsi->flags atomically.
>
> Signed-off-by: Trond Myklebust <[email protected]>
> Cc: Jeff Layton <[email protected]>
> ---
> fs/nfs/inode.c | 28 ++++++++++++++--------------
> 1 file changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index 0a972ee9ccc1..e5070aa5f175 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -1038,24 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> nfs_wait_bit_killable, TASK_KILLABLE);
> if (ret)
> goto out;
> - if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
> - goto out;
> - if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
> + spin_lock(&inode->i_lock);
> + if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
> + spin_unlock(&inode->i_lock);
> + continue;
> + }
> + if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
> break;
> - }
> -
> - spin_lock(&inode->i_lock);
> - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> - spin_unlock(&inode->i_lock);
> - trace_nfs_invalidate_mapping_enter(inode);
> - ret = nfs_invalidate_mapping(inode, mapping);
> - trace_nfs_invalidate_mapping_exit(inode, ret);
> - } else {
> - /* something raced in and cleared the flag */
> spin_unlock(&inode->i_lock);
> + goto out;
> }
>
> + set_bit(NFS_INO_INVALIDATING, bitlock);
Do we need a memory barrier here to ensure the ordering or does the
set_bit() have one implied? Note that nfs_readdir_search_for_cookie and
nfs_write_pageuptodate don't hold the i_lock when checking these values.
> + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> + spin_unlock(&inode->i_lock);
> + trace_nfs_invalidate_mapping_enter(inode);
> + ret = nfs_invalidate_mapping(inode, mapping);
> + trace_nfs_invalidate_mapping_exit(inode, ret);
> +
> clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
> smp_mb__after_clear_bit();
> wake_up_bit(bitlock, NFS_INO_INVALIDATING);
Other than the point about the mb, I think this will cover it. I think
the patch I sent earlier is easier to prove correct, however...
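For the record, if an explicit barrier were wanted there, I'd assume the
pairing would look roughly like the sketch below (an illustrative sketch on
my part, not something either posted patch does):

	/* invalidating side (nfs_revalidate_mapping), under i_lock */
	set_bit(NFS_INO_INVALIDATING, bitlock);
	smp_wmb();	/* order the set_bit before the clear below */
	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;

	/* lockless checking side (e.g. nfs_write_pageuptodate) */
	if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
		return false;
	smp_rmb();	/* pairs with the smp_wmb() above */
	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
		return false;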
--
Jeff Layton <[email protected]>
Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
a potential race, since it doesn't test the value of nfsi->cache_validity
and set the bitlock in nfsi->flags atomically.
Signed-off-by: Trond Myklebust <[email protected]>
Cc: Jeff Layton <[email protected]>
---
fs/nfs/inode.c | 28 ++++++++++++++--------------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0a972ee9ccc1..e5070aa5f175 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1038,24 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
goto out;
- if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
- goto out;
- if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
break;
- }
-
- spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
- trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- trace_nfs_invalidate_mapping_exit(inode, ret);
- } else {
- /* something raced in and cleared the flag */
spin_unlock(&inode->i_lock);
+ goto out;
}
+ set_bit(NFS_INO_INVALIDATING, bitlock);
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
smp_mb__after_clear_bit();
wake_up_bit(bitlock, NFS_INO_INVALIDATING);
--
1.8.5.3
On Mon, 27 Jan 2014 13:46:15 -0500
Jeff Layton <[email protected]> wrote:
> There is a possible race in how the nfs_invalidate_mapping function is
> handled. Currently, we go and invalidate the pages in the file and then
> clear NFS_INO_INVALID_DATA.
>
> The problem is that it's possible for a stale page to creep into the
> mapping after the page was invalidated (i.e., via readahead). If another
> writer comes along and sets the flag after that happens but before
> invalidate_inode_pages2 returns then we could clear the flag
> without the cache having been properly invalidated.
>
> So, we must clear the flag first and then invalidate the pages. Doing
> this however, opens another race:
>
> It's possible to have two concurrent read() calls that end up in
> nfs_revalidate_mapping at the same time. The first one clears the
> NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.
>
> Just before calling that though, the other task races in, checks the
> flag and finds it cleared. At that point, it trusts that the mapping is
> good and gets the lock on the page, allowing the read() to be satisfied
> from the cache even though the data is no longer valid.
>
> These effects are easily manifested by running diotest3 from the LTP
> test suite on NFS. That program does a series of DIO writes and buffered
> reads. The operations are serialized and page-aligned but the existing
> code fails the test since it occasionally allows a read to come out of
> the cache incorrectly. While mixing direct and buffered I/O isn't
> recommended, I believe it's possible to hit this in other ways that just
> use buffered I/O, though that situation is much harder to reproduce.
>
> The problem is that the checking/clearing of that flag and the
> invalidation of the mapping really need to be atomic. Fix this by
> serializing concurrent invalidations with a bitlock.
>
> At the same time, we also need to allow other places that check
> NFS_INO_INVALID_DATA to check whether we might be in the middle of
> invalidating the file, so fix up a couple of places that do that
> to look for the new NFS_INO_INVALIDATING flag.
>
> Doing this requires us to be careful not to set the bitlock
> unnecessarily, so this code only does that if it believes it will
> be doing an invalidation.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/nfs/dir.c | 3 ++-
> fs/nfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++++----
> fs/nfs/nfstrace.h | 1 +
> fs/nfs/write.c | 6 +++++-
> include/linux/nfs_fs.h | 1 +
> 5 files changed, 47 insertions(+), 6 deletions(-)
>
> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> index b266f73..b39a046 100644
> --- a/fs/nfs/dir.c
> +++ b/fs/nfs/dir.c
> @@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
>
> new_pos = desc->current_index + i;
> if (ctx->attr_gencount != nfsi->attr_gencount
> - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
> + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
> + || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
> ctx->duped = 0;
> ctx->attr_gencount = nfsi->attr_gencount;
> } else if (new_pos < desc->ctx->pos) {
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index c63e152..0a972ee 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
> if (ret < 0)
> return ret;
> }
> - spin_lock(&inode->i_lock);
> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> - if (S_ISDIR(inode->i_mode))
> + if (S_ISDIR(inode->i_mode)) {
> + spin_lock(&inode->i_lock);
> memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
> - spin_unlock(&inode->i_lock);
> + spin_unlock(&inode->i_lock);
> + }
> nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
> nfs_fscache_wait_on_invalidate(inode);
>
> @@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
> int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> {
> struct nfs_inode *nfsi = NFS_I(inode);
> + unsigned long *bitlock = &nfsi->flags;
> int ret = 0;
>
> /* swapfiles are not supposed to be shared. */
> @@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> if (ret < 0)
> goto out;
> }
> +
> + /*
> + * We must clear NFS_INO_INVALID_DATA first to ensure that
> + * invalidations that come in while we're shooting down the mappings
> + * are respected. But, that leaves a race window where one revalidator
> + * can clear the flag, and then another checks it before the mapping
> + * gets invalidated. Fix that by serializing access to this part of
> + * the function.
> + *
> + * At the same time, we need to allow other tasks to see whether we
> + * might be in the middle of invalidating the pages, so we only set
> + * the bit lock here if it looks like we're going to be doing that.
> + */
> + for (;;) {
> + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
> + nfs_wait_bit_killable, TASK_KILLABLE);
> + if (ret)
> + goto out;
> + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
> + goto out;
> + if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
> + break;
> + }
> +
So, I should mention that while the testcase does pass with this
patchset, I think there are still races in here.
I think it's possible for two tasks to enter this code nearly
simultaneously such that the bitlock is clear. One gets the lock and
then clears NFS_INO_INVALID_DATA, and the other then checks the flag,
finds it clear and exits the function before the pages were invalidated.
I wonder if we'd be better served with a new cache_validity flag
NFS_INO_INVALIDATING_DATA or something, and not have other functions
trying to peek at the bitlock.
> + spin_lock(&inode->i_lock);
> if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
> + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> + spin_unlock(&inode->i_lock);
> trace_nfs_invalidate_mapping_enter(inode);
> ret = nfs_invalidate_mapping(inode, mapping);
> trace_nfs_invalidate_mapping_exit(inode, ret);
> + } else {
> + /* something raced in and cleared the flag */
> + spin_unlock(&inode->i_lock);
> }
>
> + clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
> + smp_mb__after_clear_bit();
> + wake_up_bit(bitlock, NFS_INO_INVALIDATING);
> out:
> return ret;
> }
> diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
> index 89fe741..59f838c 100644
> --- a/fs/nfs/nfstrace.h
> +++ b/fs/nfs/nfstrace.h
> @@ -36,6 +36,7 @@
> __print_flags(v, "|", \
> { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
> { 1 << NFS_INO_STALE, "STALE" }, \
> + { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
> { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
> { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
> { 1 << NFS_INO_COMMIT, "COMMIT" }, \
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index a44a872..5511a42 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -909,9 +909,13 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
> */
> static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
> {
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> if (nfs_have_delegated_attributes(inode))
> goto out;
> - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
> + if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
> + return false;
> + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
> return false;
> out:
> return PageUptodate(page) != 0;
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index 4899737..18fb16f 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -215,6 +215,7 @@ struct nfs_inode {
> #define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
> #define NFS_INO_STALE (1) /* possible stale inode */
> #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
> +#define NFS_INO_INVALIDATING (3) /* inode is being invalidated */
> #define NFS_INO_FLUSHING (4) /* inode is flushing out data */
> #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
> #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
--
Jeff Layton <[email protected]>
Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
a potential race, since it doesn't test the value of nfsi->cache_validity
and set the bitlock in nfsi->flags atomically.
Signed-off-by: Trond Myklebust <[email protected]>
Cc: Jeff Layton <[email protected]>
---
fs/nfs/inode.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0a972ee9ccc1..1ff95ba4fea3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1038,23 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
goto out;
- if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA)) {
+ spin_unlock(&inode->i_lock);
goto out;
- if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
- break;
+ }
+ set_bit(NFS_INO_INVALIDATING, bitlock);
+ break;
}
- spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
- trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- trace_nfs_invalidate_mapping_exit(inode, ret);
- } else {
- /* something raced in and cleared the flag */
- spin_unlock(&inode->i_lock);
- }
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
smp_mb__after_clear_bit();
--
1.8.5.3
On Jan 28, 2014, at 11:05, Jeff Layton <[email protected]> wrote:
> On Tue, 28 Jan 2014 10:50:22 -0500
> Trond Myklebust <[email protected]> wrote:
>
>> Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
>> of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
>> a potential race, since it doesn't test the value of nfsi->cache_validity
>> and set the bitlock in nfsi->flags atomically.
>>
>> Signed-off-by: Trond Myklebust <[email protected]>
>> Cc: Jeff Layton <[email protected]>
>> ---
>> fs/nfs/inode.c | 28 ++++++++++++++--------------
>> 1 file changed, 14 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>> index 0a972ee9ccc1..e5070aa5f175 100644
>> --- a/fs/nfs/inode.c
>> +++ b/fs/nfs/inode.c
>> @@ -1038,24 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
>> nfs_wait_bit_killable, TASK_KILLABLE);
>> if (ret)
>> goto out;
>> - if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
>> - goto out;
>> - if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
>> + spin_lock(&inode->i_lock);
>> + if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
>> + spin_unlock(&inode->i_lock);
>> + continue;
>> + }
>> + if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
>> break;
>> - }
>> -
>> - spin_lock(&inode->i_lock);
>> - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
>> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
>> - spin_unlock(&inode->i_lock);
>> - trace_nfs_invalidate_mapping_enter(inode);
>> - ret = nfs_invalidate_mapping(inode, mapping);
>> - trace_nfs_invalidate_mapping_exit(inode, ret);
>> - } else {
>> - /* something raced in and cleared the flag */
>> spin_unlock(&inode->i_lock);
>> + goto out;
>> }
>>
>> + set_bit(NFS_INO_INVALIDATING, bitlock);
>
> Do we need a memory barrier here to ensure the ordering or does the
> set_bit() have one implied? Note that nfs_readdir_search_for_cookie and
> nfs_write_pageuptodate don't hold the i_lock when checking these values.
There seems little point in putting a barrier here. If we need stronger semantics, then we can and should use the spin lock. That said, neither nfs_readdir_search_for_cookie nor nfs_write_pageuptodate is required by close-to-open to have such semantics: the strong checks happen at open().
>> + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
>> + spin_unlock(&inode->i_lock);
>> + trace_nfs_invalidate_mapping_enter(inode);
>> + ret = nfs_invalidate_mapping(inode, mapping);
>> + trace_nfs_invalidate_mapping_exit(inode, ret);
>> +
>> clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
>> smp_mb__after_clear_bit();
>> wake_up_bit(bitlock, NFS_INO_INVALIDATING);
>
> Other than the point about the mb, I think this will cover it. I think
> the patch I sent earlier is easier to prove correct however...
>
--
Trond Myklebust
Linux NFS client maintainer
On Tue, 28 Jan 2014 12:24:34 -0500
Trond Myklebust <[email protected]> wrote:
>
> On Jan 28, 2014, at 11:05, Jeff Layton <[email protected]> wrote:
>
> > On Tue, 28 Jan 2014 10:50:22 -0500
> > Trond Myklebust <[email protected]> wrote:
> >
> >> Commit d529ef83c355f97027ff85298a9709fe06216a66 (NFS: fix the handling
> >> of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping) introduces
> >> a potential race, since it doesn't test the value of nfsi->cache_validity
> >> and set the bitlock in nfsi->flags atomically.
> >>
> >> Signed-off-by: Trond Myklebust <[email protected]>
> >> Cc: Jeff Layton <[email protected]>
> >> ---
> >> fs/nfs/inode.c | 28 ++++++++++++++--------------
> >> 1 file changed, 14 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> >> index 0a972ee9ccc1..e5070aa5f175 100644
> >> --- a/fs/nfs/inode.c
> >> +++ b/fs/nfs/inode.c
> >> @@ -1038,24 +1038,24 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> >> nfs_wait_bit_killable, TASK_KILLABLE);
> >> if (ret)
> >> goto out;
> >> - if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
> >> - goto out;
> >> - if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
> >> + spin_lock(&inode->i_lock);
> >> + if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
> >> + spin_unlock(&inode->i_lock);
> >> + continue;
> >> + }
> >> + if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
> >> break;
> >> - }
> >> -
> >> - spin_lock(&inode->i_lock);
> >> - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
> >> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> >> - spin_unlock(&inode->i_lock);
> >> - trace_nfs_invalidate_mapping_enter(inode);
> >> - ret = nfs_invalidate_mapping(inode, mapping);
> >> - trace_nfs_invalidate_mapping_exit(inode, ret);
> >> - } else {
> >> - /* something raced in and cleared the flag */
> >> spin_unlock(&inode->i_lock);
> >> + goto out;
> >> }
> >>
> >> + set_bit(NFS_INO_INVALIDATING, bitlock);
> >
> > Do we need a memory barrier here to ensure the ordering or does the
> > set_bit() have one implied? Note that nfs_readdir_search_for_cookie and
> > nfs_write_pageuptodate don't hold the i_lock when checking these values.
>
> There seems little point in putting a barrier here. If we need stronger semantics, then we can and should use the spin lock. That said, neither nfs_readdir_search_for_cookie nor nfs_write_pageuptodate is required by close-to-open to have such semantics: the strong checks happen at open().
>
Hmm...I don't know...
The concern I'd have is that if the clearing of NFS_INO_INVALID_DATA
gets reordered before the set of NFS_INO_INVALIDATING, then we could
end up with another task (e.g. in nfs_write_pageuptodate) seeing them
both as clear when the pages really are no longer valid.
Maybe something like this?
-------------------------8<----------------------------
[PATCH] nfs: add memory barriers around NFS_INO_INVALID_DATA and NFS_INO_INVALIDATING
If the setting of NFS_INO_INVALIDATING gets reordered to before the
clearing of NFS_INO_INVALID_DATA, then another task may hit a race
window where both appear to be clear, even though the inode's pages are
still in need of invalidation. Fix this by adding the appropriate memory
barriers.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/dir.c | 14 +++++++++++---
fs/nfs/inode.c | 1 +
fs/nfs/write.c | 1 +
3 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b39a046..be38b57 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -274,6 +274,15 @@ out_eof:
return -EBADCOOKIE;
}
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+ if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ return false;
+ smp_rmb();
+ return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
static
int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
@@ -287,9 +296,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
struct nfs_open_dir_context *ctx = desc->file->private_data;
new_pos = desc->current_index + i;
- if (ctx->attr_gencount != nfsi->attr_gencount
- || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
- || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
+ if (ctx->attr_gencount != nfsi->attr_gencount ||
+ !nfs_readdir_inode_mapping_valid(nfsi)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
} else if (new_pos < desc->ctx->pos) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e5070aa..02e1851 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1050,6 +1050,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
}
set_bit(NFS_INO_INVALIDATING, bitlock);
+ smp_wmb();
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5511a42..9a3b6a4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -915,6 +915,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
goto out;
if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
return false;
+ smp_rmb();
if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
return false;
out:
--
1.8.5.3
On Tue, 28 Jan 2014 08:50:19 -0500
Trond Myklebust <[email protected]> wrote:
>
> On Jan 28, 2014, at 8:38, Jeff Layton <[email protected]> wrote:
>
> > On Mon, 27 Jan 2014 13:46:15 -0500
> > Jeff Layton <[email protected]> wrote:
> >
> >> There is a possible race in how the nfs_invalidate_mapping function is
> >> handled. Currently, we go and invalidate the pages in the file and then
> >> clear NFS_INO_INVALID_DATA.
> >>
> >> The problem is that it's possible for a stale page to creep into the
> >> mapping after the page was invalidated (i.e., via readahead). If another
> >> writer comes along and sets the flag after that happens but before
> >> invalidate_inode_pages2 returns then we could clear the flag
> >> without the cache having been properly invalidated.
> >>
> >> So, we must clear the flag first and then invalidate the pages. Doing
> >> this however, opens another race:
> >>
> >> It's possible to have two concurrent read() calls that end up in
> >> nfs_revalidate_mapping at the same time. The first one clears the
> >> NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.
> >>
> >> Just before calling that though, the other task races in, checks the
> >> flag and finds it cleared. At that point, it trusts that the mapping is
> >> good and gets the lock on the page, allowing the read() to be satisfied
> >> from the cache even though the data is no longer valid.
> >>
> >> These effects are easily manifested by running diotest3 from the LTP
> >> test suite on NFS. That program does a series of DIO writes and buffered
> >> reads. The operations are serialized and page-aligned but the existing
> >> code fails the test since it occasionally allows a read to come out of
> >> the cache incorrectly. While mixing direct and buffered I/O isn't
> >> recommended, I believe it's possible to hit this in other ways that just
> >> use buffered I/O, though that situation is much harder to reproduce.
> >>
> >> The problem is that the checking/clearing of that flag and the
> >> invalidation of the mapping really need to be atomic. Fix this by
> >> serializing concurrent invalidations with a bitlock.
> >>
> >> At the same time, we also need to allow other places that check
> >> NFS_INO_INVALID_DATA to check whether we might be in the middle of
> >> invalidating the file, so fix up a couple of places that do that
> >> to look for the new NFS_INO_INVALIDATING flag.
> >>
> >> Doing this requires us to be careful not to set the bitlock
> >> unnecessarily, so this code only does that if it believes it will
> >> be doing an invalidation.
> >>
> >> Signed-off-by: Jeff Layton <[email protected]>
> >> ---
> >> fs/nfs/dir.c | 3 ++-
> >> fs/nfs/inode.c | 42 ++++++++++++++++++++++++++++++++++++++----
> >> fs/nfs/nfstrace.h | 1 +
> >> fs/nfs/write.c | 6 +++++-
> >> include/linux/nfs_fs.h | 1 +
> >> 5 files changed, 47 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
> >> index b266f73..b39a046 100644
> >> --- a/fs/nfs/dir.c
> >> +++ b/fs/nfs/dir.c
> >> @@ -288,7 +288,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
> >>
> >> new_pos = desc->current_index + i;
> >> if (ctx->attr_gencount != nfsi->attr_gencount
> >> - || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
> >> + || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
> >> + || test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) {
> >> ctx->duped = 0;
> >> ctx->attr_gencount = nfsi->attr_gencount;
> >> } else if (new_pos < desc->ctx->pos) {
> >> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> >> index c63e152..0a972ee 100644
> >> --- a/fs/nfs/inode.c
> >> +++ b/fs/nfs/inode.c
> >> @@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
> >> if (ret < 0)
> >> return ret;
> >> }
> >> - spin_lock(&inode->i_lock);
> >> - nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
> >> - if (S_ISDIR(inode->i_mode))
> >> + if (S_ISDIR(inode->i_mode)) {
> >> + spin_lock(&inode->i_lock);
> >> memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
> >> - spin_unlock(&inode->i_lock);
> >> + spin_unlock(&inode->i_lock);
> >> + }
> >> nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
> >> nfs_fscache_wait_on_invalidate(inode);
> >>
> >> @@ -1008,6 +1008,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
> >> int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> >> {
> >> struct nfs_inode *nfsi = NFS_I(inode);
> >> + unsigned long *bitlock = &nfsi->flags;
> >> int ret = 0;
> >>
> >> /* swapfiles are not supposed to be shared. */
> >> @@ -1019,12 +1020,45 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
> >> if (ret < 0)
> >> goto out;
> >> }
> >> +
> >> + /*
> >> + * We must clear NFS_INO_INVALID_DATA first to ensure that
> >> + * invalidations that come in while we're shooting down the mappings
> >> + * are respected. But, that leaves a race window where one revalidator
> >> + * can clear the flag, and then another checks it before the mapping
> >> + * gets invalidated. Fix that by serializing access to this part of
> >> + * the function.
> >> + *
> >> + * At the same time, we need to allow other tasks to see whether we
> >> + * might be in the middle of invalidating the pages, so we only set
> >> + * the bit lock here if it looks like we're going to be doing that.
> >> + */
> >> + for (;;) {
> >> + ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
> >> + nfs_wait_bit_killable, TASK_KILLABLE);
> >> + if (ret)
> >> + goto out;
> >> + if (!(nfsi->cache_validity & NFS_INO_INVALID_DATA))
> >> + goto out;
> >> + if (!test_and_set_bit_lock(NFS_INO_INVALIDATING, bitlock))
> >> + break;
> >> + }
> >> +
> >
> > So, I should mention that while the testcase does pass with this
> > patchset, I think there are still races in here.
> >
> > I think it's possible for two tasks to enter this code nearly
> > simultaneously such that the bitlock is clear. One gets the lock and
> > then clears NFS_INO_INVALID_DATA, and the other then checks the flag,
> > finds it clear and exits the function before the pages were invalidated.
>
> So what? Something can always come up later to cause the NFS client to invalidate the cache again. That's pretty much per the definition of weak cache consistency.
>
> All we should need to care about in nfs_revalidate_mapping is that the function respects any and all NFS_INO_INVALID_DATA settings that were in effect _before_ entering the above code.
>
Right, and in the race I described (poorly) above, that wouldn't
happen. One task could end up exiting before invalidate_inode_pages2
and never see the flag as set.
The race is somewhat unlikely, but I'd rather we lay this to rest once
and for all while we're in here...
> > I wonder if we'd be better served with a new cache_validity flag
> > NFS_INO_INVALIDATING_DATA or something, and not have other functions
> > trying to peek at the bitlock.
>
> The one thing that I'm not sure of above is whether or not we need an ACCESS_ONCE() around the read of nfsi->cache_validity. I think not, since wait_on_bit and test_and_set_bit should both contain barriers, but I'm not an ultimate authority in the matter.
>
Hmm...I'm not sure either. OTOH, I think a patch like this one
(untested) might be a simpler approach. It does require an extra
cache_validity flag and i_lock acquisition during an invalidation, but
the handling of the bitlock is less fiddly, and other codepaths won't
need to go poking around in nfsi->flags.
-------------------------8<-----------------------------
NFS: fix the handling of NFS_INO_INVALID_DATA flag in nfs_revalidate_mapping
There is a possible race in how the nfs_invalidate_mapping function is
handled. Currently, we go and invalidate the pages in the file and then
clear NFS_INO_INVALID_DATA.
The problem is that it's possible for a stale page to creep into the
mapping after the page was invalidated (i.e., via readahead). If another
writer comes along and sets the flag after that happens but before
invalidate_inode_pages2 returns then we could clear the flag
without the cache having been properly invalidated.
So, we must clear the flag first and then invalidate the pages. Doing
this, however, opens another race:
It's possible to have two concurrent read() calls that end up in
nfs_revalidate_mapping at the same time. The first one clears the
NFS_INO_INVALID_DATA flag and then goes to call nfs_invalidate_mapping.
Just before calling that though, the other task races in, checks the
flag and finds it cleared. At that point, it trusts that the mapping is
good and gets the lock on the page, allowing the read() to be satisfied
from the cache even though the data is no longer valid.
These effects are easily manifested by running diotest3 from the LTP
test suite on NFS. That program does a series of DIO writes and buffered
reads. The operations are serialized and page-aligned but the existing
code fails the test since it occasionally allows a read to come out of
the cache incorrectly. While mixing direct and buffered I/O isn't
recommended, I believe it's possible to hit this in other ways that just
use buffered I/O, though that situation is much harder to reproduce.
The problem is that the checking/clearing of that flag and the
invalidation of the mapping really need to be atomic. Fix this by
serializing concurrent revalidations with a bitlock, and adding a
new cache_validity flag NFS_INO_INVALIDATING that indicates that
the inode is in the middle of having its mapping shot down.
At the same time, we also need to allow other places that check
NFS_INO_INVALID_DATA to check whether we might be in the middle of
invalidating the file, so fix up a couple of places that do that
to look for the new NFS_INO_INVALIDATING flag.
Signed-off-by: Jeff Layton <[email protected]>
---
fs/nfs/dir.c | 2 +-
fs/nfs/inode.c | 43 +++++++++++++++++++++++++++++++++++++++----
fs/nfs/nfstrace.h | 4 +++-
fs/nfs/write.c | 4 +++-
include/linux/nfs_fs.h | 2 ++
5 files changed, 48 insertions(+), 7 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 812154a..36ddf13 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -288,7 +288,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
new_pos = desc->current_index + i;
if (ctx->attr_gencount != nfsi->attr_gencount
- || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+ || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALIDATING))) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
} else if (new_pos < desc->ctx->pos) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00ad1c2..06aef40 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -977,11 +977,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
if (ret < 0)
return ret;
}
- spin_lock(&inode->i_lock);
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
+ }
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
nfs_fscache_wait_on_invalidate(inode);
@@ -1007,6 +1007,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
int ret = 0;
/* swapfiles are not supposed to be shared. */
@@ -1018,12 +1019,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if (ret < 0)
goto out;
}
+
+ /*
+ * We must clear NFS_INO_INVALID_DATA first to ensure that
+ * invalidations that come in while we're shooting down the mappings
+ * are respected. But, that leaves a race window where one revalidator
+ * can clear the flag, and then another checks it before the mapping
+ * gets invalidated. Fix that by serializing access to this part of
+ * the function.
+ */
+ ret = wait_on_bit_lock(bitlock, NFS_INO_REVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ goto out;
+
+ spin_lock(&inode->i_lock);
if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ /*
+ * Set the INVALIDATING flag to indicate that the inode is
+ * currently having its mapping shot down. Clear the
+ * INVALID_DATA flag so that later invalidations are respected.
+ * Note that the order of operations matters here since we
+ * don't allow a race window where both bits are clear.
+ */
+ nfsi->cache_validity |= NFS_INO_INVALIDATING;
+ smp_wmb();
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+
trace_nfs_invalidate_mapping_enter(inode);
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
+
+ spin_lock(&inode->i_lock);
+ nfsi->cache_validity &= ~NFS_INO_INVALIDATING;
}
+ spin_unlock(&inode->i_lock);
+ clear_bit_unlock(NFS_INO_REVALIDATING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_REVALIDATING);
out:
return ret;
}
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741..67019ba 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -30,12 +30,14 @@
{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
- { NFS_INO_INVALID_LABEL, "INVALID_LABEL" })
+ { NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
+ { NFS_INO_INVALIDATING, "INVALIDATING" })
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
+ { 1 << NFS_INO_REVALIDATING, "REVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c1d5482..cd7e521 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -909,9 +909,11 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
*/
static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
if (nfs_have_delegated_attributes(inode))
goto out;
- if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+ if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALIDATING))
return false;
out:
return PageUptodate(page) != 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4899737..08d8226 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -208,6 +208,7 @@ struct nfs_inode {
#define NFS_INO_REVAL_PAGECACHE 0x0020 /* must revalidate pagecache */
#define NFS_INO_REVAL_FORCED 0x0040 /* force revalidation ignoring a delegation */
#define NFS_INO_INVALID_LABEL 0x0080 /* cached label is invalid */
+#define NFS_INO_INVALIDATING 0x0100 /* inode pages being invalidated */
/*
* Bit offsets in flags field
@@ -215,6 +216,7 @@ struct nfs_inode {
#define NFS_INO_ADVISE_RDPLUS (0) /* advise readdirplus */
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
+#define NFS_INO_REVALIDATING (3) /* inode is being revalidated */
#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
#define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */
#define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */
--
1.8.5.3