struct file_ra_state ra.mmap_miss could be accessed concurrently during
page faults as noticed by KCSAN,
BUG: KCSAN: data-race in filemap_fault / filemap_map_pages
write to 0xffff9b1700a2c1b4 of 4 bytes by task 3292 on cpu 30:
filemap_fault+0x920/0xfc0
do_sync_mmap_readahead at mm/filemap.c:2384
(inlined by) filemap_fault at mm/filemap.c:2486
__xfs_filemap_fault+0x112/0x3e0 [xfs]
xfs_filemap_fault+0x74/0x90 [xfs]
__do_fault+0x9e/0x220
do_fault+0x4a0/0x920
__handle_mm_fault+0xc69/0xd00
handle_mm_fault+0xfc/0x2f0
do_page_fault+0x263/0x6f9
page_fault+0x34/0x40
read to 0xffff9b1700a2c1b4 of 4 bytes by task 3313 on cpu 32:
filemap_map_pages+0xc2e/0xd80
filemap_map_pages at mm/filemap.c:2625
do_fault+0x3da/0x920
__handle_mm_fault+0xc69/0xd00
handle_mm_fault+0xfc/0x2f0
do_page_fault+0x263/0x6f9
page_fault+0x34/0x40
Reported by Kernel Concurrency Sanitizer on:
CPU: 32 PID: 3313 Comm: systemd-udevd Tainted: G W L 5.5.0-next-20200210+ #1
Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019
ra.mmap_miss is used to contribute to readahead decisions, so a data race
could be undesirable. Since the stores are aligned and less than
word-size, assume they are safe. Thus, fix it by adding READ_ONCE()
for the loads, except in those places comparing to zero where they are safe.
Signed-off-by: Qian Cai <[email protected]>
---
mm/filemap.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 1784478270e1..b6c1d37f7ea3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2380,14 +2380,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
/* Avoid banging the cache line if not needed */
- if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
+ if (READ_ONCE(ra->mmap_miss) < MMAP_LOTSAMISS * 10)
ra->mmap_miss++;
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
- if (ra->mmap_miss > MMAP_LOTSAMISS)
+ if (READ_ONCE(ra->mmap_miss) > MMAP_LOTSAMISS)
return fpin;
/*
@@ -2418,7 +2418,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
- if (ra->mmap_miss > 0)
+ if (data_race(ra->mmap_miss > 0))
ra->mmap_miss--;
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
@@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
if (page->index >= max_idx)
goto unlock;
- if (file->f_ra.mmap_miss > 0)
+ if (data_race(file->f_ra.mmap_miss > 0))
file->f_ra.mmap_miss--;
vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
--
1.8.3.1
On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> if (page->index >= max_idx)
> goto unlock;
>
> - if (file->f_ra.mmap_miss > 0)
> + if (data_race(file->f_ra.mmap_miss > 0))
> file->f_ra.mmap_miss--;
How is this safe? Two threads can each see 1, and then both decrement the
in-memory copy, causing it to end up at -1.
On Mon, Feb 10, 2020 at 09:25:11AM -0800, Matthew Wilcox wrote:
> On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > if (page->index >= max_idx)
> > goto unlock;
> >
> > - if (file->f_ra.mmap_miss > 0)
> > + if (data_race(file->f_ra.mmap_miss > 0))
> > file->f_ra.mmap_miss--;
>
> How is this safe? Two threads can each see 1, and then both decrement the
> in-memory copy, causing it to end up at -1.
Right, it is bogus.
Below is my completely untested attempt to fix this. It still allows
races, but they will only lead to missed accounting, not to underflow.
diff --git a/mm/filemap.c b/mm/filemap.c
index 1784478270e1..1919d37c646a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2365,6 +2365,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;
+ unsigned mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
@@ -2380,14 +2381,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
/* Avoid banging the cache line if not needed */
- if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
- ra->mmap_miss++;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss < MMAP_LOTSAMISS * 10)
+ WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
- if (ra->mmap_miss > MMAP_LOTSAMISS)
+ if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
/*
@@ -2413,13 +2415,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
+ unsigned int mmap_miss;
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
- if (ra->mmap_miss > 0)
- ra->mmap_miss--;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
@@ -2586,7 +2590,9 @@ void filemap_map_pages(struct vm_fault *vmf,
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *page;
+ unsigned long mmap_miss;
+ mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
if (xas_retry(&xas, page))
@@ -2622,8 +2628,8 @@ void filemap_map_pages(struct vm_fault *vmf,
if (page->index >= max_idx)
goto unlock;
- if (file->f_ra.mmap_miss > 0)
- file->f_ra.mmap_miss--;
+ if (mmap_miss > 0)
+ mmap_miss--;
vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
@@ -2643,6 +2649,7 @@ void filemap_map_pages(struct vm_fault *vmf,
break;
}
rcu_read_unlock();
+ WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);
--
Kirill A. Shutemov
On Mon, 2020-02-10 at 09:25 -0800, Matthew Wilcox wrote:
> On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > if (page->index >= max_idx)
> > goto unlock;
> >
> > - if (file->f_ra.mmap_miss > 0)
> > + if (data_race(file->f_ra.mmap_miss > 0))
> > file->f_ra.mmap_miss--;
>
> How is this safe? Two threads can each see 1, and then both decrement the
> in-memory copy, causing it to end up at -1.
Well, I meant to say it is safe from *data* races rather than all other races,
but it is a good catch for the underflow cases and makes some sense to fix them
together (so we don't need to touch the same lines over and over again).
On Mon, Feb 10, 2020 at 02:20:48PM -0500, Qian Cai wrote:
> On Mon, 2020-02-10 at 09:25 -0800, Matthew Wilcox wrote:
> > On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > > if (page->index >= max_idx)
> > > goto unlock;
> > >
> > > - if (file->f_ra.mmap_miss > 0)
> > > + if (data_race(file->f_ra.mmap_miss > 0))
> > > file->f_ra.mmap_miss--;
> >
> > How is this safe? Two threads can each see 1, and then both decrement the
> > in-memory copy, causing it to end up at -1.
>
> Well, I meant to say it is safe from *data* races rather than all other races,
> but it is a good catch for the underflow cases and makes some sense to fix them
> together (so we don't need to touch the same lines over and over again).
My point is that this is a legitimate warning from the sanitiser.
The point of your patches should not be to remove all the warnings!
On Mon, 2020-02-10 at 21:05 +0300, Kirill A. Shutemov wrote:
> On Mon, Feb 10, 2020 at 09:25:11AM -0800, Matthew Wilcox wrote:
> > On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > > if (page->index >= max_idx)
> > > goto unlock;
> > >
> > > - if (file->f_ra.mmap_miss > 0)
> > > + if (data_race(file->f_ra.mmap_miss > 0))
> > > file->f_ra.mmap_miss--;
> >
> > How is this safe? Two threads can each see 1, and then both decrement the
> > in-memory copy, causing it to end up at -1.
>
> Right, it is bogus.
>
> Below is my completely untested attempt on fix this. It still allows
> races, but they will only lead to missed accounting, but not underflow.
Looks good to me. Do you plan to send out an official patch?
>
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1784478270e1..1919d37c646a 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2365,6 +2365,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> struct address_space *mapping = file->f_mapping;
> struct file *fpin = NULL;
> pgoff_t offset = vmf->pgoff;
> + unsigned mmap_miss;
>
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> @@ -2380,14 +2381,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
> }
>
> /* Avoid banging the cache line if not needed */
> - if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
> - ra->mmap_miss++;
> + mmap_miss = READ_ONCE(ra->mmap_miss);
> + if (mmap_miss < MMAP_LOTSAMISS * 10)
> + WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
>
> /*
> * Do we miss much more than hit in this file? If so,
> * stop bothering with read-ahead. It will only hurt.
> */
> - if (ra->mmap_miss > MMAP_LOTSAMISS)
> + if (mmap_miss > MMAP_LOTSAMISS)
> return fpin;
>
> /*
> @@ -2413,13 +2415,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
> struct file_ra_state *ra = &file->f_ra;
> struct address_space *mapping = file->f_mapping;
> struct file *fpin = NULL;
> + unsigned int mmap_miss;
> pgoff_t offset = vmf->pgoff;
>
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> return fpin;
> - if (ra->mmap_miss > 0)
> - ra->mmap_miss--;
> + mmap_miss = READ_ONCE(ra->mmap_miss);
> + if (mmap_miss)
> + WRITE_ONCE(ra->mmap_miss, --mmap_miss);
> if (PageReadahead(page)) {
> fpin = maybe_unlock_mmap_for_io(vmf, fpin);
> page_cache_async_readahead(mapping, ra, file,
> @@ -2586,7 +2590,9 @@ void filemap_map_pages(struct vm_fault *vmf,
> unsigned long max_idx;
> XA_STATE(xas, &mapping->i_pages, start_pgoff);
> struct page *page;
> + unsigned long mmap_miss;
>
> + mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
> rcu_read_lock();
> xas_for_each(&xas, page, end_pgoff) {
> if (xas_retry(&xas, page))
> @@ -2622,8 +2628,8 @@ void filemap_map_pages(struct vm_fault *vmf,
> if (page->index >= max_idx)
> goto unlock;
>
> - if (file->f_ra.mmap_miss > 0)
> - file->f_ra.mmap_miss--;
> + if (mmap_miss > 0)
> + mmap_miss--;
>
> vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
> if (vmf->pte)
> @@ -2643,6 +2649,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> break;
> }
> rcu_read_unlock();
> + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
> }
> EXPORT_SYMBOL(filemap_map_pages);
>
On Mon, 2020-02-10 at 11:21 -0800, Matthew Wilcox wrote:
> On Mon, Feb 10, 2020 at 02:20:48PM -0500, Qian Cai wrote:
> > On Mon, 2020-02-10 at 09:25 -0800, Matthew Wilcox wrote:
> > > On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > > > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > > > if (page->index >= max_idx)
> > > > goto unlock;
> > > >
> > > > - if (file->f_ra.mmap_miss > 0)
> > > > + if (data_race(file->f_ra.mmap_miss > 0))
> > > > file->f_ra.mmap_miss--;
> > >
> > > How is this safe? Two threads can each see 1, and then both decrement the
> > > in-memory copy, causing it to end up at -1.
> >
> > Well, I meant to say it is safe from *data* races rather than all other races,
> > but it is a good catch for the underflow cases and makes some sense to fix them
> > together (so we don't need to touch the same lines over and over again).
>
> My point is that this is a legitimate warning from the sanitiser.
> The point of your patches should not be to remove all the warnings!
The KCSAN will assume the write is "atomic" if it is aligned and within word-
size which is the case for "ra->mmap_miss", so I somehow skipped auditing the
locking around the concurrent writers, but I got your point. Next time, I'll
spend a bit more time looking.
On Mon, 10 Feb 2020 at 21:28, Qian Cai <[email protected]> wrote:
>
> On Mon, 2020-02-10 at 11:21 -0800, Matthew Wilcox wrote:
> > On Mon, Feb 10, 2020 at 02:20:48PM -0500, Qian Cai wrote:
> > > On Mon, 2020-02-10 at 09:25 -0800, Matthew Wilcox wrote:
> > > > On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > > > > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > > > > if (page->index >= max_idx)
> > > > > goto unlock;
> > > > >
> > > > > - if (file->f_ra.mmap_miss > 0)
> > > > > + if (data_race(file->f_ra.mmap_miss > 0))
> > > > > file->f_ra.mmap_miss--;
> > > >
> > > > How is this safe? Two threads can each see 1, and then both decrement the
> > > > in-memory copy, causing it to end up at -1.
> > >
> > > Well, I meant to say it is safe from *data* races rather than all other races,
> > > but it is a good catch for the underflow cases and makes some sense to fix them
> > > together (so we don't need to touch the same lines over and over again).
> >
> > My point is that this is a legitimate warning from the sanitiser.
> > The point of your patches should not be to remove all the warnings!
>
> The KCSAN will assume the write is "atomic" if it is aligned and within word-
> size which is the case for "ra->mmap_miss", so I somehow skip auditing the
> locking around the concurrent writers, but I got your point. Next time, I'll
> spend a bit more time looking.
Note: the fact that we assume writes aligned up to word-size are
atomic is based on current preferences we were told about. Just
because the tool won't complain right now (although a simple config
switch will make it complain again), we don't want to forget the
writes entirely. If it is a simple write, do the WRITE_ONCE if it
makes sense. I, for one, still can't prove if all compilers won't
screw up a write due to an omitted WRITE_ONCE somehow. [Yes, for more
complex ops like 'var++', turning them into 'WRITE_ONCE(var, var + 1)'
isn't as readable, so these are a bit tricky until we get primitives
to properly deal with them.]
On Mon, Feb 10, 2020 at 02:58:17PM -0500, Qian Cai wrote:
> On Mon, 2020-02-10 at 21:05 +0300, Kirill A. Shutemov wrote:
> > On Mon, Feb 10, 2020 at 09:25:11AM -0800, Matthew Wilcox wrote:
> > > On Mon, Feb 10, 2020 at 12:00:29PM -0500, Qian Cai wrote:
> > > > @@ -2622,7 +2622,7 @@ void filemap_map_pages(struct vm_fault *vmf,
> > > > if (page->index >= max_idx)
> > > > goto unlock;
> > > >
> > > > - if (file->f_ra.mmap_miss > 0)
> > > > + if (data_race(file->f_ra.mmap_miss > 0))
> > > > file->f_ra.mmap_miss--;
> > >
> > > How is this safe? Two threads can each see 1, and then both decrement the
> > > in-memory copy, causing it to end up at -1.
> >
> > Right, it is bogus.
> >
> > Below is my completely untested attempt on fix this. It still allows
> > races, but they will only lead to missed accounting, but not underflow.
>
> Looks good to me. Do you plan to send out an official patch?
Feel free to submit it. After testing.
--
Kirill A. Shutemov