A data race occurs when pcpu_nr_empty_pop_pages is read in
pcpu_alloc() while it is being written concurrently in
pcpu_update_empty_pages().
===========
read-write to 0xffffffff882fdd4c of 4 bytes by task 9424 on cpu 0:
pcpu_update_empty_pages
pcpu_chunk_populated
pcpu_balance_populated
pcpu_balance_workfn
process_one_work
worker_thread
kthread
ret_from_fork
read to 0xffffffff882fdd4c of 4 bytes by task 9386 on cpu 3:
pcpu_alloc
__alloc_percpu_gfp
fib_nh_common_init
fib_nh_init
fib_create_info
fib_table_insert
fib_magic
......
sock_sendmsg_nosec
sock_sendmsg
__sys_sendto
__do_sys_sendto
__se_sys_sendto
__x64_sys_sendto
do_syscall_64
entry_SYSCALL_64_after_hwframe
============
The same problem exists in pcpu_reclaim_populated(),
pcpu_update_empty_pages(), and pcpu_isolate_chunk().
Fix this by using READ_ONCE() and WRITE_ONCE() to read and write
pcpu_nr_empty_pop_pages.
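Reduced to a minimal model, the racing accesses look like this
(illustrative only, not the kernel code verbatim):

    static int pcpu_nr_empty_pop_pages;    /* shared counter */

    /* CPU 0: pcpu_update_empty_pages(), runs under pcpu_lock */
    pcpu_nr_empty_pop_pages += nr;         /* plain read-modify-write */

    /* CPU 3: pcpu_alloc(), reads without holding pcpu_lock */
    if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) /* plain read */
            pcpu_schedule_balance_work();

Marking both accesses with READ_ONCE()/WRITE_ONCE() prevents the
compiler from tearing or refetching them and tells KCSAN that the
lockless access is intentional.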
Signed-off-by: Yuanzheng Song <[email protected]>
---
mm/percpu.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 293009cc03ef..e8ef92e698ab 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -574,7 +574,9 @@ static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
if (!chunk->isolated) {
chunk->isolated = true;
- pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
+ WRITE_ONCE(pcpu_nr_empty_pop_pages,
+ READ_ONCE(pcpu_nr_empty_pop_pages) -
+ chunk->nr_empty_pop_pages);
}
list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}
@@ -585,7 +587,9 @@ static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
if (chunk->isolated) {
chunk->isolated = false;
- pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+ WRITE_ONCE(pcpu_nr_empty_pop_pages,
+ READ_ONCE(pcpu_nr_empty_pop_pages) +
+ chunk->nr_empty_pop_pages);
pcpu_chunk_relocate(chunk, -1);
}
}
@@ -603,7 +607,8 @@ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr;
if (chunk != pcpu_reserved_chunk && !chunk->isolated)
- pcpu_nr_empty_pop_pages += nr;
+ WRITE_ONCE(pcpu_nr_empty_pop_pages,
+ READ_ONCE(pcpu_nr_empty_pop_pages) + nr);
}
/*
@@ -1874,7 +1879,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
mutex_unlock(&pcpu_alloc_mutex);
}
- if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+ if (READ_ONCE(pcpu_nr_empty_pop_pages) < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
/* clear the areas and return address relative to base address */
@@ -2062,7 +2067,7 @@ static void pcpu_balance_populated(void)
pcpu_atomic_alloc_failed = false;
} else {
nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
- pcpu_nr_empty_pop_pages,
+ READ_ONCE(pcpu_nr_empty_pop_pages),
0, PCPU_EMPTY_POP_PAGES_HIGH);
}
@@ -2163,7 +2168,8 @@ static void pcpu_reclaim_populated(void)
break;
/* reintegrate chunk to prevent atomic alloc failures */
- if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+ if (READ_ONCE(pcpu_nr_empty_pop_pages) <
+ PCPU_EMPTY_POP_PAGES_HIGH) {
reintegrate = true;
goto end_chunk;
}
--
2.25.1
On Mon, 25 Oct 2021, Yuanzheng Song wrote:
> A data race occurs when pcpu_nr_empty_pop_pages is read in
> pcpu_alloc() while it is being written concurrently in
> pcpu_update_empty_pages().
Looks like a use case for the atomic RMW instructions.
> Fix this by using READ_ONCE() and WRITE_ONCE() to read and write
> pcpu_nr_empty_pop_pages.
Never thought that READ_ONCE and WRITE_ONCE can fix races like
this. Really?
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 293009cc03ef..e8ef92e698ab 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -574,7 +574,9 @@ static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
>
> if (!chunk->isolated) {
> chunk->isolated = true;
> - pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
> + WRITE_ONCE(pcpu_nr_empty_pop_pages,
> + READ_ONCE(pcpu_nr_empty_pop_pages) -
> + chunk->nr_empty_pop_pages);
atomic_sub()?
> }
> list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
> }
> @@ -585,7 +587,9 @@ static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
>
> if (chunk->isolated) {
> chunk->isolated = false;
> - pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
> + WRITE_ONCE(pcpu_nr_empty_pop_pages,
> + READ_ONCE(pcpu_nr_empty_pop_pages) +
> + chunk->nr_empty_pop_pages);
atomic_add()?
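Concretely, with pcpu_nr_empty_pop_pages converted to an atomic_t,
the updates above would become atomic RMWs along these lines (a
sketch of the suggestion, not an actual follow-up patch):

    static atomic_t pcpu_nr_empty_pop_pages = ATOMIC_INIT(0);

    /* pcpu_isolate_chunk() */
    atomic_sub(chunk->nr_empty_pop_pages, &pcpu_nr_empty_pop_pages);

    /* pcpu_reintegrate_chunk() */
    atomic_add(chunk->nr_empty_pop_pages, &pcpu_nr_empty_pop_pages);

    /* pcpu_update_empty_pages() */
    atomic_add(nr, &pcpu_nr_empty_pop_pages);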
Hello,
On Mon, Oct 25, 2021 at 09:50:48AM +0200, Christoph Lameter wrote:
> On Mon, 25 Oct 2021, Yuanzheng Song wrote:
>
> > A data race occurs when pcpu_nr_empty_pop_pages is read in
> > pcpu_alloc() while it is being written concurrently in
> > pcpu_update_empty_pages().
>
> Looks like a use case for the atomic RMW instructions.
>
Yeah. I see two options: switch the variable over to an atomic, or
move the read behind pcpu_lock. All the writes are already behind it;
otherwise that would actually be problematic. In this particular case,
reading a wrong # of empty pages isn't a big deal, as the background
work will eventually get scheduled.
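The lock-based option would amount to something like this in
pcpu_alloc() (sketch only):

    unsigned long flags;
    bool need_balance;

    spin_lock_irqsave(&pcpu_lock, flags);
    need_balance = pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW;
    spin_unlock_irqrestore(&pcpu_lock, flags);

    if (need_balance)
            pcpu_schedule_balance_work();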
Thanks,
Dennis
> > Fix this by using READ_ONCE() and WRITE_ONCE() to read and write
> > pcpu_nr_empty_pop_pages.
>
> Never thought that READ_ONCE and WRITE_ONCE can fix races like
> this. Really?
>
> > diff --git a/mm/percpu.c b/mm/percpu.c
> > index 293009cc03ef..e8ef92e698ab 100644
> > --- a/mm/percpu.c
> > +++ b/mm/percpu.c
> > @@ -574,7 +574,9 @@ static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
> >
> > if (!chunk->isolated) {
> > chunk->isolated = true;
> > - pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
> > + WRITE_ONCE(pcpu_nr_empty_pop_pages,
> > + READ_ONCE(pcpu_nr_empty_pop_pages) -
> > + chunk->nr_empty_pop_pages);
>
> atomic_sub()?
>
> > }
> > list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
> > }
> > @@ -585,7 +587,9 @@ static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
> >
> > if (chunk->isolated) {
> > chunk->isolated = false;
> > - pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
> > + WRITE_ONCE(pcpu_nr_empty_pop_pages,
> > + READ_ONCE(pcpu_nr_empty_pop_pages) +
> > + chunk->nr_empty_pop_pages);
>
> atomic_add()?
>
Hello,
Thanks for the advice, Dennis Zhou and Christoph Lameter.
I really appreciate it.
I edited the patch to change pcpu_nr_empty_pop_pages to an atomic_t variable.
Here is the v2 patch: https://patchwork.kernel.org/project/linux-mm/patch/[email protected]/.
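Roughly, the read sides in that conversion become plain
atomic_read()s (a sketch of the approach, not the v2 patch verbatim):

    /* pcpu_alloc() */
    if (atomic_read(&pcpu_nr_empty_pop_pages) < PCPU_EMPTY_POP_PAGES_LOW)
            pcpu_schedule_balance_work();

    /* pcpu_balance_populated() */
    nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                      atomic_read(&pcpu_nr_empty_pop_pages),
                      0, PCPU_EMPTY_POP_PAGES_HIGH);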
Would you mind reviewing it again?
Thanks,
Yuanzheng Song
-----Original Message-----
From: Dennis Zhou [mailto:[email protected]]
Sent: Tuesday, October 26, 2021 10:42 AM
To: Christoph Lameter <[email protected]>
Cc: songyuanzheng <[email protected]>; [email protected]; [email protected]; [email protected]; [email protected]; [email protected]
Subject: Re: [PATCH -next] mm/percpu: fix data-race with pcpu_nr_empty_pop_pages