2021-06-17 20:45:00

by Roman Gushchin

[permalink] [raw]
Subject: [PATCH v2] percpu: optimize locking in pcpu_balance_workfn()

pcpu_balance_workfn() unconditionally calls pcpu_balance_free(),
pcpu_reclaim_populated(), pcpu_balance_populated() and
pcpu_balance_free() again.

Each call to pcpu_balance_free() and pcpu_reclaim_populated() will
cause at least one acquisition of the pcpu_lock. So even if the
balancing was scheduled because of a failed atomic allocation,
pcpu_lock will be acquired at least 4 times. This obviously
increases the contention on the pcpu_lock.

To optimize the scheme, let's grab the pcpu_lock at the upper level
(in pcpu_balance_workfn()) and keep it held for the whole duration of
the scheduled work, releasing it only temporarily to perform slow
operations like chunk (de)population and creation of new chunks.

Signed-off-by: Roman Gushchin <[email protected]>
---
mm/percpu.c | 41 +++++++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index e7b9ca82e9aa..deee7e5bb255 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1980,6 +1980,9 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* If empty_only is %false, reclaim all fully free chunks regardless of the
* number of populated pages. Otherwise, only reclaim chunks that have no
* populated pages.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
*/
static void pcpu_balance_free(bool empty_only)
{
@@ -1987,12 +1990,12 @@ static void pcpu_balance_free(bool empty_only)
struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
struct pcpu_chunk *chunk, *next;

+ lockdep_assert_held(&pcpu_lock);
+
/*
* There's no reason to keep around multiple unused chunks and VM
* areas can be scarce. Destroy all free chunks except for one.
*/
- spin_lock_irq(&pcpu_lock);
-
list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);

@@ -2004,8 +2007,10 @@ static void pcpu_balance_free(bool empty_only)
list_move(&chunk->list, &to_free);
}

- spin_unlock_irq(&pcpu_lock);
+ if (list_empty(&to_free))
+ return;

+ spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
unsigned int rs, re;

@@ -2019,6 +2024,7 @@ static void pcpu_balance_free(bool empty_only)
pcpu_destroy_chunk(chunk);
cond_resched();
}
+ spin_lock_irq(&pcpu_lock);
}

/**
@@ -2029,6 +2035,9 @@ static void pcpu_balance_free(bool empty_only)
* OOM killer to be triggered. We should avoid doing so until an actual
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
*/
static void pcpu_balance_populated(void)
{
@@ -2037,6 +2046,8 @@ static void pcpu_balance_populated(void)
struct pcpu_chunk *chunk;
int slot, nr_to_pop, ret;

+ lockdep_assert_held(&pcpu_lock);
+
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -2064,13 +2075,11 @@ static void pcpu_balance_populated(void)
if (!nr_to_pop)
break;

- spin_lock_irq(&pcpu_lock);
list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
- spin_unlock_irq(&pcpu_lock);

if (!nr_unpop)
continue;
@@ -2080,12 +2089,13 @@ static void pcpu_balance_populated(void)
chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);

+ spin_unlock_irq(&pcpu_lock);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (!ret) {
nr_to_pop -= nr;
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_populated(chunk, rs, rs + nr);
- spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
}
@@ -2097,11 +2107,12 @@ static void pcpu_balance_populated(void)

if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
+ spin_unlock_irq(&pcpu_lock);
chunk = pcpu_create_chunk(gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
if (chunk) {
- spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
- spin_unlock_irq(&pcpu_lock);
goto retry_pop;
}
}
@@ -2117,6 +2128,10 @@ static void pcpu_balance_populated(void)
* populated pages threshold, reintegrate the chunk if it has empty free pages.
* Each chunk is scanned in the reverse order to keep populated pages close to
* the beginning of the chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ *
*/
static void pcpu_reclaim_populated(void)
{
@@ -2124,7 +2139,7 @@ static void pcpu_reclaim_populated(void)
struct pcpu_block_md *block;
int i, end;

- spin_lock_irq(&pcpu_lock);
+ lockdep_assert_held(&pcpu_lock);

restart:
/*
@@ -2190,8 +2205,6 @@ static void pcpu_reclaim_populated(void)
list_move(&chunk->list,
&pcpu_chunk_lists[pcpu_sidelined_slot]);
}
-
- spin_unlock_irq(&pcpu_lock);
}

/**
@@ -2212,10 +2225,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
* appropriate.
*/
mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
+
pcpu_balance_free(false);
pcpu_reclaim_populated();
pcpu_balance_populated();
pcpu_balance_free(true);
+
+ spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
}

--
2.31.1


2021-06-17 21:31:15

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v2] percpu: optimize locking in pcpu_balance_workfn()

On Thu, Jun 17, 2021 at 12:03:22PM -0700, Roman Gushchin wrote:
> +++ b/mm/percpu.c
> @@ -1980,6 +1980,9 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
> * If empty_only is %false, reclaim all fully free chunks regardless of the
> * number of populated pages. Otherwise, only reclaim chunks that have no
> * populated pages.
> + *
> + * CONTEXT:
> + * pcpu_lock (can be dropped temporarily)
> */

What's the shouting all about? I would write it like this:

* Context: Process context. Caller must hold pcpu_lock with interrupts
* disabled. This function may drop the lock and re-enable interrupts if
* it needs to sleep, but will return with the lock held and interrupts
* disabled.

2021-06-18 02:40:20

by Dennis Zhou

[permalink] [raw]
Subject: Re: [PATCH v2] percpu: optimize locking in pcpu_balance_workfn()

Hello,

On Thu, Jun 17, 2021 at 08:56:23PM +0100, Matthew Wilcox wrote:
> On Thu, Jun 17, 2021 at 12:03:22PM -0700, Roman Gushchin wrote:
> > +++ b/mm/percpu.c
> > @@ -1980,6 +1980,9 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
> > * If empty_only is %false, reclaim all fully free chunks regardless of the
> > * number of populated pages. Otherwise, only reclaim chunks that have no
> > * populated pages.
> > + *
> > + * CONTEXT:
> > + * pcpu_lock (can be dropped temporarily)
> > */
>
> What's the shouting all about? I would write it like this:

At this point it's just to keep the code consistent.

>
> * Context: Process context. Caller must hold pcpu_lock with interrupts
> * disabled. This function may drop the lock and re-enable interrupts if
> * it needs to sleep, but will return with the lock held and interrupts
> * disabled.
>

This is related to background work done via a workqueue, not in process
context. I'll take a look at cleaning up the documentation in the
future.

Thanks,
Dennis

2021-06-18 06:17:05

by Dennis Zhou

[permalink] [raw]
Subject: Re: [PATCH v2] percpu: optimize locking in pcpu_balance_workfn()

Hello,

On Thu, Jun 17, 2021 at 12:03:22PM -0700, Roman Gushchin wrote:
> pcpu_balance_workfn() unconditionally calls pcpu_balance_free(),
> pcpu_reclaim_populated(), pcpu_balance_populated() and
> pcpu_balance_free() again.
>
> Each call to pcpu_balance_free() and pcpu_reclaim_populated() will
> cause at least one acquisition of the pcpu_lock. So even if the
> balancing was scheduled because of a failed atomic allocation,
> pcpu_lock will be acquired at least 4 times. This obviously
> increases the contention on the pcpu_lock.
>
> To optimize the scheme let's grab the pcpu_lock on the upper level
> (in pcpu_balance_workfn()) and keep it generally locked for the whole
> duration of the scheduled work, but release conditionally to perform
> any slow operations like chunk (de)population and creation of new
> chunks.
>
> Signed-off-by: Roman Gushchin <[email protected]>
> ---
> mm/percpu.c | 41 +++++++++++++++++++++++++++++------------
> 1 file changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/mm/percpu.c b/mm/percpu.c
> index e7b9ca82e9aa..deee7e5bb255 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -1980,6 +1980,9 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
> * If empty_only is %false, reclaim all fully free chunks regardless of the
> * number of populated pages. Otherwise, only reclaim chunks that have no
> * populated pages.
> + *
> + * CONTEXT:
> + * pcpu_lock (can be dropped temporarily)
> */
> static void pcpu_balance_free(bool empty_only)
> {
> @@ -1987,12 +1990,12 @@ static void pcpu_balance_free(bool empty_only)
> struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
> struct pcpu_chunk *chunk, *next;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> /*
> * There's no reason to keep around multiple unused chunks and VM
> * areas can be scarce. Destroy all free chunks except for one.
> */
> - spin_lock_irq(&pcpu_lock);
> -
> list_for_each_entry_safe(chunk, next, free_head, list) {
> WARN_ON(chunk->immutable);
>
> @@ -2004,8 +2007,10 @@ static void pcpu_balance_free(bool empty_only)
> list_move(&chunk->list, &to_free);
> }
>
> - spin_unlock_irq(&pcpu_lock);
> + if (list_empty(&to_free))
> + return;
>
> + spin_unlock_irq(&pcpu_lock);
> list_for_each_entry_safe(chunk, next, &to_free, list) {
> unsigned int rs, re;
>
> @@ -2019,6 +2024,7 @@ static void pcpu_balance_free(bool empty_only)
> pcpu_destroy_chunk(chunk);
> cond_resched();
> }
> + spin_lock_irq(&pcpu_lock);
> }
>
> /**
> @@ -2029,6 +2035,9 @@ static void pcpu_balance_free(bool empty_only)
> * OOM killer to be triggered. We should avoid doing so until an actual
> * allocation causes the failure as it is possible that requests can be
> * serviced from already backed regions.
> + *
> + * CONTEXT:
> + * pcpu_lock (can be dropped temporarily)
> */
> static void pcpu_balance_populated(void)
> {
> @@ -2037,6 +2046,8 @@ static void pcpu_balance_populated(void)
> struct pcpu_chunk *chunk;
> int slot, nr_to_pop, ret;
>
> + lockdep_assert_held(&pcpu_lock);
> +
> /*
> * Ensure there are certain number of free populated pages for
> * atomic allocs. Fill up from the most packed so that atomic
> @@ -2064,13 +2075,11 @@ static void pcpu_balance_populated(void)
> if (!nr_to_pop)
> break;
>
> - spin_lock_irq(&pcpu_lock);
> list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
> nr_unpop = chunk->nr_pages - chunk->nr_populated;
> if (nr_unpop)
> break;
> }
> - spin_unlock_irq(&pcpu_lock);
>
> if (!nr_unpop)
> continue;
> @@ -2080,12 +2089,13 @@ static void pcpu_balance_populated(void)
> chunk->nr_pages) {
> int nr = min_t(int, re - rs, nr_to_pop);
>
> + spin_unlock_irq(&pcpu_lock);
> ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
> + cond_resched();
> + spin_lock_irq(&pcpu_lock);
> if (!ret) {
> nr_to_pop -= nr;
> - spin_lock_irq(&pcpu_lock);
> pcpu_chunk_populated(chunk, rs, rs + nr);
> - spin_unlock_irq(&pcpu_lock);
> } else {
> nr_to_pop = 0;
> }
> @@ -2097,11 +2107,12 @@ static void pcpu_balance_populated(void)
>
> if (nr_to_pop) {
> /* ran out of chunks to populate, create a new one and retry */
> + spin_unlock_irq(&pcpu_lock);
> chunk = pcpu_create_chunk(gfp);
> + cond_resched();
> + spin_lock_irq(&pcpu_lock);
> if (chunk) {
> - spin_lock_irq(&pcpu_lock);
> pcpu_chunk_relocate(chunk, -1);
> - spin_unlock_irq(&pcpu_lock);
> goto retry_pop;
> }
> }
> @@ -2117,6 +2128,10 @@ static void pcpu_balance_populated(void)
> * populated pages threshold, reintegrate the chunk if it has empty free pages.
> * Each chunk is scanned in the reverse order to keep populated pages close to
> * the beginning of the chunk.
> + *
> + * CONTEXT:
> + * pcpu_lock (can be dropped temporarily)
> + *
> */
> static void pcpu_reclaim_populated(void)
> {
> @@ -2124,7 +2139,7 @@ static void pcpu_reclaim_populated(void)
> struct pcpu_block_md *block;
> int i, end;
>
> - spin_lock_irq(&pcpu_lock);
> + lockdep_assert_held(&pcpu_lock);
>
> restart:
> /*
> @@ -2190,8 +2205,6 @@ static void pcpu_reclaim_populated(void)
> list_move(&chunk->list,
> &pcpu_chunk_lists[pcpu_sidelined_slot]);
> }
> -
> - spin_unlock_irq(&pcpu_lock);
> }
>
> /**
> @@ -2212,10 +2225,14 @@ static void pcpu_balance_workfn(struct work_struct *work)
> * appropriate.
> */
> mutex_lock(&pcpu_alloc_mutex);
> + spin_lock_irq(&pcpu_lock);
> +
> pcpu_balance_free(false);
> pcpu_reclaim_populated();
> pcpu_balance_populated();
> pcpu_balance_free(true);
> +
> + spin_unlock_irq(&pcpu_lock);
> mutex_unlock(&pcpu_alloc_mutex);
> }
>
> --
> 2.31.1
>

I've applied this to for-5.14.

Thanks,
Dennis