Percpu counter's compare and add are separate functions: without locking
around them (which would defeat their purpose), it has been possible to
overflow the intended limit. Imagine all the other CPUs fallocating
tmpfs huge pages to the limit, in between this CPU's compare and its add.
I have not seen reports of that happening; but tmpfs's recent addition
of dquot_alloc_block_nodirty() in between the compare and the add makes
it even more likely, and I'd be uncomfortable to leave it unfixed.
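In outline (see the mm/shmem.c hunk below), the racy sequence has been:

	if (percpu_counter_compare(&sbinfo->used_blocks,
				   sbinfo->max_blocks - pages) > 0)
		goto unacct;
	err = dquot_alloc_block_nodirty(inode, pages);
	if (err)
		goto unacct;
	/* other CPUs may add to used_blocks in this window */
	percpu_counter_add(&sbinfo->used_blocks, pages);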
Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
I believe this implementation is correct, and slightly more efficient
than the combination of compare and add (taking the lock once rather
than twice when nearing full - the last 128MiB of a tmpfs volume on a
machine with 128 CPUs and 4KiB pages); but it does beg for a better
design - when nearing full, there is no new batching, but the costly
percpu counter sum across CPUs still has to be done, while locked.
Follow __percpu_counter_sum()'s example, including cpu_dying_mask as
well as cpu_online_mask: but shouldn't __percpu_counter_compare() and
__percpu_counter_limited_add() then be adding a num_dying_cpus() to
num_online_cpus(), when they calculate the maximum which could be held
across CPUs? But the times when it matters would be vanishingly rare.
Signed-off-by: Hugh Dickins <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Darrick J. Wong <[email protected]>
---
Tim, Dave, Darrick: I didn't want to waste your time on patches 1-7,
which are just internal to shmem, and do not affect this patch (which
applies to v6.6-rc and linux-next as is): but want to run this by you.
include/linux/percpu_counter.h | 23 +++++++++++++++
lib/percpu_counter.c | 53 ++++++++++++++++++++++++++++++++++
mm/shmem.c | 10 +++----
3 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..8cb7c071bd5c 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
+ s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+ return __percpu_counter_limited_add(fbc, limit, amount,
+ percpu_counter_batch);
+}
+
/*
* With percpu_counter_add_local() and percpu_counter_sub_local(), counts
* are accumulated in local per cpu counter and not in fbc->count until
@@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
local_irq_restore(flags);
}
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+ unsigned long flags;
+ s64 count;
+
+ local_irq_save(flags);
+ count = fbc->count + amount;
+ if (count <= limit)
+ fbc->count = count;
+ local_irq_restore(flags);
+ return count <= limit;
+}
+
/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9073430dc865..58a3392f471b 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
}
EXPORT_SYMBOL(__percpu_counter_compare);
+/*
+ * Compare counter, and add amount if the total is within limit.
+ * Return true if amount was added, false if it would exceed limit.
+ */
+bool __percpu_counter_limited_add(struct percpu_counter *fbc,
+ s64 limit, s64 amount, s32 batch)
+{
+ s64 count;
+ s64 unknown;
+ unsigned long flags;
+ bool good;
+
+ if (amount > limit)
+ return false;
+
+ local_irq_save(flags);
+ unknown = batch * num_online_cpus();
+ count = __this_cpu_read(*fbc->counters);
+
+ /* Skip taking the lock when safe */
+ if (abs(count + amount) <= batch &&
+ fbc->count + unknown <= limit) {
+ this_cpu_add(*fbc->counters, amount);
+ local_irq_restore(flags);
+ return true;
+ }
+
+ raw_spin_lock(&fbc->lock);
+ count = fbc->count + amount;
+
+ /* Skip percpu_counter_sum() when safe */
+ if (count + unknown > limit) {
+ s32 *pcount;
+ int cpu;
+
+ for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
+ pcount = per_cpu_ptr(fbc->counters, cpu);
+ count += *pcount;
+ }
+ }
+
+ good = count <= limit;
+ if (good) {
+ count = __this_cpu_read(*fbc->counters);
+ fbc->count += count + amount;
+ __this_cpu_sub(*fbc->counters, count);
+ }
+
+ raw_spin_unlock(&fbc->lock);
+ local_irq_restore(flags);
+ return good;
+}
+
static int __init percpu_counter_startup(void)
{
int ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4f4ab26bc58a..7cb72c747954 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -217,15 +217,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages)
might_sleep(); /* when quotas */
if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - pages) > 0)
+ if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+ sbinfo->max_blocks, pages))
goto unacct;
err = dquot_alloc_block_nodirty(inode, pages);
- if (err)
+ if (err) {
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
goto unacct;
-
- percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
} else {
err = dquot_alloc_block_nodirty(inode, pages);
if (err)
--
2.35.3
On Fri 29-09-23 20:42:45, Hugh Dickins wrote:
> Percpu counter's compare and add are separate functions: without locking
> around them (which would defeat their purpose), it has been possible to
> overflow the intended limit. Imagine all the other CPUs fallocating
> tmpfs huge pages to the limit, in between this CPU's compare and its add.
>
> I have not seen reports of that happening; but tmpfs's recent addition
> of dquot_alloc_block_nodirty() in between the compare and the add makes
> it even more likely, and I'd be uncomfortable to leave it unfixed.
>
> Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
>
> I believe this implementation is correct, and slightly more efficient
> than the combination of compare and add (taking the lock once rather
> than twice when nearing full - the last 128MiB of a tmpfs volume on a
> machine with 128 CPUs and 4KiB pages); but it does beg for a better
> design - when nearing full, there is no new batching, but the costly
> percpu counter sum across CPUs still has to be done, while locked.
>
> Follow __percpu_counter_sum()'s example, including cpu_dying_mask as
> well as cpu_online_mask: but shouldn't __percpu_counter_compare() and
> __percpu_counter_limited_add() then be adding a num_dying_cpus() to
> num_online_cpus(), when they calculate the maximum which could be held
> across CPUs? But the times when it matters would be vanishingly rare.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Darrick J. Wong <[email protected]>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <[email protected]>
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Fri, Sep 29, 2023 at 08:42:45PM -0700, Hugh Dickins wrote:
> Percpu counter's compare and add are separate functions: without locking
> around them (which would defeat their purpose), it has been possible to
> overflow the intended limit. Imagine all the other CPUs fallocating
> tmpfs huge pages to the limit, in between this CPU's compare and its add.
>
> I have not seen reports of that happening; but tmpfs's recent addition
> of dquot_alloc_block_nodirty() in between the compare and the add makes
> it even more likely, and I'd be uncomfortable to leave it unfixed.
>
> Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
>
> I believe this implementation is correct, and slightly more efficient
> than the combination of compare and add (taking the lock once rather
> than twice when nearing full - the last 128MiB of a tmpfs volume on a
> machine with 128 CPUs and 4KiB pages); but it does beg for a better
> design - when nearing full, there is no new batching, but the costly
> percpu counter sum across CPUs still has to be done, while locked.
>
> Follow __percpu_counter_sum()'s example, including cpu_dying_mask as
> well as cpu_online_mask: but shouldn't __percpu_counter_compare() and
> __percpu_counter_limited_add() then be adding a num_dying_cpus() to
> num_online_cpus(), when they calculate the maximum which could be held
> across CPUs? But the times when it matters would be vanishingly rare.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Darrick J. Wong <[email protected]>
> ---
> Tim, Dave, Darrick: I didn't want to waste your time on patches 1-7,
> which are just internal to shmem, and do not affect this patch (which
> applies to v6.6-rc and linux-next as is): but want to run this by you.
Hmmmm. IIUC, this only works for addition that approaches the limit
from below?
So if we are approaching the limit from above (i.e. add of a
negative amount, limit is zero) then this code doesn't work the same
as the open-coded compare+add operation would?
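i.e. the two patterns are roughly (a sketch, with made-up counter names):

	/* shmem style: count used space up towards a maximum */
	if (!percpu_counter_limited_add(&used_blocks, max_blocks, pages))
		return -ENOSPC;

	/* XFS/ext4 style: count free space down towards zero */
	if (!percpu_counter_limited_add(&free_blocks, 0, -(s64)pages))
		return -ENOSPC;

and it's only the first of those that the version posted here handles.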
Hence I think this looks like an "add if result is less than"
operation, which is distinct from the "add if result is greater
than" operation that we use this same pattern for in XFS and ext4.
Perhaps a better name is in order?
I'm also not a great fan of having two
similar-but-not-quite-the-same implementations for the two
comparisons, but unless we decide to convert the XFS slow path to
this it doesn't matter that much at the moment....
Implementation seems OK at a quick glance, though.
Cheers,
Dave.
--
Dave Chinner
[email protected]
>Signed-off-by: Hugh Dickins <[email protected]>
>Cc: Tim Chen <[email protected]>
>Cc: Dave Chinner <[email protected]>
>Cc: Darrick J. Wong <[email protected]>
>---
>Tim, Dave, Darrick: I didn't want to waste your time on patches 1-7, which are
>just internal to shmem, and do not affect this patch (which applies to v6.6-rc
>and linux-next as is): but want to run this by you.
>
> include/linux/percpu_counter.h | 23 +++++++++++++++
> lib/percpu_counter.c | 53 ++++++++++++++++++++++++++++++++++
> mm/shmem.c | 10 +++----
> 3 files changed, 81 insertions(+), 5 deletions(-)
>
>diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
>index d01351b1526f..8cb7c071bd5c 100644
>--- a/include/linux/percpu_counter.h
>+++ b/include/linux/percpu_counter.h
>@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
> s32 batch);
> s64 __percpu_counter_sum(struct percpu_counter *fbc);
> int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
>+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
>+ s64 amount, s32 batch);
> void percpu_counter_sync(struct percpu_counter *fbc);
>
> static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
>@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
> }
>
>+static inline bool
>+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
>+{
>+ return __percpu_counter_limited_add(fbc, limit, amount,
>+ percpu_counter_batch);
>+}
>+
> /*
> * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
> * are accumulated in local per cpu counter and not in fbc->count until
>@@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> local_irq_restore(flags);
> }
>
>+static inline bool
>+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
>+{
>+ unsigned long flags;
>+ s64 count;
>+
>+ local_irq_save(flags);
>+ count = fbc->count + amount;
>+ if (count <= limit)
>+ fbc->count = count;
>+ local_irq_restore(flags);
>+ return count <= limit;
>+}
>+
> /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
> static inline void
> percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
>diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
>index 9073430dc865..58a3392f471b 100644
>--- a/lib/percpu_counter.c
>+++ b/lib/percpu_counter.c
>@@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
> }
> EXPORT_SYMBOL(__percpu_counter_compare);
>
>+/*
>+ * Compare counter, and add amount if the total is within limit.
>+ * Return true if amount was added, false if it would exceed limit.
>+ */
>+bool __percpu_counter_limited_add(struct percpu_counter *fbc,
>+ s64 limit, s64 amount, s32 batch)
>+{
>+ s64 count;
>+ s64 unknown;
>+ unsigned long flags;
>+ bool good;
>+
>+ if (amount > limit)
>+ return false;
>+
>+ local_irq_save(flags);
>+ unknown = batch * num_online_cpus();
>+ count = __this_cpu_read(*fbc->counters);
>+
>+ /* Skip taking the lock when safe */
>+ if (abs(count + amount) <= batch &&
>+ fbc->count + unknown <= limit) {
>+ this_cpu_add(*fbc->counters, amount);
>+ local_irq_restore(flags);
>+ return true;
>+ }
>+
>+ raw_spin_lock(&fbc->lock);
>+ count = fbc->count + amount;
>+
Perhaps we can fast path the case where for sure
we will exceed limit?
if (fbc->count + amount - unknown > limit)
return false;
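Something like this, just after taking the lock (an untested sketch; the
lock and irq state have to be unwound on that path):

	raw_spin_lock(&fbc->lock);
	count = fbc->count + amount;

	/* no per-cpu contribution could bring the total back within limit */
	if (count - unknown > limit) {
		raw_spin_unlock(&fbc->lock);
		local_irq_restore(flags);
		return false;
	}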
Tim
>+ /* Skip percpu_counter_sum() when safe */
>+ if (count + unknown > limit) {
>+ s32 *pcount;
>+ int cpu;
>+
>+ for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
>+ pcount = per_cpu_ptr(fbc->counters, cpu);
>+ count += *pcount;
>+ }
>+ }
>+
>+ good = count <= limit;
>+ if (good) {
>+ count = __this_cpu_read(*fbc->counters);
>+ fbc->count += count + amount;
>+ __this_cpu_sub(*fbc->counters, count);
>+ }
>+
>+ raw_spin_unlock(&fbc->lock);
>+ local_irq_restore(flags);
>+ return good;
>+}
>+
> static int __init percpu_counter_startup(void)
> {
> int ret;
>diff --git a/mm/shmem.c b/mm/shmem.c
>index 4f4ab26bc58a..7cb72c747954 100644
>--- a/mm/shmem.c
>+++ b/mm/shmem.c
>@@ -217,15 +217,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages)
>
> might_sleep(); /* when quotas */
> if (sbinfo->max_blocks) {
>- if (percpu_counter_compare(&sbinfo->used_blocks,
>- sbinfo->max_blocks - pages) > 0)
>+ if (!percpu_counter_limited_add(&sbinfo->used_blocks,
>+ sbinfo->max_blocks, pages))
> goto unacct;
>
> err = dquot_alloc_block_nodirty(inode, pages);
>- if (err)
>+ if (err) {
>+ percpu_counter_sub(&sbinfo->used_blocks, pages);
> goto unacct;
>-
>- percpu_counter_add(&sbinfo->used_blocks, pages);
>+ }
> } else {
> err = dquot_alloc_block_nodirty(inode, pages);
> if (err)
>--
>2.35.3
On Thu, 5 Oct 2023, Dave Chinner wrote:
> On Fri, Sep 29, 2023 at 08:42:45PM -0700, Hugh Dickins wrote:
> > Percpu counter's compare and add are separate functions: without locking
> > around them (which would defeat their purpose), it has been possible to
> > overflow the intended limit. Imagine all the other CPUs fallocating
> > tmpfs huge pages to the limit, in between this CPU's compare and its add.
> >
> > I have not seen reports of that happening; but tmpfs's recent addition
> > of dquot_alloc_block_nodirty() in between the compare and the add makes
> > it even more likely, and I'd be uncomfortable to leave it unfixed.
> >
> > Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
> >
> > I believe this implementation is correct, and slightly more efficient
> > than the combination of compare and add (taking the lock once rather
> > than twice when nearing full - the last 128MiB of a tmpfs volume on a
> > machine with 128 CPUs and 4KiB pages); but it does beg for a better
> > design - when nearing full, there is no new batching, but the costly
> > percpu counter sum across CPUs still has to be done, while locked.
> >
> > Follow __percpu_counter_sum()'s example, including cpu_dying_mask as
> > well as cpu_online_mask: but shouldn't __percpu_counter_compare() and
> > __percpu_counter_limited_add() then be adding a num_dying_cpus() to
> > num_online_cpus(), when they calculate the maximum which could be held
> > across CPUs? But the times when it matters would be vanishingly rare.
> >
> > Signed-off-by: Hugh Dickins <[email protected]>
> > Cc: Tim Chen <[email protected]>
> > Cc: Dave Chinner <[email protected]>
> > Cc: Darrick J. Wong <[email protected]>
> > ---
> > Tim, Dave, Darrick: I didn't want to waste your time on patches 1-7,
> > which are just internal to shmem, and do not affect this patch (which
> > applies to v6.6-rc and linux-next as is): but want to run this by you.
>
> Hmmmm. IIUC, this only works for addition that approaches the limit
> from below?
That's certainly how I was thinking about it, and what I need for tmpfs.
Precisely what its limitations (haha) are, I'll have to take care to
spell out.
(IIRC - it's a while since I wrote it - it can be used for subtraction,
but goes the very slow way when it could go the fast way - uncompared
percpu_counter_sub() much better for that. You might be proposing that
a tweak could adjust it to going the fast way when coming down from the
"limit", but going the slow way as it approaches 0 - that would be neat,
but I've not yet looked into whether it's feasibly done.)
>
> So if we are approaching the limit from above (i.e. add of a
> negative amount, limit is zero) then this code doesn't work the same
> as the open-coded compare+add operation would?
To it and to me, a limit of 0 means nothing positive can be added
(and it immediately returns false for that case); and adding anything
negative would be an error since the positive would not have been allowed.
Would a negative limit have any use?
It's definitely not allowing all the possibilities that you could arrange
with a separate compare and add; whether it's ruling out some useful
possibilities to which it can easily be generalized, I'm not sure.
Well worth a look - but it'll be easier for me to break it than get
it right, so I might just stick to adding some comments.
I might find that actually I prefer your way round: getting slower
as approaching 0, without any need for specifying a limit?? That the
tmpfs case pushed it in this direction, when it's better reversed? Or
that might be an embarrassing delusion which I'll regret having mentioned.
>
> Hence I think this looks like an "add if result is less than"
> operation, which is distinct from the "add if result is greater
> than" operation that we use this same pattern for in XFS and ext4.
> Perhaps a better name is in order?
The name still seems good to me, but a comment above it on its
assumptions/limitations well worth adding.
I didn't find a percpu_counter_compare() in ext4, and haven't got
far yet with understanding the XFS ones: tomorrow...
>
> I'm also not a great fan of having two
> similar-but-not-quite-the-same implementations for the two
> comparisons, but unless we decide to convert the XFS slow path to
> this it doesn't matter that much at the moment....
>
> Implementation seems OK at a quick glance, though.
Thanks!
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]
On Thu, 5 Oct 2023, Chen, Tim C wrote:
> >--- a/lib/percpu_counter.c
> >+++ b/lib/percpu_counter.c
> >@@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
> > }
> > EXPORT_SYMBOL(__percpu_counter_compare);
> >
> >+/*
> >+ * Compare counter, and add amount if the total is within limit.
> >+ * Return true if amount was added, false if it would exceed limit.
> >+ */
> >+bool __percpu_counter_limited_add(struct percpu_counter *fbc,
> >+ s64 limit, s64 amount, s32 batch)
> >+{
> >+ s64 count;
> >+ s64 unknown;
> >+ unsigned long flags;
> >+ bool good;
> >+
> >+ if (amount > limit)
> >+ return false;
> >+
> >+ local_irq_save(flags);
> >+ unknown = batch * num_online_cpus();
> >+ count = __this_cpu_read(*fbc->counters);
> >+
> >+ /* Skip taking the lock when safe */
> >+ if (abs(count + amount) <= batch &&
> >+ fbc->count + unknown <= limit) {
> >+ this_cpu_add(*fbc->counters, amount);
> >+ local_irq_restore(flags);
> >+ return true;
> >+ }
> >+
> >+ raw_spin_lock(&fbc->lock);
> >+ count = fbc->count + amount;
> >+
>
> Perhaps we can fast path the case where for sure
> we will exceed limit?
>
> if (fbc->count + amount - unknown > limit)
> return false;
Thanks, that sounds reasonable: I'll try to add something like that -
but haven't thought about it carefully enough yet (too easy for me
to overlook some negative case which messes everything up).
Hugh
Hello,
On Thu, Oct 05, 2023 at 10:42:17PM -0700, Hugh Dickins wrote:
> On Thu, 5 Oct 2023, Chen, Tim C wrote:
>
> > >--- a/lib/percpu_counter.c
> > >+++ b/lib/percpu_counter.c
> > >@@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
> > > }
> > > EXPORT_SYMBOL(__percpu_counter_compare);
> > >
> > >+/*
> > >+ * Compare counter, and add amount if the total is within limit.
> > >+ * Return true if amount was added, false if it would exceed limit.
> > >+ */
> > >+bool __percpu_counter_limited_add(struct percpu_counter *fbc,
> > >+ s64 limit, s64 amount, s32 batch)
> > >+{
> > >+ s64 count;
> > >+ s64 unknown;
> > >+ unsigned long flags;
> > >+ bool good;
> > >+
> > >+ if (amount > limit)
> > >+ return false;
> > >+
> > >+ local_irq_save(flags);
> > >+ unknown = batch * num_online_cpus();
> > >+ count = __this_cpu_read(*fbc->counters);
> > >+
> > >+ /* Skip taking the lock when safe */
> > >+ if (abs(count + amount) <= batch &&
> > >+ fbc->count + unknown <= limit) {
> > >+ this_cpu_add(*fbc->counters, amount);
> > >+ local_irq_restore(flags);
> > >+ return true;
> > >+ }
> > >+
> > >+ raw_spin_lock(&fbc->lock);
> > >+ count = fbc->count + amount;
> > >+
> >
> > Perhaps we can fast path the case where for sure
> > we will exceed limit?
> >
> > if (fbc->count + amount - unknown > limit)
> > return false;
>
> Thanks, that sounds reasonable: I'll try to add something like that -
> but haven't thought about it carefully enough yet (too easy for me
> to overlook some negative case which messes everything up).
>
> Hugh
>
Sorry for the late chime in. I'm traveling right now.
I haven't been super happy lately with percpu_counter as it has had a
few corner cases such as the cpu_dying_mask fiasco which I thought we
fixed with a series from tglx [1]. If not I can resurrect it and pull
it.
I feel like percpu_counter is deviating from its original intended
usecase which, from my perspective, was a thin wrapper around a percpu
variable. At this point we seem to be bolting onto percpu_counter
instead of giving it a clear focus for what it's supposed to do well.
I think I understand the use case, and ultimately it's kind of the
duality where I think it was xfs using percpu_counters where the value
must be > 0 to make sense, and there was a race condition with
cpu dying [2].
At this point, I think it's probably better to wholly think about the
lower bound and upper bound problem of percpu_counter wrt the # of
online cpus.
Thanks,
Dennis
[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://lore.kernel.org/lkml/[email protected]/
On Thu, Oct 05, 2023 at 10:35:33PM -0700, Hugh Dickins wrote:
> On Thu, 5 Oct 2023, Dave Chinner wrote:
> > On Fri, Sep 29, 2023 at 08:42:45PM -0700, Hugh Dickins wrote:
> > > Percpu counter's compare and add are separate functions: without locking
> > > around them (which would defeat their purpose), it has been possible to
> > > overflow the intended limit. Imagine all the other CPUs fallocating
> > > tmpfs huge pages to the limit, in between this CPU's compare and its add.
> > >
> > > I have not seen reports of that happening; but tmpfs's recent addition
> > > of dquot_alloc_block_nodirty() in between the compare and the add makes
> > > it even more likely, and I'd be uncomfortable to leave it unfixed.
> > >
> > > Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
> > >
> > > I believe this implementation is correct, and slightly more efficient
> > > than the combination of compare and add (taking the lock once rather
> > > than twice when nearing full - the last 128MiB of a tmpfs volume on a
> > > machine with 128 CPUs and 4KiB pages); but it does beg for a better
> > > design - when nearing full, there is no new batching, but the costly
> > > percpu counter sum across CPUs still has to be done, while locked.
> > >
> > > Follow __percpu_counter_sum()'s example, including cpu_dying_mask as
> > > well as cpu_online_mask: but shouldn't __percpu_counter_compare() and
> > > __percpu_counter_limited_add() then be adding a num_dying_cpus() to
> > > num_online_cpus(), when they calculate the maximum which could be held
> > > across CPUs? But the times when it matters would be vanishingly rare.
> > >
> > > Signed-off-by: Hugh Dickins <[email protected]>
> > > Cc: Tim Chen <[email protected]>
> > > Cc: Dave Chinner <[email protected]>
> > > Cc: Darrick J. Wong <[email protected]>
> > > ---
> > > Tim, Dave, Darrick: I didn't want to waste your time on patches 1-7,
> > > which are just internal to shmem, and do not affect this patch (which
> > > applies to v6.6-rc and linux-next as is): but want to run this by you.
> >
> > Hmmmm. IIUC, this only works for addition that approaches the limit
> > from below?
>
> That's certainly how I was thinking about it, and what I need for tmpfs.
> Precisely what its limitations (haha) are, I'll have to take care to
> spell out.
>
> (IIRC - it's a while since I wrote it - it can be used for subtraction,
> but goes the very slow way when it could go the fast way - uncompared
> percpu_counter_sub() much better for that. You might be proposing that
> a tweak could adjust it to going the fast way when coming down from the
> "limit", but going the slow way as it approaches 0 - that would be neat,
> but I've not yet looked into whether it's feasibly done.)
>
> >
> > So if we are approaching the limit from above (i.e. add of a
> > negative amount, limit is zero) then this code doesn't work the same
> > as the open-coded compare+add operation would?
>
> To it and to me, a limit of 0 means nothing positive can be added
> (and it immediately returns false for that case); and adding anything
> negative would be an error since the positive would not have been allowed.
>
> Would a negative limit have any use?
I don't have any use for it, but the XFS case is decrementing free
space to determine if ENOSPC has been hit. It's the opposite
implementation to shmem, which increments used space to determine if
ENOSPC is hit.
> It's definitely not allowing all the possibilities that you could arrange
> with a separate compare and add; whether it's ruling out some useful
> possibilities to which it can easily be generalized, I'm not sure.
>
> Well worth a look - but it'll be easier for me to break it than get
> it right, so I might just stick to adding some comments.
>
> I might find that actually I prefer your way round: getting slower
> as approaching 0, without any need for specifying a limit?? That the
> tmpfs case pushed it in this direction, when it's better reversed? Or
> that might be an embarrassing delusion which I'll regret having mentioned.
I think there's cases for both approaching and upper limit from
before and a lower limit from above. Both are the same "compare and
add" algorithm, just with minor logic differences...
> > Hence I think this looks like a "add if result is less than"
> > operation, which is distinct from then "add if result is greater
> > than" operation that we use this same pattern for in XFS and ext4.
> > Perhaps a better name is in order?
>
> The name still seems good to me, but a comment above it on its
> assumptions/limitations well worth adding.
>
> I didn't find a percpu_counter_compare() in ext4, and haven't got
Go search for EXT4_FREECLUSTERS_WATERMARK....
> far yet with understanding the XFS ones: tomorrow...
XFS detects being near ENOSPC to change the batch update size so
that when near ENOSPC the percpu counter always aggregates to the
global sum on every modification. i.e. it becomes more accurate (but
slower) near the ENOSPC threshold. Then if the result of the
subtraction ends up being less than zero, it takes a lock (i.e. goes
even slower!), undoes the subtraction that took it below zero, and
determines if it can dip into the reserve pool or ENOSPC should be
reported.
Some of that could be optimised, but we need that external "lock and
undo" mechanism to manage the reserve pool space atomically at
ENOSPC...
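In rough C, the shape is something like this (a sketch, not the actual
XFS code - the function and lock names here are made up):

static DEFINE_SPINLOCK(reserve_lock);

static int dec_freespace(struct percpu_counter *free_count, s64 delta)
{
	s32 batch = percpu_counter_batch;

	/* near ENOSPC, shrink the batch so the global count stays accurate */
	if (percpu_counter_read(free_count) - delta <
	    2 * (s64)percpu_counter_batch * num_online_cpus())
		batch = 1;

	percpu_counter_add_batch(free_count, -delta, batch);
	if (percpu_counter_compare(free_count, 0) >= 0)
		return 0;

	/* went below zero: lock, undo, then try the reserve pool or ENOSPC */
	spin_lock(&reserve_lock);
	percpu_counter_add(free_count, delta);
	/* ... dip into the reserve pool here if it has enough ... */
	spin_unlock(&reserve_lock);
	return -ENOSPC;
}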
Cheers,
Dave.
--
Dave Chinner
[email protected]
On Fri, 6 Oct 2023, Dennis Zhou wrote:
>
> Sorry for the late chime in. I'm traveling right now.
No problem at all, thanks for looking.
>
> I haven't been super happy lately with percpu_counter as it has had a
> few corner cases such as the cpu_dying_mask fiasco which I thought we
> fixed with a series from tglx [1]. If not I can resurrect it and pull
> it.
>
> I feel like percpu_counter is deviating from its original intended
> usecase which, from my perspective, was a thin wrapper around a percpu
> variable. At this point we seem to be bolting onto percpu_counter
> instead of giving it a clear focus for what it's supposed to do well.
> I think I understand the use case, and ultimately it's kind of the
> duality where I think it was xfs is using percpu_counters where it must
> be > 0 for the value to make sense and there was a race condition with
> cpu dying [2].
>
> At this point, I think it's probably better to wholy think about the
> lower bound and upper bound problem of percpu_counter wrt the # of
> online cpus.
>
> Thanks,
> Dennis
>
> [1] https://lore.kernel.org/lkml/[email protected]/
> [2] https://lore.kernel.org/lkml/[email protected]/
Thanks for the links. I can see that the current cpu_dying situation
is not ideal, but don't see any need to get any deeper into that for
percpu_counter_limited_add(): I did consider an update to remove its
use of cpu_dying_mask, but that just seemed wrong - it should do the
same as is currently done in __percpu_counter_sum(), then be updated
along with that when cpu_dying is sorted, by tglx's series or otherwise.
I don't think I agree with you about percpu_counter deviating from its
original intended usecase; but I haven't studied the history to see what
that initial usecase was. Whatever, we've had percpu_counter_add() and
percpu_counter_compare() for many years, and percpu_counter_limited_add()
is just an atomic combination of those: I don't see it as deviating at all.
Hugh
On Mon, 9 Oct 2023, Dave Chinner wrote:
> On Thu, Oct 05, 2023 at 10:35:33PM -0700, Hugh Dickins wrote:
> > On Thu, 5 Oct 2023, Dave Chinner wrote:
> > >
> > > Hmmmm. IIUC, this only works for addition that approaches the limit
> > > from below?
> >
> > That's certainly how I was thinking about it, and what I need for tmpfs.
> > Precisely what its limitations (haha) are, I'll have to take care to
> > spell out.
> >
> > (IIRC - it's a while since I wrote it - it can be used for subtraction,
> > but goes the very slow way when it could go the fast way - uncompared
> > percpu_counter_sub() much better for that. You might be proposing that
> > a tweak could adjust it to going the fast way when coming down from the
> > "limit", but going the slow way as it approaches 0 - that would be neat,
> > but I've not yet looked into whether it's feasibly done.)
Easily done once I'd looked at it from the right angle.
> >
> > >
> > > So if we are approaching the limit from above (i.e. add of a
> > > negative amount, limit is zero) then this code doesn't work the same
> > > as the open-coded compare+add operation would?
> >
> > To it and to me, a limit of 0 means nothing positive can be added
> > (and it immediately returns false for that case); and adding anything
> > negative would be an error since the positive would not have been allowed.
> >
> > Would a negative limit have any use?
There was no reason to exclude it, once I was thinking clearly
about the comparisons.
>
> I don't have any use for it, but the XFS case is decrementing free
> space to determine if ENOSPC has been hit. It's the opposite
> implementation to shmem, which increments used space to determine if
> ENOSPC is hit.
Right.
>
> > It's definitely not allowing all the possibilities that you could arrange
> > with a separate compare and add; whether it's ruling out some useful
> > possibilities to which it can easily be generalized, I'm not sure.
> >
> > Well worth a look - but it'll be easier for me to break it than get
> > it right, so I might just stick to adding some comments.
> >
> > I might find that actually I prefer your way round: getting slower
> > as approaching 0, without any need for specifying a limit?? That the
> > tmpfs case pushed it in this direction, when it's better reversed? Or
> > that might be an embarrassing delusion which I'll regret having mentioned.
>
> I think there's cases for both approaching and upper limit from
> before and a lower limit from above. Both are the same "compare and
> add" algorithm, just with minor logic differences...
Good, thanks, you've saved me: I was getting a bit fundamentalist there,
thinking to offer one simplest primitive from which anything could be
built. But when it came down to it, I had no enthusiasm for rewriting
tmpfs's used_blocks as free_blocks, just to avoid that limit argument.
>
> > > Hence I think this looks like an "add if result is less than"
> > > operation, which is distinct from the "add if result is greater
> > > than" operation that we use this same pattern for in XFS and ext4.
> > > Perhaps a better name is in order?
> >
> > The name still seems good to me, but a comment above it on its
> > assumptions/limitations well worth adding.
> >
> > I didn't find a percpu_counter_compare() in ext4, and haven't got
>
> Go search for EXT4_FREECLUSTERS_WATERMARK....
Ah, not a percpu_counter_compare() user, but doing its own thing.
>
> > far yet with understanding the XFS ones: tomorrow...
>
> XFS detects being near ENOSPC to change the batch update size so
> that when near ENOSPC the percpu counter always aggregates to the
> global sum on every modification. i.e. it becomes more accurate (but
> slower) near the ENOSPC threshold. Then if the result of the
> subtraction ends up being less than zero, it takes a lock (i.e. goes
> even slower!), undoes the subtraction that took it below zero, and
> determines if it can dip into the reserve pool or ENOSPC should be
> reported.
>
> Some of that could be optimised, but we need that external "lock and
> undo" mechanism to manage the reserve pool space atomically at
> ENOSPC...
Thanks for going above and beyond with the description; but I'll be
honest and admit that I only looked quickly, and did not reach any
conclusion as to whether such usage could or should be converted
to percpu_counter_limited_add() - which would never take any XFS
locks, of course, so might just end up doubling the slow work.
But absolutely I agree with you, and thank you for pointing out,
how stupidly useless percpu_counter_limited_add() was for decrementing -
it was nothing more than a slow way of doing percpu_counter_sub().
I'm about to send in a 9/8, extending it to be more useful: thanks.
Hugh
Though tmpfs does not need it, percpu_counter_limited_add() can be twice
as useful if it works sensibly with negative amounts (subs) - typically
decrements towards a limit of 0 or nearby: as suggested by Dave Chinner.
And in the course of that reworking, skip the percpu counter sum if it is
already obvious that the limit would be passed: as suggested by Tim Chen.
Extend the comment above __percpu_counter_limited_add(), defining the
behaviour with positive and negative amounts, allowing negative limits,
but not bothering about overflow beyond S64_MAX.
Signed-off-by: Hugh Dickins <[email protected]>
---
include/linux/percpu_counter.h | 11 +++++--
lib/percpu_counter.c | 54 +++++++++++++++++++++++++---------
2 files changed, 49 insertions(+), 16 deletions(-)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 8cb7c071bd5c..3a44dd1e33d2 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -198,14 +198,21 @@ static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
unsigned long flags;
+ bool good = false;
s64 count;
+ if (amount == 0)
+ return true;
+
local_irq_save(flags);
count = fbc->count + amount;
- if (count <= limit)
+ if ((amount > 0 && count <= limit) ||
+ (amount < 0 && count >= limit)) {
fbc->count = count;
+ good = true;
+ }
local_irq_restore(flags);
- return count <= limit;
+ return good;
}
/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 58a3392f471b..44dd133594d4 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -279,8 +279,16 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
EXPORT_SYMBOL(__percpu_counter_compare);
/*
- * Compare counter, and add amount if the total is within limit.
- * Return true if amount was added, false if it would exceed limit.
+ * Compare counter, and add amount if total is: less than or equal to limit if
+ * amount is positive, or greater than or equal to limit if amount is negative.
+ * Return true if amount is added, or false if total would be beyond the limit.
+ *
+ * Negative limit is allowed, but unusual.
+ * When negative amounts (subs) are given to percpu_counter_limited_add(),
+ * the limit would most naturally be 0 - but other limits are also allowed.
+ *
+ * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
+ * are all assumed to be sane (far from S64_MIN and S64_MAX).
*/
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
s64 limit, s64 amount, s32 batch)
@@ -288,10 +296,10 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
s64 count;
s64 unknown;
unsigned long flags;
- bool good;
+ bool good = false;
- if (amount > limit)
- return false;
+ if (amount == 0)
+ return true;
local_irq_save(flags);
unknown = batch * num_online_cpus();
@@ -299,7 +307,8 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
/* Skip taking the lock when safe */
if (abs(count + amount) <= batch &&
- fbc->count + unknown <= limit) {
+ ((amount > 0 && fbc->count + unknown <= limit) ||
+ (amount < 0 && fbc->count - unknown >= limit))) {
this_cpu_add(*fbc->counters, amount);
local_irq_restore(flags);
return true;
@@ -309,7 +318,19 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
count = fbc->count + amount;
/* Skip percpu_counter_sum() when safe */
- if (count + unknown > limit) {
+ if (amount > 0) {
+ if (count - unknown > limit)
+ goto out;
+ if (count + unknown <= limit)
+ good = true;
+ } else {
+ if (count + unknown < limit)
+ goto out;
+ if (count - unknown >= limit)
+ good = true;
+ }
+
+ if (!good) {
s32 *pcount;
int cpu;
@@ -317,15 +338,20 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
pcount = per_cpu_ptr(fbc->counters, cpu);
count += *pcount;
}
+ if (amount > 0) {
+ if (count > limit)
+ goto out;
+ } else {
+ if (count < limit)
+ goto out;
+ }
+ good = true;
}
- good = count <= limit;
- if (good) {
- count = __this_cpu_read(*fbc->counters);
- fbc->count += count + amount;
- __this_cpu_sub(*fbc->counters, count);
- }
-
+ count = __this_cpu_read(*fbc->counters);
+ fbc->count += count + amount;
+ __this_cpu_sub(*fbc->counters, count);
+out:
raw_spin_unlock(&fbc->lock);
local_irq_restore(flags);
return good;
--
2.35.3
On Fri, Sep 29, 2023 at 08:42:45PM -0700, Hugh Dickins wrote:
> Percpu counter's compare and add are separate functions: without locking
> around them (which would defeat their purpose), it has been possible to
> overflow the intended limit. Imagine all the other CPUs fallocating
> tmpfs huge pages to the limit, in between this CPU's compare and its add.
>
> I have not seen reports of that happening; but tmpfs's recent addition
> of dquot_alloc_block_nodirty() in between the compare and the add makes
> it even more likely, and I'd be uncomfortable to leave it unfixed.
>
> Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
>
I think the posted (and by now landed) code is racy.
I had seen there was a follow up patch which further augmented the
routine, but it did not alter the issue below so I'm responding to this
thread.
> +/*
> + * Compare counter, and add amount if the total is within limit.
> + * Return true if amount was added, false if it would exceed limit.
> + */
> +bool __percpu_counter_limited_add(struct percpu_counter *fbc,
> + s64 limit, s64 amount, s32 batch)
> +{
> + s64 count;
> + s64 unknown;
> + unsigned long flags;
> + bool good;
> +
> + if (amount > limit)
> + return false;
> +
> + local_irq_save(flags);
> + unknown = batch * num_online_cpus();
> + count = __this_cpu_read(*fbc->counters);
> +
> + /* Skip taking the lock when safe */
> + if (abs(count + amount) <= batch &&
> + fbc->count + unknown <= limit) {
> + this_cpu_add(*fbc->counters, amount);
> + local_irq_restore(flags);
> + return true;
> + }
> +
Note the value of fbc->count is *not* stabilized.
> + raw_spin_lock(&fbc->lock);
> + count = fbc->count + amount;
> +
> + /* Skip percpu_counter_sum() when safe */
> + if (count + unknown > limit) {
> + s32 *pcount;
> + int cpu;
> +
> + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
> + pcount = per_cpu_ptr(fbc->counters, cpu);
> + count += *pcount;
> + }
> + }
> +
> + good = count <= limit;
> + if (good) {
> + count = __this_cpu_read(*fbc->counters);
> + fbc->count += count + amount;
> + __this_cpu_sub(*fbc->counters, count);
> + }
> +
> + raw_spin_unlock(&fbc->lock);
> + local_irq_restore(flags);
> + return good;
> +}
> +
Consider 2 cpus executing the func at the same time, where one is
updating fbc->counter in the slow path while the other is testing it in
the fast path.
cpu0                                    cpu1
                                        if (abs(count + amount) <= batch &&
                                            fbc->count + unknown <= limit)
/* gets past the per-cpu traversal */
/* at this point cpu0 decided to bump fbc->count, while cpu1 decided to
 * bump the local counter */
                                        this_cpu_add(*fbc->counters, amount);
fbc->count += count + amount;
Suppose fbc->count update puts it close enough to the limit that an
addition from cpu1 would put the entire thing over said limit.
Since the 2 operations are not synchronized cpu1 can observe fbc->count
prior to the bump and update its local counter, resulting in the
aforementioned overflow.
Am I misreading something here? Is this prevented?
To my grep the only user is quotas in shmem and I wonder if that use is
even justified. I am aware of performance realities of atomics. However,
it very well may be that whatever codepaths exercise the counter
are suffering multicore issues elsewhere, making an atomic (in a
dedicated cacheline) a perfectly sane choice for the foreseeable future.
Should this be true there would be 0 rush working out a fixed variant of
the routine.
On Sat, May 25, 2024 at 08:00:15AM +0200, Mateusz Guzik wrote:
> On Fri, Sep 29, 2023 at 08:42:45PM -0700, Hugh Dickins wrote:
> > Percpu counter's compare and add are separate functions: without locking
> > around them (which would defeat their purpose), it has been possible to
> > overflow the intended limit. Imagine all the other CPUs fallocating
> > tmpfs huge pages to the limit, in between this CPU's compare and its add.
> >
> > I have not seen reports of that happening; but tmpfs's recent addition
> > of dquot_alloc_block_nodirty() in between the compare and the add makes
> > it even more likely, and I'd be uncomfortable to leave it unfixed.
> >
> > Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it.
> >
>
> I think the posted (and by now landed) code is racy.
>
> I had seen there was a follow up patch which further augmented the
> routine, but it did not alter the issue below so I'm responding to this
> thread.
>
> > +/*
> > + * Compare counter, and add amount if the total is within limit.
> > + * Return true if amount was added, false if it would exceed limit.
> > + */
> > +bool __percpu_counter_limited_add(struct percpu_counter *fbc,
> > + s64 limit, s64 amount, s32 batch)
> > +{
> > + s64 count;
> > + s64 unknown;
> > + unsigned long flags;
> > + bool good;
> > +
> > + if (amount > limit)
> > + return false;
> > +
> > + local_irq_save(flags);
> > + unknown = batch * num_online_cpus();
> > + count = __this_cpu_read(*fbc->counters);
> > +
> > + /* Skip taking the lock when safe */
> > + if (abs(count + amount) <= batch &&
> > + fbc->count + unknown <= limit) {
> > + this_cpu_add(*fbc->counters, amount);
> > + local_irq_restore(flags);
> > + return true;
> > + }
> > +
>
> Note the value of fbc->count is *not* stabilized.
>
> > + raw_spin_lock(&fbc->lock);
> > + count = fbc->count + amount;
> > +
> > + /* Skip percpu_counter_sum() when safe */
> > + if (count + unknown > limit) {
> > + s32 *pcount;
> > + int cpu;
> > +
> > + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
> > + pcount = per_cpu_ptr(fbc->counters, cpu);
> > + count += *pcount;
> > + }
> > + }
> > +
> > + good = count <= limit;
> > + if (good) {
> > + count = __this_cpu_read(*fbc->counters);
> > + fbc->count += count + amount;
> > + __this_cpu_sub(*fbc->counters, count);
> > + }
> > +
> > + raw_spin_unlock(&fbc->lock);
> > + local_irq_restore(flags);
> > + return good;
> > +}
> > +
>
> Consider 2 cpus executing the func at the same time, where one is
> updating fbc->counter in the slow path while the other is testing it in
> the fast path.
>
> cpu0                                  cpu1
>                                       if (abs(count + amount) <= batch &&
>                                           fbc->count + unknown <= limit)
> /* gets past the per-cpu traversal */
> /* at this point cpu0 decided to bump fbc->count, while cpu1 decided to
>  * bump the local counter */
>                                       this_cpu_add(*fbc->counters, amount);
> fbc->count += count + amount;
>
> Suppose fbc->count update puts it close enough to the limit that an
> addition from cpu1 would put the entire thing over said limit.
>
> Since the 2 operations are not synchronized cpu1 can observe fbc->count
> prior to the bump and update its local counter, resulting in
> aforementioned overflow.
>
> Am I misreading something here? Is this prevented?
>
> To my grep the only user is quotas in shmem and I wonder if that use is
> even justified. I am aware of performance realities of atomics. However,
> it very well may be that whatever codepaths exercise the counter
> are suffering multicore issues elsewhere, making an atomic (in a
> dedicated cacheline) a perfectly sane choice for the foreseeable future.
> Should this be true there would be 0 rush working out a fixed variant of
> the routine.
So I tried it out and I don't believe a per-cpu counter for mem usage of
shmem is warranted at this time. In a setting where usage keeps changing
there is a massive bottleneck around folio_lruvec_lock.
The rare case where a bunch of memory is just populated one-off on a
tmpfs mount can probably suffer the atomic; it can only go so far up
before you are out of memory.
For the value to keep changing some of the pages will have to be
released and this is what I'm testing below by slapping ftruncate on a
test doing 1MB file writes in a loop (in will-it-scale):
diff --git a/tests/write1.c b/tests/write1.c
index d860904..790ddb3 100644
--- a/tests/write1.c
+++ b/tests/write1.c
@@ -28,6 +28,7 @@ void testcase(unsigned long long *iterations, unsigned long nr)
size = 0;
lseek(fd, 0, SEEK_SET);
}
+ ftruncate(fd, 0);
(*iterations)++;
}
That is, this allocates pages for the 1MB size and releases them continuously.
When mounting tmpfs on /tmp and benching with "./write1_processes -t 20"
(20 workers) I see almost all of the time spent spinning in
__pv_queued_spin_lock_slowpath.
As such I don't believe the per-cpu split buys anything in terms of
scalability and as I explained in the previous mail I think the routine
used here is buggy, while shmem is the only user. Should this switch to
a centralized atomic (either cmpxchg loop or xadd + backpedal) there
should be no loss in scalability given the lruvec problem. Then the
routine could be commented out or whacked for the time being.
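For reference, the cmpxchg-loop variant would be something like this
(a sketch of the upper-limit case only; the name is made up):

static bool limited_add_atomic(atomic64_t *counter, s64 limit, s64 amount)
{
	s64 old = atomic64_read(counter);
	s64 new;

	do {
		new = old + amount;
		/* refuse the add if it would take the total past the limit */
		if (new > limit)
			return false;
	} while (!atomic64_try_cmpxchg(counter, &old, new));

	return true;
}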
backtraces for interested:
bpftrace -e 'kprobe:__pv_queued_spin_lock_slowpath { @[kstack()] = count(); }'
@[
__pv_queued_spin_lock_slowpath+5
_raw_spin_lock_irqsave+61
folio_lruvec_lock_irqsave+95
__page_cache_release+130
folios_put_refs+139
shmem_undo_range+702
shmem_setattr+983
notify_change+556
do_truncate+131
do_ftruncate+254
__x64_sys_ftruncate+62
do_syscall_64+87
entry_SYSCALL_64_after_hwframe+118
]: 4750931
@[
__pv_queued_spin_lock_slowpath+5
_raw_spin_lock_irqsave+61
folio_lruvec_lock_irqsave+95
folio_batch_move_lru+139
lru_add_drain_cpu+141
__folio_batch_release+49
shmem_undo_range+702
shmem_setattr+983
notify_change+556
do_truncate+131
do_ftruncate+254
__x64_sys_ftruncate+62
do_syscall_64+87
entry_SYSCALL_64_after_hwframe+118
]: 4761483