2008-08-27 15:29:05

by Aneesh Kumar K.V

Subject: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

fbc->count is of type s64. The change was introduced by
0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
from long to s64. Moving to s64 also means on 32 bit architectures
we can get wrong values on fbc->count. Since fbc->count is read
more frequently and updated rarely use seqlocks. This should
reduce the impact of locking in the read path for 32bit arch.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
CC: Peter Zijlstra <[email protected]>
CC: Andrew Morton <[email protected]>
CC: [email protected]
---
include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
lib/percpu_counter.c | 20 ++++++++++----------
2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccd..1b711a1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -6,7 +6,7 @@
* WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
*/

-#include <linux/spinlock.h>
+#include <linux/seqlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
@@ -16,7 +16,7 @@
#ifdef CONFIG_SMP

struct percpu_counter {
- spinlock_t lock;
+ seqlock_t lock;
s64 count;
#ifdef CONFIG_HOTPLUG_CPU
struct list_head list; /* All percpu_counters are on a list */
@@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
return __percpu_counter_sum(fbc);
}

-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+#if BITS_PER_LONG == 64
+static inline s64 fbc_count(struct percpu_counter *fbc)
{
return fbc->count;
}
+#else
+/* doesn't have atomic 64 bit operation */
+static inline s64 fbc_count(struct percpu_counter *fbc)
+{
+ s64 ret;
+ unsigned seq;
+ do {
+ seq = read_seqbegin(&fbc->lock);
+ ret = fbc->count;
+ } while (read_seqretry(&fbc->lock, seq));
+ return ret;
+
+}
+#endif
+
+static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+{
+ return fbc_count(fbc);
+}

/*
* It is possible for the percpu_counter_read() to return a small negative
@@ -65,7 +85,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
*/
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
- s64 ret = fbc->count;
+ s64 ret = fbc_count(fbc);

barrier(); /* Prevent reloads of fbc->count */
if (ret >= 0)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index a866389..83bb809 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -18,13 +18,13 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
for_each_possible_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
*pcount = 0;
}
fbc->count = amount;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
}
EXPORT_SYMBOL(percpu_counter_set);

@@ -37,10 +37,10 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
pcount = per_cpu_ptr(fbc->counters, cpu);
count = *pcount + amount;
if (count >= batch || count <= -batch) {
- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
fbc->count += count;
*pcount = 0;
- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
} else {
*pcount = count;
}
@@ -57,7 +57,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
s64 ret;
int cpu;

- spin_lock(&fbc->lock);
+ write_seqlock(&fbc->lock);
ret = fbc->count;
for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
@@ -66,7 +66,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
}
fbc->count = ret;

- spin_unlock(&fbc->lock);
+ write_sequnlock(&fbc->lock);
return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
{
- spin_lock_init(&fbc->lock);
+ seqlock_init(&fbc->lock);
fbc->count = amount;
fbc->counters = alloc_percpu(s32);
if (!fbc->counters)
@@ -95,7 +95,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)

err = percpu_counter_init(fbc, amount);
if (!err)
- lockdep_set_class(&fbc->lock, &percpu_counter_irqsafe);
+ lockdep_set_class(&fbc->lock.lock, &percpu_counter_irqsafe);
return err;
}

@@ -130,11 +130,11 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
s32 *pcount;
unsigned long flags;

- spin_lock_irqsave(&fbc->lock, flags);
+ write_seqlock_irqsave(&fbc->lock, flags);
pcount = per_cpu_ptr(fbc->counters, cpu);
fbc->count += *pcount;
*pcount = 0;
- spin_unlock_irqrestore(&fbc->lock, flags);
+ write_sequnlock_irqrestore(&fbc->lock, flags);
}
mutex_unlock(&percpu_counters_lock);
return NOTIFY_OK;
--
1.6.0.1.90.g27a6e


2008-08-27 19:07:42

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 27 Aug 2008 20:58:26 +0530
"Aneesh Kumar K.V" <[email protected]> wrote:

> fbc->count is of type s64. The change was introduced by
> 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> from long to s64. Moving to s64 also means on 32 bit architectures
> we can get wrong values on fbc->count. Since fbc->count is read
> more frequently and updated rarely use seqlocks. This should
> reduce the impact of locking in the read path for 32bit arch.
>

So... yesterday's suggestion to investigate implementing this at a
lower level wasn't popular?

> include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> lib/percpu_counter.c | 20 ++++++++++----------
> 2 files changed, 34 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 9007ccd..1b711a1 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -6,7 +6,7 @@
> * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> */
>
> -#include <linux/spinlock.h>
> +#include <linux/seqlock.h>
> #include <linux/smp.h>
> #include <linux/list.h>
> #include <linux/threads.h>
> @@ -16,7 +16,7 @@
> #ifdef CONFIG_SMP
>
> struct percpu_counter {
> - spinlock_t lock;
> + seqlock_t lock;
> s64 count;
> #ifdef CONFIG_HOTPLUG_CPU
> struct list_head list; /* All percpu_counters are on a list */
> @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> return __percpu_counter_sum(fbc);
> }
>
> -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +#if BITS_PER_LONG == 64
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> {
> return fbc->count;
> }
> +#else
> +/* doesn't have atomic 64 bit operation */
> +static inline s64 fbc_count(struct percpu_counter *fbc)
> +{
> + s64 ret;
> + unsigned seq;
> + do {
> + seq = read_seqbegin(&fbc->lock);
> + ret = fbc->count;
> + } while (read_seqretry(&fbc->lock, seq));
> + return ret;
> +

Please don't put unneeded blank lines into random places.

> +}
> +#endif

This is now too large to be inlined.

> +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> +{
> + return fbc_count(fbc);
> +}

This change means that a percpu_counter_read() from interrupt context
on a 32-bit machine is now deadlockable, whereas it previously was not
deadlockable on either 32-bit or 64-bit.

This flows on to the lib/proportions.c, which uses
percpu_counter_read() and also does spin_lock_irqsave() internally,
indicating that it is (or was) designed to be used in IRQ contexts.

It means that bdi_stat() can no longer be used from interrupt context.

So a whole lot of thought and review and checking is needed here. It
should all be spelled out in the changelog. This will be a horridly
rare deadlock, so suitable WARN_ON()s should be added to detect when
callers are vulnerable to it.

Or we make the whole thing irq-safe.
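
(For illustration only -- a rough sketch of the kind of runtime check being
asked for, added to the 32-bit read path; this is not part of the posted
patch:)

/* Hypothetical debug check: catch percpu_counter_read() from hard IRQ
 * context, where spinning on the seqlock against an interrupted writer
 * on the same CPU would never terminate. */
static inline s64 fbc_count(struct percpu_counter *fbc)
{
	s64 ret;
	unsigned seq;

	WARN_ON_ONCE(in_irq());
	do {
		seq = read_seqbegin(&fbc->lock);
		ret = fbc->count;
	} while (read_seqretry(&fbc->lock, seq));
	return ret;
}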

2008-08-27 21:02:08

by Peter Zijlstra

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 2008-08-27 at 12:05 -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 20:58:26 +0530
> "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> > from long to s64. Moving to s64 also means on 32 bit architectures
> > we can get wrong values on fbc->count. Since fbc->count is read
> > more frequently and updated rarely use seqlocks. This should
> > reduce the impact of locking in the read path for 32bit arch.
> >
>
> So... yesterday's suggestion to investigate implementing this at a
> lower level wasn't popular?

I think it's a good idea to investigate a generic atomic64_t type.

i386 could possibly use cmpxchg8b if and when available, although using
that to read might be rather too expensive.

Doing something like:

struct atomic64_t {
	seqlock_t lock;
	s64 val;
};

might be somewhat unexpected from the sizeof() angle of things. Then
there is of course the possibility of hashing the locks...
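
(A rough sketch of how the read and update sides of such a type could look
on 32-bit, following the layout above -- illustrative only, not an existing
kernel API:)

static inline s64 atomic64_read(struct atomic64_t *v)
{
	s64 ret;
	unsigned seq;

	do {
		seq = read_seqbegin(&v->lock);
		ret = v->val;
	} while (read_seqretry(&v->lock, seq));
	return ret;
}

static inline void atomic64_add(s64 i, struct atomic64_t *v)
{
	write_seqlock(&v->lock);
	v->val += i;
	write_sequnlock(&v->lock);
}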



> > include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> > lib/percpu_counter.c | 20 ++++++++++----------
> > 2 files changed, 34 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> > index 9007ccd..1b711a1 100644
> > --- a/include/linux/percpu_counter.h
> > +++ b/include/linux/percpu_counter.h
> > @@ -6,7 +6,7 @@
> > * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> > */
> >
> > -#include <linux/spinlock.h>
> > +#include <linux/seqlock.h>
> > #include <linux/smp.h>
> > #include <linux/list.h>
> > #include <linux/threads.h>
> > @@ -16,7 +16,7 @@
> > #ifdef CONFIG_SMP
> >
> > struct percpu_counter {
> > - spinlock_t lock;
> > + seqlock_t lock;
> > s64 count;
> > #ifdef CONFIG_HOTPLUG_CPU
> > struct list_head list; /* All percpu_counters are on a list */
> > @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > return __percpu_counter_sum(fbc);
> > }
> >
> > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +#if BITS_PER_LONG == 64
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > {
> > return fbc->count;
> > }
> > +#else
> > +/* doesn't have atomic 64 bit operation */
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > +{
> > + s64 ret;
> > + unsigned seq;
> > + do {
> > + seq = read_seqbegin(&fbc->lock);
> > + ret = fbc->count;
> > + } while (read_seqretry(&fbc->lock, seq));
> > + return ret;
> > +
>
> Please don't put unneeded blank lines into random places.
>
> > +}
> > +#endif
>
> This is now too large to be inlined.
>
> > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +{
> > + return fbc_count(fbc);
> > +}
>
> This change means that a percpu_counter_read() from interrupt context
> on a 32-bit machine is now deadlockable, whereas it previously was not
> deadlockable on either 32-bit or 64-bit.
>
> This flows on to the lib/proportions.c, which uses
> percpu_counter_read() and also does spin_lock_irqsave() internally,
> indicating that it is (or was) designed to be used in IRQ contexts.

percpu_counter() never was irq safe, which is why the proportion stuff
does all the irq disabling bits by hand.

> It means that bdi_stat() can no longer be used from interrupt context.

Actually, as long as the write side of the seqlock usage is done with
IRQs disabled, the read side should be good.

If the read loop gets preempted by a write action, the seq count will
not match up and we'll just try again.

The only lethal combination is trying to do the read loop while inside
the write side.

If you look at backing-dev.h, you'll see that all modifying operations
disable IRQs.
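
(Roughly the pattern being described, simplified from the backing-dev.h
wrappers -- the write side runs with IRQs disabled, so the lockless read
loop can safely be used from IRQ context; a sketch, not verbatim kernel
code:)

static inline void inc_bdi_stat(struct backing_dev_info *bdi,
				enum bdi_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	/* any write_seqlock taken inside happens with IRQs off */
	__percpu_counter_add(&bdi->bdi_stat[item], 1, BDI_STAT_BATCH);
	local_irq_restore(flags);
}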

> So a whole lot of thought and review and checking is needed here. It
> should all be spelled out in the changelog. This will be a horridly
> rare deadlock, so suitable WARN_ON()s should be added to detect when
> callers are vulnerable to it.
>
> Or we make the whole thing irq-safe.

That'd rather substantially penalize those cases where we don't need it.
From what I understood, this whole pushf/popf stuff is insanely expensive
on a few archs.

2008-08-27 21:24:36

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 27 Aug 2008 23:01:52 +0200
Peter Zijlstra <[email protected]> wrote:

> >
> > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > +{
> > > + return fbc_count(fbc);
> > > +}
> >
> > This change means that a percpu_counter_read() from interrupt context
> > on a 32-bit machine is now deadlockable, whereas it previously was not
> > deadlockable on either 32-bit or 64-bit.
> >
> > This flows on to the lib/proportions.c, which uses
> > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > indicating that it is (or was) designed to be used in IRQ contexts.
>
> percpu_counter() never was irq safe, which is why the proportion stuff
> does all the irq disabling bits by hand.

percpu_counter_read() was irq-safe. That changes here. Needs careful
review, changelogging and, preferably, runtime checks. But perhaps
they should be inside some CONFIG_thing which won't normally be done in
production.

otoh, percpu_counter_read() is in fact a rare operation, so a bit of
overhead probably won't matter.

(write-often, read-rarely is the whole point. This patch's changelog's
assertion that "Since fbc->count is read more frequently and updated
rarely" is probably wrong. Most percpu_counters will have their
fbc->count modified far more frequently than having it read from).

> > It means that bdi_stat() can no longer be used from interrupt context.
>
> Actually, as long as the write side of the seqlock usage is done with
> IRQs disabled, the read side should be good.

yup.

> If the read loop gets preempted by a write action, the seq count will
> not match up and we'll just try again.
>
> The only lethal combination is trying to do the read loop while inside
> the write side.

yup

> If you look at backing-dev.h, you'll see that all modifying operations
> disable IRQs.

OK.

> > So a whole lot of thought and review and checking is needed here. It
> > should all be spelled out in the changelog. This will be a horridly
> > rare deadlock, so suitable WARN_ON()s should be added to detect when
> > callers are vulnerable to it.
> >
> > Or we make the whole thing irq-safe.
>
> That'd rather substantially penalize those cases where we don't need it.
> From what I understood, this whole pushf/popf stuff is insanely expensive
> on a few archs.

Sure. I _expect_ that this interface change won't actually break
anything. But it adds a restriction which we should think about, and
document.



btw, what the heck is percpu_counter_init_irq()? Some mysterious
lockdep-specific thing?

<does git-fiddle. Oh. crappy changelog.>

I let that one leak through uncommented. Must be getting old.
Probably it will need an EXPORT_SYMBOL() sometime.


I expect that if we're going to go ahead and make percpu_counter_read()
no longer usable from interrupt context then we'll eventually end up
needing the full suite of _irq() and _irqsave() interface functions.
percpu_counter_add_irqsave(), etc.
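
(Presumably something along these lines -- a sketch of a possible interface,
not code that exists today:)

static inline void percpu_counter_add_irqsave(struct percpu_counter *fbc,
					      s64 amount)
{
	unsigned long flags;

	local_irq_save(flags);
	percpu_counter_add(fbc, amount);
	local_irq_restore(flags);
}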

2008-08-28 03:48:42

by Aneesh Kumar K.V

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, Aug 27, 2008 at 12:05:53PM -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 20:58:26 +0530
> "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > fbc->count is of type s64. The change was introduced by
> > 0216bfcffe424a5473daa4da47440881b36c1f4 which changed the type
> > from long to s64. Moving to s64 also means on 32 bit architectures
> > we can get wrong values on fbc->count. Since fbc->count is read
> > more frequently and updated rarely use seqlocks. This should
> > reduce the impact of locking in the read path for 32bit arch.
> >
>
> So... yesterday's suggestion to investigate implementing this at a
> lower level wasn't popular?

I wanted to send the entire patchset which fixes the ENOSPC issues with
delalloc. It happened to go out the day after you looked at the previous
mail. Sending the patch again in no way means we should not have a
generic atomic64_t.


>
> > include/linux/percpu_counter.h | 28 ++++++++++++++++++++++++----
> > lib/percpu_counter.c | 20 ++++++++++----------
> > 2 files changed, 34 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> > index 9007ccd..1b711a1 100644
> > --- a/include/linux/percpu_counter.h
> > +++ b/include/linux/percpu_counter.h
> > @@ -6,7 +6,7 @@
> > * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
> > */
> >
> > -#include <linux/spinlock.h>
> > +#include <linux/seqlock.h>
> > #include <linux/smp.h>
> > #include <linux/list.h>
> > #include <linux/threads.h>
> > @@ -16,7 +16,7 @@
> > #ifdef CONFIG_SMP
> >
> > struct percpu_counter {
> > - spinlock_t lock;
> > + seqlock_t lock;
> > s64 count;
> > #ifdef CONFIG_HOTPLUG_CPU
> > struct list_head list; /* All percpu_counters are on a list */
> > @@ -53,10 +53,30 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
> > return __percpu_counter_sum(fbc);
> > }
> >
> > -static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +#if BITS_PER_LONG == 64
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > {
> > return fbc->count;
> > }
> > +#else
> > +/* doesn't have atomic 64 bit operation */
> > +static inline s64 fbc_count(struct percpu_counter *fbc)
> > +{
> > + s64 ret;
> > + unsigned seq;
> > + do {
> > + seq = read_seqbegin(&fbc->lock);
> > + ret = fbc->count;
> > + } while (read_seqretry(&fbc->lock, seq));
> > + return ret;
> > +
>
> Please don't put unneeded blank lines into random places.
>

Will fix

> > +}
> > +#endif
>
> This is now too large to be inlined.
>


How do we actually figure that out? I have been making that mistake
quite often.

> > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > +{
> > + return fbc_count(fbc);
> > +}
>

-aneesh

2008-08-28 03:52:35

by Aneesh Kumar K.V

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> On Wed, 27 Aug 2008 23:01:52 +0200
> Peter Zijlstra <[email protected]> wrote:
>
> > >
> > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > +{
> > > > + return fbc_count(fbc);
> > > > +}
> > >
> > > This change means that a percpu_counter_read() from interrupt context
> > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > deadlockable on either 32-bit or 64-bit.
> > >
> > > This flows on to the lib/proportions.c, which uses
> > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > indicating that it is (or was) designed to be used in IRQ contexts.
> >
> > percpu_counter() never was irq safe, which is why the proportion stuff
> > does all the irq disabling bits by hand.
>
> percpu_counter_read() was irq-safe. That changes here. Needs careful
> review, changelogging and, preferably, runtime checks. But perhaps
> they should be inside some CONFIG_thing which won't normally be done in
> production.
>
> otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> overhead probably won't matter.
>
> (write-often, read-rarely is the whole point. This patch's changelog's
> assertion that "Since fbc->count is read more frequently and updated
> rarely" is probably wrong. Most percpu_counters will have their
> fbc->count modified far more frequently than having it read from).

we may actually be doing percpu_counter_add. But that doesn't update
fbc->count. We update fbc->count only when the local per-cpu value
crosses FBC_BATCH. If we were modifying fbc->count more frequently than
reading it, I guess we would be contending on fbc->lock more.
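
(To illustrate the point -- a hypothetical caller; the threshold is
whatever FBC_BATCH works out to on the machine:)

void example_hot_path(struct percpu_counter *fbc)
{
	int i;

	for (i = 0; i < 1000; i++)
		percpu_counter_add(fbc, 1);	/* bumps only the local s32 */
	/*
	 * fbc->count (and the seqlock write side) is touched only when the
	 * local per-cpu value reaches +/- the batch, i.e. roughly once
	 * every FBC_BATCH adds on this CPU, not on every call.
	 */
}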


-aneesh

2008-08-28 04:08:07

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thu, 28 Aug 2008 09:18:16 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> > This is now too large to be inlined.
> >
>
>
> How do we actually figure that out? I have been making that mistake
> quite often.

Well. Experience and guesswork, mainly.

But a useful metric is to look at the /bin/size output before and
after the inlining. In this case fs/ext3/ialloc.o's text shrunk 40-odd
bytes, which we think is a net benefit due to reduced CPU cache
pressure.

2008-08-28 04:10:48

by Andrew Morton

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thu, 28 Aug 2008 09:22:00 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> > On Wed, 27 Aug 2008 23:01:52 +0200
> > Peter Zijlstra <[email protected]> wrote:
> >
> > > >
> > > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > > +{
> > > > > + return fbc_count(fbc);
> > > > > +}
> > > >
> > > > This change means that a percpu_counter_read() from interrupt context
> > > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > > deadlockable on either 32-bit or 64-bit.
> > > >
> > > > This flows on to the lib/proportions.c, which uses
> > > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > > indicating that it is (or was) designed to be used in IRQ contexts.
> > >
> > > percpu_counter() never was irq safe, which is why the proportion stuff
> > > does all the irq disabling bits by hand.
> >
> > percpu_counter_read() was irq-safe. That changes here. Needs careful
> > review, changelogging and, preferably, runtime checks. But perhaps
> > they should be inside some CONFIG_thing which won't normally be done in
> > production.
> >
> > otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> > overhead probably won't matter.
> >
> > (write-often, read-rarely is the whole point. This patch's changelog's
> > assertion that "Since fbc->count is read more frequently and updated
> > rarely" is probably wrong. Most percpu_counters will have their
> > fbc->count modified far more frequently than having it read from).
>
> we may actually be doing percpu_counter_add. But that doesn't update
> fbc->count. We update fbc->count only when the local per-cpu value
> crosses FBC_BATCH. If we were modifying fbc->count more frequently than
> reading it, I guess we would be contending on fbc->lock more.
>
>

Yep. The frequency of modification of fbc->count is of the order of a
tenth or a hundredth of the frequency of
percpu_counter_<modification>() calls.

But in many cases the frequency of percpu_counter_read() calls is far
far less than this. For example, the percpu_counter_read() may only
happen when userspace polls a /proc file.

2008-08-28 04:19:50

by Nick Piggin

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Thursday 28 August 2008 14:06, Andrew Morton wrote:
> On Thu, 28 Aug 2008 09:18:16 +0530 "Aneesh Kumar K.V"
<[email protected]> wrote:
> > > This is now too large to be inlined.
> >
> > How do we actually figure that out? I have been making that mistake
> > quite often.
>
> Well. Experience and guesswork, mainly.
>
> But a useful metric is to look at the /bin/size output before and
> after the inlining. In this case fs/ext3/ialloc.o's text shrunk 40-odd
> bytes, which we think is a net benefit due to reduced CPU cache
> pressure.

Weighed against the register save/restore, compiler barrier, and function
call cost of the uninlined version. I've seen these add up to tens of
cycles per call, so if it is called several times between each icache miss
it can easily be worth inlining. Basically, measurement is required, and if
it isn't important enough to measure, policy tends to default to uninlining
if that saves space.

2008-08-28 07:59:51

by Peter Zijlstra

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture

On Wed, 2008-08-27 at 14:22 -0700, Andrew Morton wrote:

> btw, what the heck is percpu_counter_init_irq()? Some mysterious
> lockdep-specific thing?
>
> <does git-fiddle. Oh. crappy changelog.>
>
> I let that one leak through uncommented. Must be getting old.
> Probably it will need an EXPORT_SYMBOL() sometime.

Basically all it does is break the percpu_counter lock into two classes.

One for the irq-unsafe users and one for the irq-safe users. Without
this, lockdep goes splat complaining about irq recursion deadlocks and
the like between these two separate users.
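
(Concretely, the mechanism is roughly this -- cf. the
percpu_counter_init_irq() hunk in the patch above; a sketch only:)

static struct lock_class_key percpu_counter_irqsafe;

int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount)
{
	int err;

	err = percpu_counter_init(fbc, amount);
	if (!err)
		/* irq-safe users get their own lockdep class */
		lockdep_set_class(&fbc->lock.lock, &percpu_counter_irqsafe);
	return err;
}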

2008-08-28 23:00:01

by Mingming Cao

Subject: Re: [PATCH -V3 01/11] percpu_counters: make fbc->count read atomic on 32 bit architecture


On Wed, 2008-08-27 at 21:09 -0700, Andrew Morton wrote:
> On Thu, 28 Aug 2008 09:22:00 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
> > On Wed, Aug 27, 2008 at 02:22:50PM -0700, Andrew Morton wrote:
> > > On Wed, 27 Aug 2008 23:01:52 +0200
> > > Peter Zijlstra <[email protected]> wrote:
> > >
> > > > >
> > > > > > +static inline s64 percpu_counter_read(struct percpu_counter *fbc)
> > > > > > +{
> > > > > > + return fbc_count(fbc);
> > > > > > +}
> > > > >
> > > > > This change means that a percpu_counter_read() from interrupt context
> > > > > on a 32-bit machine is now deadlockable, whereas it previously was not
> > > > > deadlockable on either 32-bit or 64-bit.
> > > > >
> > > > > This flows on to the lib/proportions.c, which uses
> > > > > percpu_counter_read() and also does spin_lock_irqsave() internally,
> > > > > indicating that it is (or was) designed to be used in IRQ contexts.
> > > >
> > > > percpu_counter() never was irq safe, which is why the proportion stuff
> > > > does all the irq disabling bits by hand.
> > >
> > > percpu_counter_read() was irq-safe. That changes here. Needs careful
> > > review, changelogging and, preferably, runtime checks. But perhaps
> > > they should be inside some CONFIG_thing which won't normally be done in
> > > production.
> > >
> > > otoh, percpu_counter_read() is in fact a rare operation, so a bit of
> > > overhead probably won't matter.
> > >
> > > (write-often, read-rarely is the whole point. This patch's changelog's
> > > assertion that "Since fbc->count is read more frequently and updated
> > > rarely" is probably wrong. Most percpu_counters will have their
> > > fbc->count modified far more frequently than having it read from).
> >
> > we may actually be doing percpu_counter_add. But that doesn't update
> > fbc->count. We update fbc->count only when the local per-cpu value
> > crosses FBC_BATCH. If we were modifying fbc->count more frequently than
> > reading it, I guess we would be contending on fbc->lock more.
> >
> >
>
> Yep. The frequency of modification of fbc->count is of the order of a
> tenth or a hundredth of the frequency of
> percpu_counter_<modification>() calls.
>
> But in many cases the frequency of percpu_counter_read() calls is far
> far less than this. For example, the percpu_counter_read() may only
> happen when userspace polls a /proc file.
>
>

The global counter is much more frequently accessed with delalloc. :(

With delayed allocation, we have to read the free blocks counter at
each write_begin() to make sure there are enough free blocks to reserve,
so that a later writepages() doesn't return ENOSPC.
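
(A hypothetical sketch of that pattern -- the helper name and details are
illustrative, not the actual ext4 code:)

static int da_reserve_blocks(struct super_block *sb, int nblocks)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	s64 free;

	/* approximate, lock-free read of the global free-blocks counter */
	free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	if (free < nblocks)
		return -ENOSPC;	/* fail at write_begin() time, not in writepages() */
	/* ... record the reservation so writeback cannot overcommit ... */
	return 0;
}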

Mingming