__percpu_counter_add() caches the result in a percpu variable until it
exceeds the batch.
prop_norm_percpu() uses percpu_counter_read(&pl->events) to read the
counter, and percpu_counter_add(&pl->events, -half) to halve it.

There are two potential problems:
1. The counter may go negative.
2. After some calculations, it may turn into a very large positive number.

1.
For example, the batch is 32. When the bdi adds 32 to pl->events,
pl->events->count will be 32. Suppose one of the percpu counters is 1.

In prop_norm_percpu(), half will be 16. Because the per-cpu delta stays
under the batch, pl->events->count won't be modified, and that percpu
counter may drop to -15. If prop_norm_percpu() is called again, half will
still be 16, though it should be 8, and the percpu counter may reach -31.
At this point pl->events->count is still 32.
On the third iteration the percpu counter reaches -47, which exceeds the
batch and is merged into pl->events->count, so pl->events->count goes
negative.

2.
When pl->events->count is negative,
	unsigned long val = percpu_counter_read(&pl->events);
returns a negative number, so val (being unsigned) becomes a huge value.
Because of that overflow, pl->events->count is turned into a large
positive number by the subsequent calculations.

Because of the overflow, I see some very large numerators when calling
prop_fraction_percpu().

I think it should use percpu_counter_sum() instead of
percpu_counter_read() to be more robust.
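To make the failure mode concrete, here is a minimal user-space sketch of
the batching and halving logic described above (the struct and helper are
simplified stand-ins, not the kernel API); it reproduces the -15 / -31 / -47
sequence and the final negative central count:

#include <stdio.h>

/* Simplified stand-in for one CPU's view of a percpu_counter. */
struct fake_counter {
	long count;	/* the central fbc->count          */
	long pcount;	/* this CPU's cached per-cpu delta */
};

/* Mimics __percpu_counter_add(): fold into count only past the batch. */
static void fake_add(struct fake_counter *c, long amount, long batch)
{
	long v = c->pcount + amount;

	if (v >= batch || v <= -batch) {
		c->count += v;
		c->pcount = 0;
	} else {
		c->pcount = v;
	}
}

int main(void)
{
	struct fake_counter c = { .count = 32, .pcount = 1 };	/* scenario above */
	int i;

	for (i = 0; i < 3; i++) {
		/* percpu_counter_read() only sees c.count, so half is always 16. */
		long half = (c.count + 1) >> 1;

		fake_add(&c, -half, 32);
		printf("iter %d: count=%ld pcount=%ld\n", i + 1, c.count, c.pcount);
	}
	/* prints 32/-15, 32/-31, then -15/0: the central count went negative */
	return 0;
}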
diff -Nur a/proportions.c b/proportions.c
--- a/proportions.c 2007-12-12 11:05:59.000000000 +0800
+++ b/proportions.c 2007-12-13 11:05:40.000000000 +0800
@@ -241,7 +241,7 @@
 	 * can never result in a negative number.
 	 */
 	while (pl->period != global_period) {
-		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long val = percpu_counter_sum(&pl->events);
 		unsigned long half = (val + 1) >> 1;
 
 		/*
Here is the relevant code (abridged):
static
void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
{
	/*
	 * ... declarations (period, global_period, flags), the fast-path
	 * period check and spin_lock_irqsave(&pl->lock, flags) elided ...
	 */

	/*
	 * For each missed period, we half the local counter.
	 * basically:
	 *   pl->events >> (global_period - pl->period);
	 *
	 * but since the distributed nature of percpu counters make division
	 * rather hard, use a regular subtraction loop. This is safe, because
	 * the events will only every be incremented, hence the subtraction
	 * can never result in a negative number.
	 */
	while (pl->period != global_period) {
		unsigned long val = percpu_counter_read(&pl->events);
		unsigned long half = (val + 1) >> 1;

		/*
		 * Half of zero won't be much less, break out.
		 * This limits the loop to shift iterations, even
		 * if we missed a million.
		 */
		if (!val)
			break;

		percpu_counter_add(&pl->events, -half);
		pl->period += period;
	}
	pl->period = global_period;
	spin_unlock_irqrestore(&pl->lock, flags);
}
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;
	s32 *pcount;
	int cpu = get_cpu();

	pcount = per_cpu_ptr(fbc->counters, cpu);
	count = *pcount + amount;
	if (count >= batch || count <= -batch) {
		spin_lock(&fbc->lock);
		fbc->count += count;
		*pcount = 0;
		spin_unlock(&fbc->lock);
	} else {
		*pcount = count;
	}
	put_cpu();
}
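For reference, the gap between the two readers is what drives the problem:
percpu_counter_read() only returns the central fbc->count, while
percpu_counter_sum() also folds in every CPU's cached delta, so a plain read
can be off by up to roughly the number of CPUs times the batch. A rough
sketch of the two (simplified from the percpu_counter code of this era, not
a verbatim copy; locking elided):

/* Cheap reader: central count only; may be off by ~nr_cpus * batch. */
static inline s64 percpu_counter_read_sketch(struct percpu_counter *fbc)
{
	return fbc->count;
}

/* Exact but slow reader: walk every CPU's cached delta and add it in. */
static s64 percpu_counter_sum_sketch(struct percpu_counter *fbc)
{
	s64 ret = fbc->count;
	int cpu;

	for_each_online_cpu(cpu)
		ret += *per_cpu_ptr(fbc->counters, cpu);

	return ret;
}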
On Thu, 2007-12-13 at 11:30 +0800, zhejiang wrote:
> [...]
>
> I think it should use percpu_counter_sum() instead of
> percpu_counter_read() to be more robust.
>
> [...]
_sum is unacceptably slow here. I'm thinking something like
bdi_stat_error() as used in balance_dirty_pages() would also solve this
issue, no?
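The bdi_stat_error() idea is essentially an error bound: a batched per-cpu
counter read can be stale by at most the number of possible CPUs times the
batch, so any value below that bound may be mostly noise. A hedged sketch of
the kind of check being suggested (the BDI_STAT_BATCH definition here follows
the 2.6.24-era backing-dev code and should be treated as an assumption):

/* Assumed 2.6.24-era definition, shown for illustration only. */
#define BDI_STAT_BATCH	(8*(1+ilog2(nr_cpu_ids)))

static unsigned long counter_read_with_error_bound(struct percpu_counter *fbc)
{
	unsigned long val = percpu_counter_read(fbc);

	/* Inside the worst-case error margin: pay for an exact sum instead. */
	if (val < nr_cpu_ids * BDI_STAT_BATCH)
		val = percpu_counter_sum(fbc);

	return val;
}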
On Thu, 2007-12-13 at 22:54 +0100, Peter Zijlstra wrote:
> On Thu, 2007-12-13 at 11:30 +0800, zhejiang wrote:
> > [...]
>
> _sum is unacceptably slow here. I'm thinking something like
> bdi_stat_error() as used in balance_dirty_pages() would also solve this
> issue, no?
_sum is slow, but it avoids over-subtraction when the counter is less
than 2 * BDI_STAT_BATCH.

How about doing the calculation as in this patch?

Change val and half to long; if val turns out to be negative, it means
we have subtracted too much, so we actually add some value back.
diff -Nur a/proportions.c b/proportions.c
--- a/proportions.c 2007-12-12 11:05:59.000000000 +0800
+++ b/proportions.c 2007-12-14 16:21:08.000000000 +0800
@@ -241,8 +241,18 @@
 	 * can never result in a negative number.
 	 */
 	while (pl->period != global_period) {
-		unsigned long val = percpu_counter_read(&pl->events);
-		unsigned long half = (val + 1) >> 1;
+		long val = percpu_counter_read(&pl->events);
+		long half;
+
+		if (val >= 0) {
+			half = (val + 1) >> 1;
+		} else {
+			/*
+			 * If val is negative, the counter has been
+			 * subtracted too much.
+			 * Do some compensation here.
+			 */
+			half = (val - 1) / 2;
+		}
 
 		/*
 		 * Half of zero won't be much less, break out.
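The compensation branch relies on C's truncating division: for a negative
val, (val - 1) / 2 is itself negative, so the later
percpu_counter_add(&pl->events, -half) adds value back instead of
subtracting more. A tiny check of the arithmetic, using the -15 value from
the earlier scenario:

#include <stdio.h>

int main(void)
{
	long val  = -15;		/* stale negative read                  */
	long half = (val - 1) / 2;	/* (-16) / 2 == -8, truncating toward 0 */

	/* percpu_counter_add(&pl->events, -half) would then add +8,
	 * nudging the counter back toward zero. */
	printf("half=%ld, -half=%ld\n", half, -half);	/* half=-8, -half=8 */
	return 0;
}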
On Fri, 2007-12-14 at 16:26 +0800, zhejiang wrote:
> On Thu, 2007-12-13 at 22:54 +0100, Peter Zijlstra wrote:
> > [...]
> >
> > _sum is unacceptably slow here. I'm thinking something like
> > bdi_stat_error() as used in balance_dirty_pages() would also solve this
> > issue, no?
>
> _sum is slow, but it avoids over-subtraction when the counter is less
> than 2 * BDI_STAT_BATCH.
>
> How about doing the calculation as in this patch?
>
> Change val and half to long; if val turns out to be negative, it means
> we have subtracted too much, so we actually add some value back.
>
> [...]
How about something like this:
diff --git a/lib/proportions.c b/lib/proportions.c
index 332d8c5..4afb330 100644
--- a/lib/proportions.c
+++ b/lib/proportions.c
@@ -190,6 +190,8 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
  * PERCPU
  */
 
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
 int prop_local_init_percpu(struct prop_local_percpu *pl)
 {
 	spin_lock_init(&pl->lock);
@@ -203,6 +205,16 @@ void prop_local_destroy_percpu(struct prop_local_percpu *pl)
 	percpu_counter_destroy(&pl->events);
 }
 
+static unsigned long prop_percpu_read(struct prop_local_percpu *pl)
+{
+	unsigned long val = percpu_counter_read(&pl->events);
+
+	if (val < (nr_cpu_ids * PROP_BATCH))
+		val = percpu_counter_sum(&pl->events);
+
+	return val;
+}
+
 /*
  * Catch up with missed period expirations.
  *
@@ -241,7 +253,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
 	 * can never result in a negative number.
 	 */
 	while (pl->period != global_period) {
-		unsigned long val = percpu_counter_read(&pl->events);
+		unsigned long val = prop_percpu_read(pl);
 		unsigned long half = (val + 1) >> 1;
 
 		/*
@@ -252,7 +264,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
 		if (!val)
 			break;
 
-		percpu_counter_add(&pl->events, -half);
+		__percpu_counter_add(&pl->events, -half, PROP_BATCH);
 		pl->period += period;
 	}
 	pl->period = global_period;
@@ -267,7 +279,7 @@ void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
 	struct prop_global *pg = prop_get_global(pd);
 
 	prop_norm_percpu(pg, pl);
-	percpu_counter_add(&pl->events, 1);
+	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&pg->events, 1);
 	prop_put_global(pd, pg);
 }
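For a feel of the numbers in this patch, assume 4 possible CPUs (an
arbitrary example): PROP_BATCH becomes 8 * (1 + ilog2(4)) = 24, and the
fallback threshold in prop_percpu_read() is nr_cpu_ids * PROP_BATCH = 96,
i.e. any read below 96 may be dominated by per-cpu error and is replaced by
an exact sum:

#include <stdio.h>

int main(void)
{
	int nr_cpus = 4;			/* assumed example */
	int ilog2_cpus = 2;			/* ilog2(4) == 2   */
	int prop_batch = 8 * (1 + ilog2_cpus);	/* 24              */

	/* Reads below this bound fall back to percpu_counter_sum(). */
	printf("PROP_BATCH=%d error_bound=%d\n",
	       prop_batch, nr_cpus * prop_batch);	/* 24 96 */
	return 0;
}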
On Fri, 2007-12-14 at 09:47 +0100, Peter Zijlstra wrote:
> How about something like this:
>
> diff --git a/lib/proportions.c b/lib/proportions.c
> index 332d8c5..4afb330 100644
> --- a/lib/proportions.c
> +++ b/lib/proportions.c
> @@ -190,6 +190,8 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
> * PERCPU
> */
>
> +#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
> +
> int prop_local_init_percpu(struct prop_local_percpu *pl)
> {
> spin_lock_init(&pl->lock);
> @@ -203,6 +205,16 @@ void prop_local_destroy_percpu(struct prop_local_percpu *pl)
> percpu_counter_destroy(&pl->events);
> }
>
> +static unsigned long prop_percpu_read(struct prop_local_percpu *pl)
> +{
> + unsigned long val = percpu_counter_read(&pl->events);
> +
> + if (val < (nr_cpu_ids * PROP_BATCH))
> + val = percpu_counter_sum(&pl->events);
> +
> + return val;
> +}
> +
> /*
> * Catch up with missed period expirations.
> *
> @@ -241,7 +253,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
> * can never result in a negative number.
> */
> while (pl->period != global_period) {
> - unsigned long val = percpu_counter_read(&pl->events);
> + unsigned long val = prop_percpu_read(pl);
> unsigned long half = (val + 1) >> 1;
>
> /*
> @@ -252,7 +264,7 @@ void prop_norm_percpu(struct prop_global *pg, struct prop_local_percpu *pl)
> if (!val)
> break;
>
> - percpu_counter_add(&pl->events, -half);
> + __percpu_counter_add(&pl->events, -half, PROP_BATCH);
> pl->period += period;
> }
> pl->period = global_period;
> @@ -267,7 +279,7 @@ void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl)
> struct prop_global *pg = prop_get_global(pd);
>
> prop_norm_percpu(pg, pl);
> - percpu_counter_add(&pl->events, 1);
> + __percpu_counter_add(&pl->events, 1, PROP_BATCH);
> percpu_counter_add(&pg->events, 1);
> prop_put_global(pd, pg);
> }
>
Might as well stack something like this on top:
Index: linux-2.6/lib/proportions.c
===================================================================
--- linux-2.6.orig/lib/proportions.c
+++ linux-2.6/lib/proportions.c
@@ -242,31 +242,19 @@ void prop_norm_percpu(struct prop_global
 
 	spin_lock_irqsave(&pl->lock, flags);
 	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+
 	/*
 	 * For each missed period, we half the local counter.
 	 * basically:
 	 *   pl->events >> (global_period - pl->period);
-	 *
-	 * but since the distributed nature of percpu counters make division
-	 * rather hard, use a regular subtraction loop. This is safe, because
-	 * the events will only every be incremented, hence the subtraction
-	 * can never result in a negative number.
 	 */
-	while (pl->period != global_period) {
-		unsigned long val = prop_local_read_percpu(pl);
-		unsigned long half = (val + 1) >> 1;
-
-		/*
-		 * Half of zero won't be much less, break out.
-		 * This limits the loop to shift iterations, even
-		 * if we missed a million.
-		 */
-		if (!val)
-			break;
-
-		__percpu_counter_add(&pl->events, -half, PROP_BATCH);
-		pl->period += period;
-	}
+	period = (global_period - pl->period) >> (pg->shift - 1);
+	if (period < BITS_PER_LONG) {
+		s64 val = (u64)prop_local_read_percpu(pl);
+		__percpu_counter_add(&pl->events, -val + (val >> period), PROP_BATCH);
+	} else
+		percpu_counter_set(&pl->events, 0);
+
 	pl->period = global_period;
 	spin_unlock_irqrestore(&pl->lock, flags);
 }
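The replacement collapses the per-period halving loop into one batched
update: after adding -val + (val >> period) the counter sits at
val >> period, i.e. the value halved once per missed period, so the loop
(and its chance of repeated over-subtraction) disappears. A quick user-space
check of that identity:

#include <stdio.h>

int main(void)
{
	unsigned long val = 1000;	/* current local event count (example) */
	int missed = 3;			/* missed periods                      */

	/* One batched add of (-val + (val >> missed)) ...                    */
	long after = (long)val + (-(long)val + (long)(val >> missed));

	/* ... leaves the same value as halving 'missed' times.               */
	printf("after=%ld val>>missed=%lu\n", after, val >> missed);	/* 125 125 */
	return 0;
}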
Subject: lib: proportion: fix underflow in prop_norm_percpu()
Zhe Jiang noticed that it's possible to underflow pl->events in
prop_norm_percpu() when the value returned by percpu_counter_read() is less
than the error on that read and the period delay is > 1. In that case half
might not trigger the batch increment, and the value will be identical on
the next iteration, causing the same half to be subtracted again and again.
Fix this by rewriting the division as a single subtraction instead of a
subtraction loop and using percpu_counter_sum() when the value returned
by percpu_counter_read() is smaller than the error.
The latter is still needed if we want pl->events to shrink properly in the
error region.
Jiang, can I get a Reviewed-by from you? - if you agree that is :-)
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: zhejiang <[email protected]>
---
lib/proportions.c | 36 +++++++++++++++---------------------
1 file changed, 15 insertions(+), 21 deletions(-)
Index: linux-2.6/lib/proportions.c
===================================================================
--- linux-2.6.orig/lib/proportions.c
+++ linux-2.6/lib/proportions.c
@@ -190,6 +190,8 @@ prop_adjust_shift(int *pl_shift, unsigne
  * PERCPU
  */
 
+#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
 int prop_local_init_percpu(struct prop_local_percpu *pl)
 {
 	spin_lock_init(&pl->lock);
@@ -230,31 +232,23 @@ void prop_norm_percpu(struct prop_global
 
 	spin_lock_irqsave(&pl->lock, flags);
 	prop_adjust_shift(&pl->shift, &pl->period, pg->shift);
+
 	/*
 	 * For each missed period, we half the local counter.
 	 * basically:
 	 *   pl->events >> (global_period - pl->period);
-	 *
-	 * but since the distributed nature of percpu counters make division
-	 * rather hard, use a regular subtraction loop. This is safe, because
-	 * the events will only every be incremented, hence the subtraction
-	 * can never result in a negative number.
 	 */
-	while (pl->period != global_period) {
-		unsigned long val = percpu_counter_read(&pl->events);
-		unsigned long half = (val + 1) >> 1;
-
-		/*
-		 * Half of zero won't be much less, break out.
-		 * This limits the loop to shift iterations, even
-		 * if we missed a million.
-		 */
-		if (!val)
-			break;
-
-		percpu_counter_add(&pl->events, -half);
-		pl->period += period;
-	}
+	period = (global_period - pl->period) >> (pg->shift - 1);
+	if (period < BITS_PER_LONG) {
+		s64 val = percpu_counter_read(&pl->events);
+
+		if (val < (nr_cpu_ids * PROP_BATCH))
+			val = percpu_counter_sum(&pl->events);
+
+		__percpu_counter_add(&pl->events, -val + (val >> period), PROP_BATCH);
+	} else
+		percpu_counter_set(&pl->events, 0);
+
 	pl->period = global_period;
 	spin_unlock_irqrestore(&pl->lock, flags);
 }
@@ -267,7 +261,7 @@ void __prop_inc_percpu(struct prop_descr
 	struct prop_global *pg = prop_get_global(pd);
 
 	prop_norm_percpu(pg, pl);
-	percpu_counter_add(&pl->events, 1);
+	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&pg->events, 1);
 	prop_put_global(pd, pg);
 }
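Tracing the original report through this fixed path, with the same assumed
4-CPU / PROP_BATCH = 24 numbers as before: the cheap read returns 32, which
is under the 96 error bound, so percpu_counter_sum() yields the true 33; for,
say, 3 missed periods the single add is -33 + (33 >> 3) = -29, which exceeds
the batch and folds cleanly into the central count, leaving 33 >> 3 = 4
instead of anything negative. In user-space stand-in form:

#include <stdio.h>

int main(void)
{
	/* Same stand-in state as the first sketch: count=32, one pcpu delta=1. */
	long count = 32, pcount = 1;
	long prop_batch = 24;			/* assumed: 4 CPUs            */
	int missed = 3;				/* example period delay       */

	long val = count + pcount;		/* percpu_counter_sum() -> 33 */
	long delta = -val + (val >> missed);	/* -33 + 4 = -29              */

	/* __percpu_counter_add() with batch 24: |1 + -29| >= 24, so it folds. */
	long v = pcount + delta;
	if (v >= prop_batch || v <= -prop_batch) {
		count += v;
		pcount = 0;
	} else {
		pcount = v;
	}
	printf("count=%ld pcount=%ld\n", count, pcount);	/* count=4 pcount=0 */
	return 0;
}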
On Fri, 2007-12-14 at 17:01 +0100, Peter Zijlstra wrote:
> Subject: lib: proportion: fix underflow in prop_norm_percpu()
>
> [...]
>
> Jiang, can I get a Reviewed-by from you? - if you agree that is :-)
>
> [...]
It's okay! Thanks!
On Fri, 2007-12-14 at 17:01 +0100, Peter Zijlstra wrote:
> Subject: lib: proportion: fix underflow in prop_norm_percpu()
>
> [...]
>
> Jiang, can I get a Reviewed-by from you? - if you agree that is :-)
>
> [...]
Reviewed-by: Jiang Zhe <[email protected]>
Thanks!