2010-12-10 20:39:01

by Peter Zijlstra

Subject: [PATCH] sched: Fix the irqtime code to deal with u64 wraps

OK, so here's the latest version, using fancy __this_cpu thingies.

I started a new thread since the old one was quite unwieldy.

Now, admittedly this patch is a tad large, esp for -rc5. So either we
need lots of Reviewed-by and such or I need to shrink this patch
somehow.


---
Subject: sched: Fix the irqtime code to deal with u64 wraps
From: Peter Zijlstra <[email protected]>
Date: Thu Dec 09 14:15:34 CET 2010

ARM systems have a 32bit sched_clock() [ which needs to be fixed ],
but this exposed a bug in the irq_time code as well, it doesn't deal
with wraps at all.

Fix the irq_time code to deal with u64 wraps by re-writing the code to
only use delta increments, which avoids the whole issue.

Furthermore, solve the problem of 32bit arches reading partial updates
of the u64 time values.

Cc: Venkatesh Pallipadi <[email protected]>
Reported-by: Mikael Pettersson <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/sched.c | 176 +++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 121 insertions(+), 55 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -636,22 +636,16 @@ static inline struct task_group *task_gr

#endif /* CONFIG_CGROUP_SCHED */

-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
-
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;

- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update)
+ return;

- sched_irq_time_avg_update(rq, irq_time);
- }
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}

/*
@@ -1918,90 +1912,162 @@ static void deactivate_task(struct rq *r
#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
+ * There are no locks covering percpu hardirq/softirq time. They are only
+ * modified in account_system_vtime, on corresponding CPU with interrupts
+ * disabled. So, writes are safe.
+ *
* They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ *
+ * This may result in other CPU reading this CPU's irq time and can race with
+ * irq/account_system_vtime on this CPU. We would either get old or new value
+ * with a side effect of accounting a slice of irq time to wrong task when irq
+ * is in progress while we read rq->clock. That is a worthy compromise in place
+ * of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;

-void enable_sched_clock_irqtime(void)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
{
- sched_clock_irqtime = 1;
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
}

-void disable_sched_clock_irqtime(void)
+static inline void irq_time_write_end(void)
{
- sched_clock_irqtime = 0;
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
}

-static u64 irq_time_cpu(int cpu)
+static inline u64 irq_time_read(int cpu)
{
- if (!sched_clock_irqtime)
- return 0;
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}

+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */
+
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}

+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;

if (!sched_clock_irqtime)
return;

local_irq_save(flags);
-
cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
/*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
+ * We do not account for softirq time from ksoftirqd here. We want to
+ * continue accounting softirq time to ksoftirqd thread in that case,
+ * so as not to confuse scheduler with a special task that do not
+ * consume any time, but still wants to run.
*/
- if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_softirq_time, delta);

+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static u64 irq_time_cpu(struct rq *rq)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ /*
+ * See the comment in update_rq_clock_task(), ideally we'd update
+ * the *irq_time values using rq->clock here.
+ */
+ return irq_time_read(cpu_of(rq));
}

-#else
-
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- return 0;
+ s64 irq_delta;
+
+ irq_delta = irq_time_cpu(rq) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */

-#endif
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+ rq->clock_task += delta;
+}
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#include "sched_idletask.c"
#include "sched_fair.c"


2010-12-10 22:10:03

by Venkatesh Pallipadi

Subject: Re: [PATCH] sched: Fix the irqtime code to deal with u64 wraps

On Fri, Dec 10, 2010 at 12:38 PM, Peter Zijlstra <[email protected]> wrote:
> OK, so here's the latest version, using fancy __this_cpu thingies.
>
> I started a new thread since the old one was quite unwieldy.
>
> Now, admittedly this patch is a tad large, esp for -rc5. So either we
> need lots of Reviewed-by and such or I need to shrink this patch
> somehow.
>

- Needs a prototype declaration for
static void update_rq_clock_task(struct rq *rq, s64 delta)
before first use.
- I guess the irq_time_cpu() function can be eliminated, making the patch a
bit smaller.
- Maybe split this into two patches: the first to handle the 64-bit overflow and
the second to handle the 32-bit access of these u64 variables?

Otherwise, change looks good and works OK with my quick tests.

Reviewed-by: Venkatesh Pallipadi <[email protected]>


2010-12-11 02:23:04

by Nicolas Pitre

Subject: Re: [PATCH] sched: Fix the irqtime code to deal with u64 wraps

On Fri, 10 Dec 2010, Peter Zijlstra wrote:

> OK, so here's the latest version, using fancy __this_cpu thingies.
>
> I started a new thread since the old one was quite unwieldy.
>
> Now, admittedly this patch is a tad large, esp for -rc5. So either we
> need lots of Reviewed-by and such or I need to shrink this patch
> somehow.
>
>
> ---
> Subject: sched: Fix the irqtime code to deal with u64 wraps
> From: Peter Zijlstra <[email protected]>
> Date: Thu Dec 09 14:15:34 CET 2010
>
> ARM systems have a 32bit sched_clock() [ which needs to be fixed ],
> but this exposed a bug in the irq_time code as well, it doesn't deal
> with wraps at all.
[...]

That would be closer to reality if you stated "Some ARM systems have
a 32bit sched_clock() [" above. Most ARM systems in use today have a
sched_clock() with more than 32 bits, like 55 bits or so.


Nicolas

2010-12-12 13:03:17

by Mikael Pettersson

Subject: Re: [PATCH] sched: Fix the irqtime code to deal with u64 wraps

On Fri, 10 Dec 2010 21:38:26 +0100, Peter Zijlstra <[email protected]> wrote:
> OK, so here's the latest version, using fancy __this_cpu thingies.
>
> I started a new thread since the old one was quite unwieldy.
>
> Now, admittedly this patch is a tad large, esp for -rc5. So either we
> need lots of Reviewed-by and such or I need to shrink this patch
> somehow.
>
>
> ---
> Subject: sched: Fix the irqtime code to deal with u64 wraps
> From: Peter Zijlstra <[email protected]>
> Date: Thu Dec 09 14:15:34 CET 2010
>
> ARM systems have a 32bit sched_clock() [ which needs to be fixed ],
> but this exposed a bug in the irq_time code as well, it doesn't deal
> with wraps at all.
>
> Fix the irq_time code to deal with u64 wraps by re-writing the code to
> only use delta increments, which avoids the whole issue.
>
> Furthermore, solve the problem of 32bit arches reading partial updates
> of the u64 time values.
>
> Cc: Venkatesh Pallipadi <[email protected]>
> Reported-by: Mikael Pettersson <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>

Tested 2.6.37-rc5 + this patch on ARM/mach-ixp4xx and ARM/mach-iop32x,
and it did solve the previously reported interactivity problems. Thanks.

Tested-by: Mikael Pettersson <[email protected]>

I did however have to apply the following to avoid a compilation error:

--- linux-2.6.37-rc5/kernel/sched.c.~1~ 2010-12-12 13:03:07.000000000 +0100
+++ linux-2.6.37-rc5/kernel/sched.c 2010-12-12 13:05:40.000000000 +0100
@@ -636,6 +636,8 @@ static inline struct task_group *task_gr

#endif /* CONFIG_CGROUP_SCHED */

+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
static void update_rq_clock(struct rq *rq)
{
s64 delta;


2010-12-13 12:14:54

by Peter Zijlstra

Subject: [PATCH 2/2] sched: Fix the irqtime code for 32bit

Subject: sched: Fix the irqtime code for 32bit
From: Peter Zijlstra <[email protected]>
Date: Thu Dec 09 14:15:34 CET 2010

Since the irqtime accounting is using non-atomic u64 and can be read
from remote cpus (writes are strictly cpu local, reads are not) we
have to deal with observing partial updates.

When we do observe partial updates the clock movement (in particular,
->clock_task movement) will go funny (in either direction), a
subsequent clock update (observing the full update) will make it go
funny in the opposite direction.

Since we rely on these clocks to be strictly monotonic we cannot
suffer backwards motion. One possible solution would be to simply
ignore all backwards deltas, but that will lead to accounting
artefacts, most notable: clock_task + irq_time != clock, this
inaccuracy would end up in user visible stats.

Therefore serialize the reads using a seqcount.

Reviewed-by: Venkatesh Pallipadi <[email protected]>
Reported-by: Mikael Pettersson <[email protected]>
Tested-by: Mikael Pettersson <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <new-submission>
---
kernel/sched.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 45 insertions(+), 6 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -1920,10 +1920,9 @@ static void deactivate_task(struct rq *r
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1941,10 +1940,48 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}

-static inline u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */

/*
* Called before incrementing preempt_count on {soft,}irq_enter
@@ -1965,6 +2002,7 @@ void account_system_vtime(struct task_st
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);

+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,6 +2014,7 @@ void account_system_vtime(struct task_st
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
__this_cpu_add(cpu_softirq_time, delta);

+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -1984,7 +2023,7 @@ static void update_rq_clock_task(struct
{
s64 irq_delta;

- irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
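
To make the torn-read scenario from the changelog above concrete: on a 32-bit machine a u64 store is performed as two word-sized stores, so a reader on another CPU can pair a fresh half with a stale one and observe the counter jump forward or backward by roughly 2^32 - exactly the "funny" clock movement the seqcount guards against. A small standalone sketch of the arithmetic (illustrative only, not kernel code; names are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t before = 0x00000000ffffffffULL;	/* counter before the update */
	uint64_t after  = 0x0000000100000000ULL;	/* counter after adding 1    */

	/* fresh high half combined with the stale low half: far too large */
	uint64_t torn_hi = (after & 0xffffffff00000000ULL) | (before & 0xffffffffULL);

	/* stale high half combined with the fresh low half: goes backwards */
	uint64_t torn_lo = (before & 0xffffffff00000000ULL) | (after & 0xffffffffULL);

	printf("real step %+lld, torn reads %+lld and %+lld\n",
	       (long long)(after - before),
	       (long long)(torn_hi - before),
	       (long long)(torn_lo - before));
	/* prints: real step +1, torn reads +4294967296 and -4294967295 */
	return 0;
}

The read_seqcount_retry() loop in the patch simply discards such mixed values by retrying whenever the writer's sequence count changed.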

2010-12-13 12:18:48

by Peter Zijlstra

Subject: [PATCH 1/2] sched: Fix the irqtime code to deal with u64 wraps

Subject: sched: Fix the irqtime code to deal with u64 wraps
From: Peter Zijlstra <[email protected]>
Date: Thu Dec 09 14:15:34 CET 2010

Some ARM systems have a short sched_clock() [ which needs to be fixed
too ], but this exposed a bug in the irq_time code as well, it doesn't
deal with wraps at all.

Fix the irq_time code to deal with u64 wraps by re-writing the code to
only use delta increments, which avoids the whole issue.

Reviewed-by: Venkatesh Pallipadi <[email protected]>
Reported-by: Mikael Pettersson <[email protected]>
Tested-by: Mikael Pettersson <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <new-submission>
---
kernel/sched.c | 83 ++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 50 insertions(+), 33 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -636,23 +636,18 @@ static inline struct task_group *task_gr

#endif /* CONFIG_CGROUP_SCHED */

-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);

-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;

if (rq->skip_clock_update)
return;

- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
-
- sched_irq_time_avg_update(rq, irq_time);
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}

/*
@@ -1946,19 +1941,20 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}

-static u64 irq_time_cpu(int cpu)
+static inline u64 irq_time_cpu(int cpu)
{
- if (!sched_clock_irqtime)
- return 0;
-
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}

+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;

if (!sched_clock_irqtime)
return;
@@ -1966,9 +1962,9 @@ void account_system_vtime(struct task_st
local_irq_save(flags);

cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,33 +1972,54 @@ void account_system_vtime(struct task_st
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_softirq_time, delta);

local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}

-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */

-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- return 0;
+ rq->clock_task += delta;
}

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#include "sched_idletask.c"
#include "sched_fair.c"

2010-12-15 01:14:32

by Daniel Kopko

Subject: RE: [PATCH 1/2] sched: Fix the irqtime code to deal with u64 wraps


Sorry for the lack of quoted text and lack of In-Reply-To headers. (I am
replying from a webmail account in reference to the following message:
http://marc.info/?l=linux-arm-kernel&m=129224292425624&w=2). It seems that
the inline modifier was perhaps dropped unintentionally in this patch set when
converting update_rq_clock to static linkage.


-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)


The rest of the functions which had static linkage added retained
the inline modifier.

This is my first message to LKML. Can anyone recommend a decent free
email provider that will work well here? Also, please CC me directly if
convenient for any replies.

Thanks,

Daniel Kopko


2010-12-15 18:16:30

by Venkatesh Pallipadi

Subject: Re: [PATCH 1/2] sched: Fix the irqtime code to deal with u64 wraps

Peter,

This looks like something that happened while splitting this into two
patches. I needed a trivial change like below before I could apply
these two patches on linus-git.

Thanks,
Venki
---
@@ -641,17 +641,18 @@ static void sched_irq_time_avg_update(struct rq
*rq, u64 irq_time);

inline void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update) {
- int cpu = cpu_of(rq);
- u64 irq_time;
+ int cpu = cpu_of(rq);
+ u64 irq_time;

- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
+ if (rq->skip_clock_update)
+ return;

- sched_irq_time_avg_update(rq, irq_time);
- }
+ rq->clock = sched_clock_cpu(cpu);
+ irq_time = irq_time_cpu(cpu);
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
+
+ sched_irq_time_avg_update(rq, irq_time);
}

/*

---


2010-12-15 18:24:33

by Peter Zijlstra

Subject: Re: [PATCH 1/2] sched: Fix the irqtime code to deal with u64 wraps

On Wed, 2010-12-15 at 10:16 -0800, Venkatesh Pallipadi wrote:
> Peter,
>
> This looks like something that happened while splitting this into two
> patches. I needed a trivial change like below before I could apply
> these two patches on linus-git.
>
> Thanks,
> Venki
> ---
> @@ -641,17 +641,18 @@ static void sched_irq_time_avg_update(struct rq
> *rq, u64 irq_time);
>
> inline void update_rq_clock(struct rq *rq)
> {
> - if (!rq->skip_clock_update) {
> - int cpu = cpu_of(rq);
> - u64 irq_time;
> + int cpu = cpu_of(rq);
> + u64 irq_time;
>
> - rq->clock = sched_clock_cpu(cpu);
> - irq_time = irq_time_cpu(cpu);
> - if (rq->clock - irq_time > rq->clock_task)
> - rq->clock_task = rq->clock - irq_time;
> + if (rq->skip_clock_update)
> + return;
>
> - sched_irq_time_avg_update(rq, irq_time);
> - }
> + rq->clock = sched_clock_cpu(cpu);
> + irq_time = irq_time_cpu(cpu);
> + if (rq->clock - irq_time > rq->clock_task)
> + rq->clock_task = rq->clock - irq_time;
> +
> + sched_irq_time_avg_update(rq, irq_time);
> }
>
> /*


That's due to another patch in tip/sched/urgent

2010-12-16 12:31:55

by Peter Zijlstra

Subject: [tip:sched/urgent] sched: Fix the irqtime code to deal with u64 wraps

Commit-ID: fe44d62122829959e960bc699318d58966922a69
Gitweb: http://git.kernel.org/tip/fe44d62122829959e960bc699318d58966922a69
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 9 Dec 2010 14:15:34 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 16 Dec 2010 11:17:46 +0100

sched: Fix the irqtime code to deal with u64 wraps

Some ARM systems have a short sched_clock() [ which needs to be fixed
too ], but this exposed a bug in the irq_time code as well, it doesn't
deal with wraps at all.

Fix the irq_time code to deal with u64 wraps by re-writing the code to
only use delta increments, which avoids the whole issue.

Reviewed-by: Venkatesh Pallipadi <[email protected]>
Reported-by: Mikael Pettersson <[email protected]>
Tested-by: Mikael Pettersson <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <1292242433.6803.199.camel@twins>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 83 +++++++++++++++++++++++++++++++++----------------------
1 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index da14302..79b557c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,23 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)

#endif /* CONFIG_CGROUP_SCHED */

-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);

-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
{
- int cpu = cpu_of(rq);
- u64 irq_time;
+ s64 delta;

if (rq->skip_clock_update)
return;

- rq->clock = sched_clock_cpu(cpu);
- irq_time = irq_time_cpu(cpu);
- if (rq->clock - irq_time > rq->clock_task)
- rq->clock_task = rq->clock - irq_time;
-
- sched_irq_time_avg_update(rq, irq_time);
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
}

/*
@@ -1946,19 +1941,20 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}

-static u64 irq_time_cpu(int cpu)
+static inline u64 irq_time_cpu(int cpu)
{
- if (!sched_clock_irqtime)
- return 0;
-
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}

+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
void account_system_vtime(struct task_struct *curr)
{
unsigned long flags;
+ s64 delta;
int cpu;
- u64 now, delta;

if (!sched_clock_irqtime)
return;
@@ -1966,9 +1962,9 @@ void account_system_vtime(struct task_struct *curr)
local_irq_save(flags);

cpu = smp_processor_id();
- now = sched_clock_cpu(cpu);
- delta = now - per_cpu(irq_start_time, cpu);
- per_cpu(irq_start_time, cpu) = now;
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,33 +1972,54 @@ void account_system_vtime(struct task_struct *curr)
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
- per_cpu(cpu_hardirq_time, cpu) += delta;
+ __this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
- per_cpu(cpu_softirq_time, cpu) += delta;
+ __this_cpu_add(cpu_softirq_time, delta);

local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
- u64 delta_irq = curr_irq_time - rq->prev_irq_time;
- rq->prev_irq_time = curr_irq_time;
- sched_rt_avg_update(rq, delta_irq);
- }
+ s64 irq_delta;
+
+ irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ rq->clock_task += delta;
+
+ if (irq_delta && sched_feat(NONIRQ_POWER))
+ sched_rt_avg_update(rq, irq_delta);
}

-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */

-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
{
- return 0;
+ rq->clock_task += delta;
}

-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

#include "sched_idletask.c"
#include "sched_fair.c"

2010-12-16 12:32:13

by Peter Zijlstra

Subject: [tip:sched/urgent] sched: Fix the irqtime code for 32bit

Commit-ID: 8e92c20183ed0579d94501311b81c42b65cb2129
Gitweb: http://git.kernel.org/tip/8e92c20183ed0579d94501311b81c42b65cb2129
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 9 Dec 2010 14:15:34 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 16 Dec 2010 11:17:47 +0100

sched: Fix the irqtime code for 32bit

Since the irqtime accounting is using non-atomic u64 and can be read
from remote cpus (writes are strictly cpu local, reads are not) we
have to deal with observing partial updates.

When we do observe partial updates the clock movement (in particular,
->clock_task movement) will go funny (in either direction), a
subsequent clock update (observing the full update) will make it go
funny in the opposite direction.

Since we rely on these clocks to be strictly monotonic we cannot
suffer backwards motion. One possible solution would be to simply
ignore all backwards deltas, but that will lead to accounting
artefacts, most notable: clock_task + irq_time != clock, this
inaccuracy would end up in user visible stats.

Therefore serialize the reads using a seqcount.

Reviewed-by: Venkatesh Pallipadi <[email protected]>
Reported-by: Mikael Pettersson <[email protected]>
Tested-by: Mikael Pettersson <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <1292242434.6803.200.camel@twins>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 79b557c..456c990 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
* They are read and saved off onto struct rq in update_rq_clock().
* This may result in other CPU reading this CPU's irq time and can
* race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
*/
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1941,10 +1940,48 @@ void disable_sched_clock_irqtime(void)
sched_clock_irqtime = 0;
}

-static inline u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
{
return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}
+#endif /* CONFIG_64BIT */

/*
* Called before incrementing preempt_count on {soft,}irq_enter
@@ -1965,6 +2002,7 @@ void account_system_vtime(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);

+ irq_time_write_begin();
/*
* We do not account for softirq time from ksoftirqd here.
* We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,6 +2014,7 @@ void account_system_vtime(struct task_struct *curr)
else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
__this_cpu_add(cpu_softirq_time, delta);

+ irq_time_write_end();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -1984,7 +2023,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
{
s64 irq_delta;

- irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into