This patch allows the user to migrate currently queued
standard timers from one cpu to another, thereby reducing
timer induced latency on the chosen cpu. Timers that
were placed with add_timer_on() are considered to have
'cpu affinity' and are not moved.
The changes in drivers/base/cpu.c provide a clean and
convenient interface for triggering the migration through
sysfs, via writing the destination cpu number to a file
associated with the source cpu.
Note that migrating timers will not, by itself, keep new
timers off of the chosen cpu. But with careful control of
thread affinity, one can control the affinity of new timers
and keep timer induced latencies off of the chosen cpu.
This particular patch does not affect the hrtimers. That
could be addressed later.
Signed-off-by: Dimitri Sivanich <[email protected]>
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -147,6 +147,7 @@ void fastcall init_timer(struct timer_li
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
+ timer->aff = 0;
}
EXPORT_SYMBOL(init_timer);
@@ -250,6 +251,7 @@ void add_timer_on(struct timer_list *tim
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
timer->base = base;
+ timer->aff = 1; /* Don't migrate */
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -1661,6 +1663,52 @@ static void __devinit migrate_timers(int
}
#endif /* CONFIG_HOTPLUG_CPU */
+static void move_timer_list(tvec_base_t *new_base, struct list_head *head)
+{
+ struct timer_list *timer, *t;
+
+ list_for_each_entry_safe(timer, t, head, entry) {
+ if (timer->aff)
+ continue;
+ detach_timer(timer, 0);
+ timer->base = new_base;
+ internal_add_timer(new_base, timer);
+ }
+}
+
+int move_timers(int cpu, int dest)
+{
+ tvec_base_t *old_base;
+ tvec_base_t *new_base;
+ unsigned long flags;
+ int i;
+
+ if (cpu == dest)
+ return -EINVAL;
+
+ if (!cpu_online(cpu) || !cpu_online(dest))
+ return -EINVAL;
+
+ old_base = per_cpu(tvec_bases, cpu);
+ new_base = per_cpu(tvec_bases, dest);
+
+ spin_lock_irqsave(&new_base->lock, flags);
+ spin_lock(&old_base->lock);
+
+ for (i = 0; i < TVR_SIZE; i++)
+ move_timer_list(new_base, old_base->tv1.vec + i);
+ for (i = 0; i < TVN_SIZE; i++) {
+ move_timer_list(new_base, old_base->tv2.vec + i);
+ move_timer_list(new_base, old_base->tv3.vec + i);
+ move_timer_list(new_base, old_base->tv4.vec + i);
+ move_timer_list(new_base, old_base->tv5.vec + i);
+ }
+
+ spin_unlock(&old_base->lock);
+ spin_unlock_irqrestore(&new_base->lock, flags);
+ return 0;
+}
+
static int __cpuinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
Index: linux/drivers/base/cpu.c
===================================================================
--- linux.orig/drivers/base/cpu.c
+++ linux/drivers/base/cpu.c
@@ -54,6 +54,26 @@ static ssize_t store_online(struct sys_d
}
static SYSDEV_ATTR(online, 0600, show_online, store_online);
+static ssize_t store_migrate(struct sys_device *dev, const char *buf,
+ size_t count)
+{
+ unsigned int cpu = dev->id;
+ unsigned long dest;
+ int rc;
+
+ dest = simple_strtoul(buf, NULL, 10);
+ if (dest > INT_MAX)
+ return -EINVAL;
+
+ rc = move_timers(cpu, dest);
+ if (rc < 0)
+ return rc;
+ else
+ return count;
+}
+
+static SYSDEV_ATTR(timer_migrate, 0200, NULL, store_migrate);
+
static void __devinit register_cpu_control(struct cpu *cpu)
{
sysdev_create_file(&cpu->sysdev, &attr_online);
@@ -62,6 +82,8 @@ void unregister_cpu(struct cpu *cpu)
{
int logical_cpu = cpu->sysdev.id;
+ sysdev_remove_file(&cpu->sysdev, &attr_timer_migrate);
+
unregister_cpu_under_node(logical_cpu, cpu_to_node(logical_cpu));
sysdev_remove_file(&cpu->sysdev, &attr_online);
@@ -124,6 +146,8 @@ int __devinit register_cpu(struct cpu *c
cpu_sys_devices[num] = &cpu->sysdev;
if (!error)
register_cpu_under_node(num, cpu_to_node(num));
+ if (!error)
+ sysdev_create_file(&cpu->sysdev, &attr_timer_migrate);
#ifdef CONFIG_KEXEC
if (!error)
Index: linux/include/linux/timer.h
===================================================================
--- linux.orig/include/linux/timer.h
+++ linux/include/linux/timer.h
@@ -15,6 +15,8 @@ struct timer_list {
unsigned long data;
struct tvec_t_base_s *base;
+
+ short aff;
};
extern struct tvec_t_base_s boot_tvec_bases;
@@ -24,6 +26,7 @@ extern struct tvec_t_base_s boot_tvec_ba
.expires = (_expires), \
.data = (_data), \
.base = &boot_tvec_bases, \
+ .aff = 0, \
}
#define DEFINE_TIMER(_name, _function, _expires, _data) \
@@ -95,6 +98,7 @@ static inline void add_timer(struct time
extern void init_timers(void);
extern void run_local_timers(void);
+extern int move_timers(int, int);
struct hrtimer;
extern int it_real_fn(struct hrtimer *);
>>>>> "Dimitri" == Dimitri Sivanich <[email protected]> writes:
Dimitri> This patch allows the user to migrate currently queued
Dimitri> standard timers from one cpu to another, thereby reducing
Dimitri> timer induced latency on the chosen cpu. Timers that were
Dimitri> placed with add_timer_on() are considered to have 'cpu
Dimitri> affinity' and are not moved.
Dimitri> The changes in drivers/base/cpu.c provide a clean and
Dimitri> convenient interface for triggering the migration through
Dimitri> sysfs, via writing the destination cpu number to a file
Dimitri> associated with the source cpu.
Hi Dimitri,
I just took a quick look at your patch, and at least on the surface it
looks pretty nice to me.
One minor nit, why choose short for the affinity field in struct
timer_list, it seems a strange size to pick for something which is
either 0 or 1. Wouldn't int or char be better? I don't know if all
CPUs have 16 bit stores, but they should have 8 and 32 bit.
The name 'aff' for affinity might not be good either, since we tend to
refer to affinity as a mask specifying where it's locked to, maybe
'locked' would be better?
All in the nit-picking department though.
Cheers,
Jes
Index: linux/include/linux/timer.h
===================================================================
--- linux.orig/include/linux/timer.h
+++ linux/include/linux/timer.h
@@ -15,6 +15,8 @@ struct timer_list {
unsigned long data;
struct tvec_t_base_s *base;
+
+ short aff;
};
Hi Jes,
On Thu, Sep 14, 2006 at 10:11:39AM -0400, Jes Sorensen wrote:
> Hi Dimitri,
>
> I just took a quick look at your patch, and at least on the surface it
> looks pretty nice to me.
>
> One minor nit, why choose short for the affinity field in struct
> timer_list, it seems a strange size to pick for something which is
> either 0 or 1. Wouldn't int or char be better? I don't know if all
> CPUs have 16 bit stores, but they should have 8 and 32 bit.
Yes, you're probably right. I would have no problem with this being
changed to a 'char'.
>
> The name 'aff' for affinity might not be good either, since we tend to
> refer to affinity as a mask specifying where it's locked to, maybe
> 'locked' would be better?
A field name of 'locked' would be OK with me.
>
> All in the nit-picking department though.
>
> Cheers,
> Jes
>
>
> Index: linux/include/linux/timer.h
> ===================================================================
> --- linux.orig/include/linux/timer.h
> +++ linux/include/linux/timer.h
> @@ -15,6 +15,8 @@ struct timer_list {
> unsigned long data;
>
> struct tvec_t_base_s *base;
> +
> + short aff;
> };
>
On Thu, 14 Sep 2006 08:29:17 -0500
Dimitri Sivanich <[email protected]> wrote:
> This patch allows the user to migrate currently queued
> standard timers from one cpu to another, thereby reducing
> timer induced latency on the chosen cpu.
Need more details, please. Why would a user want to do that?
Performance-related, I assume? If so, some numbers would be nice.
What is the use-case?
Why was a sysfs file chosen as the user inteface?
What are the permissions on that sysfs file and why?
Does it need to be available to non-altix^H^H^H^H^HNUMA machines?
The code you have there is suspiciously similar to migrate_timers(). Do we
really need to duplicate it?
> Index: linux/kernel/timer.c
> ===================================================================
> --- linux.orig/kernel/timer.c
> +++ linux/kernel/timer.c
> @@ -147,6 +147,7 @@ void fastcall init_timer(struct timer_li
> {
> timer->entry.next = NULL;
> timer->base = __raw_get_cpu_var(tvec_bases);
> + timer->aff = 0;
As Jes mentioned: `aff' isn't a very clear identifier. Maybe is_bound_to_cpu?
> }
> EXPORT_SYMBOL(init_timer);
>
> @@ -250,6 +251,7 @@ void add_timer_on(struct timer_list *tim
> BUG_ON(timer_pending(timer) || !timer->function);
> spin_lock_irqsave(&base->lock, flags);
> timer->base = base;
> + timer->aff = 1; /* Don't migrate */
> internal_add_timer(base, timer);
> spin_unlock_irqrestore(&base->lock, flags);
> }
> @@ -1661,6 +1663,52 @@ static void __devinit migrate_timers(int
> }
> #endif /* CONFIG_HOTPLUG_CPU */
>
> +static void move_timer_list(tvec_base_t *new_base, struct list_head *head)
> +{
> + struct timer_list *timer, *t;
> +
> + list_for_each_entry_safe(timer, t, head, entry) {
> + if (timer->aff)
> + continue;
> + detach_timer(timer, 0);
> + timer->base = new_base;
> + internal_add_timer(new_base, timer);
> + }
> +}
> +
> +int move_timers(int cpu, int dest)
> +{
Again, unfortunate naming. What's 'dest'? Better would be `source_cpu'
and `dest_cpu', no?
> + tvec_base_t *old_base;
> + tvec_base_t *new_base;
> + unsigned long flags;
> + int i;
> +
> + if (cpu == dest)
> + return -EINVAL;
> +
> + if (!cpu_online(cpu) || !cpu_online(dest))
> + return -EINVAL;
Racy against CPU hotplug. Wrapping it all in preempt_disable() (with a
comment explaining why) would suffice.
> + old_base = per_cpu(tvec_bases, cpu);
> + new_base = per_cpu(tvec_bases, dest);
> + spin_lock_irqsave(&new_base->lock, flags);
> + spin_lock(&old_base->lock);
If one CPU does move_timers(0, 1) and another CPU does move_timers(1, 0) at
the same time, we have an AB/BA deadlock, don't we?
If so, fixes would include:
a) always take the lower-addressed-lock first or
b) wrap the whole operation inside a single global lock.
Either way, lockdep is likely to get upset about this and special
annotations and cursing might be needed.
> +
> + for (i = 0; i < TVR_SIZE; i++)
> + move_timer_list(new_base, old_base->tv1.vec + i);
> + for (i = 0; i < TVN_SIZE; i++) {
> + move_timer_list(new_base, old_base->tv2.vec + i);
> + move_timer_list(new_base, old_base->tv3.vec + i);
> + move_timer_list(new_base, old_base->tv4.vec + i);
> + move_timer_list(new_base, old_base->tv5.vec + i);
> + }
> +
> + spin_unlock(&old_base->lock);
> + spin_unlock_irqrestore(&new_base->lock, flags);
> + return 0;
> +}
> +
> static int __cpuinit timer_cpu_notify(struct notifier_block *self,
> unsigned long action, void *hcpu)
> {
> Index: linux/drivers/base/cpu.c
> ===================================================================
> --- linux.orig/drivers/base/cpu.c
> +++ linux/drivers/base/cpu.c
> @@ -54,6 +54,26 @@ static ssize_t store_online(struct sys_d
> }
> static SYSDEV_ATTR(online, 0600, show_online, store_online);
>
> +static ssize_t store_migrate(struct sys_device *dev, const char *buf,
> + size_t count)
A better name would be `store_timer_migrate'.
> +{
> + unsigned int cpu = dev->id;
> + unsigned long dest;
> + int rc;
> +
> + dest = simple_strtoul(buf, NULL, 10);
> + if (dest > INT_MAX)
> + return -EINVAL;
> +
> + rc = move_timers(cpu, dest);
> + if (rc < 0)
> + return rc;
> + else
> + return count;
> +}
> +
> +static SYSDEV_ATTR(timer_migrate, 0200, NULL, store_migrate);
I'm supposed to point you at Documentation/ABI/, sorry.
> ===================================================================
> --- linux.orig/include/linux/timer.h
> +++ linux/include/linux/timer.h
> @@ -15,6 +15,8 @@ struct timer_list {
> unsigned long data;
>
> struct tvec_t_base_s *base;
> +
> + short aff;
> };
>
> extern struct tvec_t_base_s boot_tvec_bases;
> @@ -24,6 +26,7 @@ extern struct tvec_t_base_s boot_tvec_ba
> .expires = (_expires), \
> .data = (_data), \
> .base = &boot_tvec_bases, \
> + .aff = 0, \
It's not really needed if it's zero.
> }
>
> #define DEFINE_TIMER(_name, _function, _expires, _data) \
> @@ -95,6 +98,7 @@ static inline void add_timer(struct time
>
> extern void init_timers(void);
> extern void run_local_timers(void);
> +extern int move_timers(int, int);
I think it's nice to include the identifiers here (source_cpu and
dest_cpu), as a little bit of documentation.
(And I think it's more idiomatic to put the destination arg first, although
we violate that in rather a lot of places).
On Thu, 2006-09-14 at 08:29 -0500, Dimitri Sivanich wrote:
> This patch allows the user to migrate currently queued
> standard timers from one cpu to another, thereby reducing
> timer induced latency on the chosen cpu. Timers that
> were placed with add_timer_on() are considered to have
> 'cpu affinity' and are not moved.
>
> The changes in drivers/base/cpu.c provide a clean and
> convenient interface for triggering the migration through
> sysfs, via writing the destination cpu number to a file
> associated with the source cpu.
>
> Note that migrating timers will not, by itself, keep new
> timers off of the chosen cpu. But with careful control of
> thread affinity, one can control the affinity of new timers
> and keep timer induced latencies off of the chosen cpu.
>
> This particular patch does not affect the hrtimers. That
> could be addressed later.
Are you trying to work around the latencies caused by long running timer
callbacks ? I'm not convinced that this is not curing the symptoms
instead of the root cause.
tglx
On Fri, Sep 15, 2006 at 06:39:19PM +0200, Thomas Gleixner wrote:
> Are you trying to work around the latencies caused by long running timer
> callbacks ? I'm not convinced that this is not curing the symptoms
> instead of the root cause.
Yes, both latency from long running timer callbacks as well as
potential latency from a temporal grouping of timer callbacks
(those occuring on the same tick).
While I agree that root causes of the former should be addressed,
more latencies of this type can always easily creep in. Timer
migration works as a long term preventative aid, not just a fix
for the problem of the moment. And adding this needn't restrict
anyone from looking at the aforementioned root causes.