2013-03-27 00:56:37

by Nathan Zimmer

[permalink] [raw]
Subject: [PATCH v5 0/2] timer_list: Fix /proc/timer_list failure on 4096 cpus

On systems with 4096 cores attemping to read /proc/timer_list
fails because we are trying to push all the data into a single
kmalloc buffer.

A better solution is to not us the single_open mechanism but to
provide our own seq_operations and treat each cpu as an
individual record.

The output should be identical to the previous version.

v2: Added comments on the iteration and other fixups pointed to by Andrew.
v3: Corrected the case where max_cpus != nr_cpu_ids by exiting early.
v5: Use seq_open_private and supply a proper iterator rather then a big mess.

Nathan Zimmer (2):
timer_list: split timer_list_show_tickdevices
timer_list: convert timer list to be a proper seq_file

kernel/time/timer_list.c | 104 +++++++++++++++++++++++++++++++++++++----------
1 file changed, 83 insertions(+), 21 deletions(-)

--
1.8.1.2


2013-03-27 00:56:44

by Nathan Zimmer

[permalink] [raw]
Subject: [PATCH 2/2] timer_list: convert timer list to be a proper seq_file

When running with 4096 cores attemping to read /proc/timer_list will fail
with an ENOMEM condition. On a sufficantly large systems the total amount
of data is more then 4mb, so it won't fit into a single buffer. The
failure can also occur on smaller systems when memory fragmentation is
high as reported by Dave Jones.

Convert /proc/timer_list to a proper seq_file with its own iterator. This
is a little more complex given that we have to make two passes with two
separate headers.

sysrq_timer_list_show also needed to be updated to reflect the fact that
now timer_list_show only does one cpu at at time.

Signed-off-by: Nathan Zimmer <[email protected]>
Reported-by: Dave Jones <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Stephen Boyd <[email protected]>

v2: Added comments on the iteration and other fixups pointed to by Andrew.
v3: Corrected the case where max_cpus != nr_cpu_ids by exiting early.
v5: Use seq_open_private and supply a proper iterator rather then a big mess.
---
kernel/time/timer_list.c | 89 +++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 77 insertions(+), 12 deletions(-)

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 380a589..3bdf283 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@

#include <asm/uaccess.h>

+
+struct timer_list_iter {
+ int cpu;
+ bool second_pass;
+ u64 now;
+};
+
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);

DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -247,43 +254,101 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
}
#endif

-static int timer_list_show(struct seq_file *m, void *v)
+static inline void timer_list_header(struct seq_file *m, u64 now)
{
- u64 now = ktime_to_ns(ktime_get());
- int cpu;
-
SEQ_printf(m, "Timer List Version: v0.7\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
+}
+
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ struct timer_list_iter *iter = v;
+ u64 now = ktime_to_ns(ktime_get());
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ timer_list_header(NULL, now);

for_each_online_cpu(cpu)
- print_cpu(m, cpu, now);
+ print_cpu(NULL, cpu, now);

#ifdef CONFIG_GENERIC_CLOCKEVENTS
- timer_list_show_tickdevices_header(m);
+ timer_list_show_tickdevices_header(NULL);
for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
+ print_tickdevice(NULL, tick_get_device(cpu), cpu);
#endif
+ return;
+}

- return 0;
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset) {
+ iter->cpu = -1;
+ iter->now = ktime_to_ns(ktime_get());
+ } else if (iter->cpu >= nr_cpu_ids) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
+#else
+ return NULL;
+#endif
+ }
+ return iter;
}

-void sysrq_timer_list_show(void)
+static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
{
- timer_list_show(NULL, NULL);
+ struct timer_list_iter *iter = file->private;
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ ++*offset;
+ return timer_list_start(file, offset);
}

+static void timer_list_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
static int timer_list_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, timer_list_show, NULL);
+ return seq_open_private(filp, &timer_list_sops,
+ sizeof(struct timer_list_iter));
}

static const struct file_operations timer_list_fops = {
.open = timer_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release_private,
};

static int __init init_timer_list_procfs(void)
--
1.8.1.2

2013-03-27 00:57:01

by Nathan Zimmer

[permalink] [raw]
Subject: [PATCH 1/2] timer_list: split timer_list_show_tickdevices

Split timer_list_show_tickdevices() out the header and just pull the rest up
to timer_list_show. Also tweak the location of the whitespace. This is all
to prep for the fix.

Signed-off-by: Nathan Zimmer <[email protected]>
Reported-by: Dave Jones <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Stephen Boyd <[email protected]>

v4: correct extra whitespace
---
kernel/time/timer_list.c | 21 +++++++++------------
1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9..380a589 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;

- SEQ_printf(m, "\n");
SEQ_printf(m, "cpu: %d\n", cpu);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)

#undef P
#undef P_ns
+ SEQ_printf(m, "\n");
}

#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;

- SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
}

-static void timer_list_show_tickdevices(struct seq_file *m)
+static void timer_list_show_tickdevices_header(struct seq_file *m)
{
- int cpu;
-
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,12 +244,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
#endif
SEQ_printf(m, "\n");
#endif
- for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
- SEQ_printf(m, "\n");
}
-#else
-static void timer_list_show_tickdevices(struct seq_file *m) { }
#endif

static int timer_list_show(struct seq_file *m, void *v)
@@ -262,12 +255,16 @@ static int timer_list_show(struct seq_file *m, void *v)
SEQ_printf(m, "Timer List Version: v0.7\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");

for_each_online_cpu(cpu)
print_cpu(m, cpu, now);

- SEQ_printf(m, "\n");
- timer_list_show_tickdevices(m);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ timer_list_show_tickdevices_header(m);
+ for_each_online_cpu(cpu)
+ print_tickdevice(m, tick_get_device(cpu), cpu);
+#endif

return 0;
}
--
1.8.1.2

2013-03-27 19:14:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/2] timer_list: convert timer list to be a proper seq_file

On Tue, 26 Mar 2013 19:56:30 -0500 Nathan Zimmer <[email protected]> wrote:

> When running with 4096 cores attemping to read /proc/timer_list will fail
> with an ENOMEM condition. On a sufficantly large systems the total amount
> of data is more then 4mb, so it won't fit into a single buffer. The
> failure can also occur on smaller systems when memory fragmentation is
> high as reported by Dave Jones.
>
> Convert /proc/timer_list to a proper seq_file with its own iterator. This
> is a little more complex given that we have to make two passes with two
> separate headers.
>
> sysrq_timer_list_show also needed to be updated to reflect the fact that
> now timer_list_show only does one cpu at at time.
>
> ...
>
> --- a/kernel/time/timer_list.c
> +++ b/kernel/time/timer_list.c
> @@ -20,6 +20,13 @@
>
> #include <asm/uaccess.h>
>
> +
> +struct timer_list_iter {
> + int cpu;
> + bool second_pass;
> + u64 now;
> +};
> +
> typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
>
> DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
> @@ -247,43 +254,101 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
> }
> #endif
>
> -static int timer_list_show(struct seq_file *m, void *v)
> +static inline void timer_list_header(struct seq_file *m, u64 now)

There's really no point in the explicit inline directive - modern gcc's
basically ignore it anyway.

> {
> - u64 now = ktime_to_ns(ktime_get());
> - int cpu;
> -
> SEQ_printf(m, "Timer List Version: v0.7\n");
> SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
> SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
> SEQ_printf(m, "\n");
> +}
>
> ...
>
> +void sysrq_timer_list_show(void)
> +{
> + u64 now = ktime_to_ns(ktime_get());
> + int cpu;
> +
> + timer_list_header(NULL, now);
>
> for_each_online_cpu(cpu)

hm, it seems this code is taking the optimistic approach to CPU hotplug.

> - print_cpu(m, cpu, now);
> + print_cpu(NULL, cpu, now);

2013-03-27 20:05:46

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH 2/2] timer_list: convert timer list to be a proper seq_file

On Wed, 27 Mar 2013, Andrew Morton wrote:
> On Tue, 26 Mar 2013 19:56:30 -0500 Nathan Zimmer <[email protected]> wrote:
> > +void sysrq_timer_list_show(void)
> > +{
> > + u64 now = ktime_to_ns(ktime_get());
> > + int cpu;
> > +
> > + timer_list_header(NULL, now);
> >
> > for_each_online_cpu(cpu)
>
> hm, it seems this code is taking the optimistic approach to CPU hotplug.

Right. It's not really relevant. The data structures are not going
away and this can cope with the fact that the cpu went away under us.

Thanks,

tglx

Subject: [tip:timers/core] timer_list: Split timer_list_show_tickdevices

Commit-ID: 60cf7ea849e77c8782dee147cfb8c38d1984236e
Gitweb: http://git.kernel.org/tip/60cf7ea849e77c8782dee147cfb8c38d1984236e
Author: Nathan Zimmer <[email protected]>
AuthorDate: Tue, 26 Mar 2013 19:56:29 -0500
Committer: Thomas Gleixner <[email protected]>
CommitDate: Wed, 17 Apr 2013 20:51:02 +0200

timer_list: Split timer_list_show_tickdevices

Split timer_list_show_tickdevices() into the header printout and pull
the rest up to timer_list_show. This is a preparatory patch for
converting timer_list to a proper seqfile with its own iterator

Signed-off-by: Nathan Zimmer <[email protected]>
Reported-by: Dave Jones <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Stephen Boyd <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/time/timer_list.c | 21 +++++++++------------
1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9..380a589 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -133,7 +133,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;

- SEQ_printf(m, "\n");
SEQ_printf(m, "cpu: %d\n", cpu);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +186,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)

#undef P
#undef P_ns
+ SEQ_printf(m, "\n");
}

#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +195,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;

- SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +229,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
}

-static void timer_list_show_tickdevices(struct seq_file *m)
+static void timer_list_show_tickdevices_header(struct seq_file *m)
{
- int cpu;
-
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,12 +244,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
#endif
SEQ_printf(m, "\n");
#endif
- for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
- SEQ_printf(m, "\n");
}
-#else
-static void timer_list_show_tickdevices(struct seq_file *m) { }
#endif

static int timer_list_show(struct seq_file *m, void *v)
@@ -262,12 +255,16 @@ static int timer_list_show(struct seq_file *m, void *v)
SEQ_printf(m, "Timer List Version: v0.7\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");

for_each_online_cpu(cpu)
print_cpu(m, cpu, now);

- SEQ_printf(m, "\n");
- timer_list_show_tickdevices(m);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ timer_list_show_tickdevices_header(m);
+ for_each_online_cpu(cpu)
+ print_tickdevice(m, tick_get_device(cpu), cpu);
+#endif

return 0;
}

Subject: [tip:timers/core] timer_list: Convert timer list to be a proper seq_file

Commit-ID: b3956a896ea57f25cacd74708b8fab611543a81d
Gitweb: http://git.kernel.org/tip/b3956a896ea57f25cacd74708b8fab611543a81d
Author: Nathan Zimmer <[email protected]>
AuthorDate: Tue, 26 Mar 2013 19:56:30 -0500
Committer: Thomas Gleixner <[email protected]>
CommitDate: Wed, 17 Apr 2013 20:51:02 +0200

timer_list: Convert timer list to be a proper seq_file

When running with 4096 cores attemping to read /proc/timer_list will fail
with an ENOMEM condition. On a sufficantly large systems the total amount
of data is more then 4mb, so it won't fit into a single buffer. The
failure can also occur on smaller systems when memory fragmentation is
high as reported by Dave Jones.

Convert /proc/timer_list to a proper seq_file with its own iterator. This
is a little more complex given that we have to make two passes with two
separate headers.

sysrq_timer_list_show also needed to be updated to reflect the fact that
now timer_list_show only does one cpu at at time.

Signed-off-by: Nathan Zimmer <[email protected]>
Reported-by: Dave Jones <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Stephen Boyd <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
---
kernel/time/timer_list.c | 89 +++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 77 insertions(+), 12 deletions(-)

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 380a589..3bdf283 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@

#include <asm/uaccess.h>

+
+struct timer_list_iter {
+ int cpu;
+ bool second_pass;
+ u64 now;
+};
+
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);

DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -247,43 +254,101 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
}
#endif

-static int timer_list_show(struct seq_file *m, void *v)
+static inline void timer_list_header(struct seq_file *m, u64 now)
{
- u64 now = ktime_to_ns(ktime_get());
- int cpu;
-
SEQ_printf(m, "Timer List Version: v0.7\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
SEQ_printf(m, "\n");
+}
+
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ struct timer_list_iter *iter = v;
+ u64 now = ktime_to_ns(ktime_get());
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
+ u64 now = ktime_to_ns(ktime_get());
+ int cpu;
+
+ timer_list_header(NULL, now);

for_each_online_cpu(cpu)
- print_cpu(m, cpu, now);
+ print_cpu(NULL, cpu, now);

#ifdef CONFIG_GENERIC_CLOCKEVENTS
- timer_list_show_tickdevices_header(m);
+ timer_list_show_tickdevices_header(NULL);
for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
+ print_tickdevice(NULL, tick_get_device(cpu), cpu);
#endif
+ return;
+}

- return 0;
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset) {
+ iter->cpu = -1;
+ iter->now = ktime_to_ns(ktime_get());
+ } else if (iter->cpu >= nr_cpu_ids) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
+#else
+ return NULL;
+#endif
+ }
+ return iter;
}

-void sysrq_timer_list_show(void)
+static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
{
- timer_list_show(NULL, NULL);
+ struct timer_list_iter *iter = file->private;
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ ++*offset;
+ return timer_list_start(file, offset);
}

+static void timer_list_stop(struct seq_file *seq, void *v)
+{
+}
+
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
static int timer_list_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, timer_list_show, NULL);
+ return seq_open_private(filp, &timer_list_sops,
+ sizeof(struct timer_list_iter));
}

static const struct file_operations timer_list_fops = {
.open = timer_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release_private,
};

static int __init init_timer_list_procfs(void)