2018-11-19 11:30:28

by Alexey Brodkin

[permalink] [raw]
Subject: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

It turned out we were using the default implementation of sched_clock()
from kernel/sched/clock.c, which is only as precise as 1/HZ, i.e.
by default we had 10 msec granularity of time measurement.

Given that ARC built-in timers are clocked at the same frequency as
the CPU cores, we can get much higher precision of time tracking.

Thus we switch to generic sched_clock which really reads ARC hardware
counters.

This is especially helpful for measuring short events.
That's what we used to have:
------------------------------>8------------------------
$ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null

Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':

10.000000 task-clock (msec) # 2.832 CPUs utilized
1 context-switches # 0.100 K/sec
1 cpu-migrations # 0.100 K/sec
63 page-faults # 0.006 M/sec
3049480 cycles # 0.305 GHz
1091259 instructions # 0.36 insn per cycle
256828 branches # 25.683 M/sec
27026 branch-misses # 10.52% of all branches

0.003530687 seconds time elapsed

0.000000000 seconds user
0.010000000 seconds sys
------------------------------>8------------------------

And now we'll see:
------------------------------>8------------------------
$ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null

Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':

3.004322 task-clock (msec) # 0.865 CPUs utilized
1 context-switches # 0.333 K/sec
1 cpu-migrations # 0.333 K/sec
63 page-faults # 0.021 M/sec
2986734 cycles # 0.994 GHz
1087466 instructions # 0.36 insn per cycle
255209 branches # 84.947 M/sec
26002 branch-misses # 10.19% of all branches

0.003474829 seconds time elapsed

0.003519000 seconds user
0.000000000 seconds sys
------------------------------>8------------------------

Note how much more meaningful the second output is - the time spent on
execution pretty much matches the number of cycles spent (we're running
@ 1GHz here).

Signed-off-by: Alexey Brodkin <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Vineet Gupta <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---

Changes v2 -> v3:
* Rebased on top of v4.20-rc3
* Moved GENERIC_SCHED_CLOCK to alphabetically-sorted location in
arch/arc/Kconfig

Changes v1 -> v2:
* Timer read callbacks marked as "notrace"
* ARC Timer1 explicitly described as 32-bit one on
sched_clock_register() invocation

arch/arc/Kconfig | 1 +
drivers/clocksource/Kconfig | 1 +
drivers/clocksource/arc_timer.c | 22 ++++++++++++++++++++++
3 files changed, 24 insertions(+)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index c9e2a1323536..74b5a654f664 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -26,6 +26,7 @@ config ARC
select GENERIC_IRQ_SHOW
select GENERIC_PCI_IOMAP
select GENERIC_PENDING_IRQ if SMP
+ select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 55c77e44bb2d..d9c8a779dd7d 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -290,6 +290,7 @@ config CLKSRC_MPS2

config ARC_TIMERS
bool "Support for 32-bit TIMERn counters in ARC Cores" if COMPILE_TEST
+ depends on GENERIC_SCHED_CLOCK
select TIMER_OF
help
These are legacy 32-bit TIMER0 and TIMER1 counters found on all ARC cores
diff --git a/drivers/clocksource/arc_timer.c b/drivers/clocksource/arc_timer.c
index 20da9b1d7f7d..b28970ca4a7a 100644
--- a/drivers/clocksource/arc_timer.c
+++ b/drivers/clocksource/arc_timer.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/of.h>
#include <linux/of_irq.h>
+#include <linux/sched_clock.h>

#include <soc/arc/timers.h>
#include <soc/arc/mcip.h>
@@ -88,6 +89,11 @@ static u64 arc_read_gfrc(struct clocksource *cs)
return (((u64)h) << 32) | l;
}

+static notrace u64 arc_gfrc_clock_read(void)
+{
+ return arc_read_gfrc(NULL);
+}
+
static struct clocksource arc_counter_gfrc = {
.name = "ARConnect GFRC",
.rating = 400,
@@ -111,6 +117,8 @@ static int __init arc_cs_setup_gfrc(struct device_node *node)
if (ret)
return ret;

+ sched_clock_register(arc_gfrc_clock_read, 64, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_gfrc, arc_timer_freq);
}
TIMER_OF_DECLARE(arc_gfrc, "snps,archs-timer-gfrc", arc_cs_setup_gfrc);
@@ -139,6 +147,11 @@ static u64 arc_read_rtc(struct clocksource *cs)
return (((u64)h) << 32) | l;
}

+static notrace u64 arc_rtc_clock_read(void)
+{
+ return arc_read_rtc(NULL);
+}
+
static struct clocksource arc_counter_rtc = {
.name = "ARCv2 RTC",
.rating = 350,
@@ -170,6 +183,8 @@ static int __init arc_cs_setup_rtc(struct device_node *node)

write_aux_reg(AUX_RTC_CTRL, 1);

+ sched_clock_register(arc_rtc_clock_read, 64, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_rtc, arc_timer_freq);
}
TIMER_OF_DECLARE(arc_rtc, "snps,archs-timer-rtc", arc_cs_setup_rtc);
@@ -185,6 +200,11 @@ static u64 arc_read_timer1(struct clocksource *cs)
return (u64) read_aux_reg(ARC_REG_TIMER1_CNT);
}

+static notrace u64 arc_timer1_clock_read(void)
+{
+ return arc_read_timer1(NULL);
+}
+
static struct clocksource arc_counter_timer1 = {
.name = "ARC Timer1",
.rating = 300,
@@ -209,6 +229,8 @@ static int __init arc_cs_setup_timer1(struct device_node *node)
write_aux_reg(ARC_REG_TIMER1_CNT, 0);
write_aux_reg(ARC_REG_TIMER1_CTRL, TIMER_CTRL_NH);

+ sched_clock_register(arc_timer1_clock_read, 32, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_timer1, arc_timer_freq);
}

--
2.19.1



2018-11-19 12:01:46

by Daniel Lezcano

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

On 19/11/2018 12:29, Alexey Brodkin wrote:

[ ... ]

> arch/arc/Kconfig | 1 +
> drivers/clocksource/Kconfig | 1 +
> drivers/clocksource/arc_timer.c | 22 ++++++++++++++++++++++
> 3 files changed, 24 insertions(+)
>
> diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig

Can I have an Ack for the arch part ?

Thanks
-- Daniel

--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog


2018-11-19 17:35:13

by Vineet Gupta

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

On 11/19/18 3:30 AM, Alexey Brodkin wrote:
> It turned out we used to use default implementation of sched_clock()
> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> by default we had 10 msec granularity of time measurement.
>
> Now given ARC built-in timers are clocked with the same frequency as
> CPU cores we may get much higher precision of time tracking.
>
> Thus we switch to generic sched_clock which really reads ARC hardware
> counters.
>
> This is especially helpful for measuring short events.
> That's what we used to have:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 10.000000 task-clock (msec) # 2.832 CPUs utilized
> 1 context-switches # 0.100 K/sec
> 1 cpu-migrations # 0.100 K/sec
> 63 page-faults # 0.006 M/sec
> 3049480 cycles # 0.305 GHz
> 1091259 instructions # 0.36 insn per cycle
> 256828 branches # 25.683 M/sec
> 27026 branch-misses # 10.52% of all branches
>
> 0.003530687 seconds time elapsed
>
> 0.000000000 seconds user
> 0.010000000 seconds sys
> ------------------------------>8------------------------
>
> And now we'll see:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 3.004322 task-clock (msec) # 0.865 CPUs utilized
> 1 context-switches # 0.333 K/sec
> 1 cpu-migrations # 0.333 K/sec
> 63 page-faults # 0.021 M/sec
> 2986734 cycles # 0.994 GHz
> 1087466 instructions # 0.36 insn per cycle
> 255209 branches # 84.947 M/sec
> 26002 branch-misses # 10.19% of all branches
>
> 0.003474829 seconds time elapsed
>
> 0.003519000 seconds user
> 0.000000000 seconds sys
> ------------------------------>8------------------------
>
> Note how much more meaningful is the second output - time spent for
> execution pretty much matches number of cycles spent (we're runnign
> @ 1GHz here).
>
> Signed-off-by: Alexey Brodkin <[email protected]>
> Cc: Daniel Lezcano <[email protected]>
> Cc: Vineet Gupta <[email protected]>
> Cc: Thomas Gleixner <[email protected]>

Acked-by: Vineet Gupta <[email protected]>

Thx,
-Vineet

2018-11-19 21:51:13

by Daniel Lezcano

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

On 19/11/2018 12:29, Alexey Brodkin wrote:
> It turned out we used to use default implementation of sched_clock()
> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> by default we had 10 msec granularity of time measurement.
>
> Now given ARC built-in timers are clocked with the same frequency as
> CPU cores we may get much higher precision of time tracking.
>
> Thus we switch to generic sched_clock which really reads ARC hardware
> counters.
>
> This is especially helpful for measuring short events.
> That's what we used to have:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 10.000000 task-clock (msec) # 2.832 CPUs utilized
> 1 context-switches # 0.100 K/sec
> 1 cpu-migrations # 0.100 K/sec
> 63 page-faults # 0.006 M/sec
> 3049480 cycles # 0.305 GHz
> 1091259 instructions # 0.36 insn per cycle
> 256828 branches # 25.683 M/sec
> 27026 branch-misses # 10.52% of all branches
>
> 0.003530687 seconds time elapsed
>
> 0.000000000 seconds user
> 0.010000000 seconds sys
> ------------------------------>8------------------------
>
> And now we'll see:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 3.004322 task-clock (msec) # 0.865 CPUs utilized
> 1 context-switches # 0.333 K/sec
> 1 cpu-migrations # 0.333 K/sec
> 63 page-faults # 0.021 M/sec
> 2986734 cycles # 0.994 GHz
> 1087466 instructions # 0.36 insn per cycle
> 255209 branches # 84.947 M/sec
> 26002 branch-misses # 10.19% of all branches
>
> 0.003474829 seconds time elapsed
>
> 0.003519000 seconds user
> 0.000000000 seconds sys
> ------------------------------>8------------------------
>
> Note how much more meaningful is the second output - time spent for
> execution pretty much matches number of cycles spent (we're runnign
> @ 1GHz here).
>
> Signed-off-by: Alexey Brodkin <[email protected]>
> Cc: Daniel Lezcano <[email protected]>
> Cc: Vineet Gupta <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> ---

Applied, thanks.


--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog


2018-11-19 21:54:05

by Alexey Brodkin

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

Hi Daniel,

On Mon, 2018-11-19 at 22:50 +0100, Daniel Lezcano wrote:
> On 19/11/2018 12:29, Alexey Brodkin wrote:
> > It turned out we used to use default implementation of sched_clock()
> > from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> > by default we had 10 msec granularity of time measurement.
> >
> > Now given ARC built-in timers are clocked with the same frequency as
> > CPU cores we may get much higher precision of time tracking.
> >
> > Thus we switch to generic sched_clock which really reads ARC hardware
> > counters.
> >
> > This is especially helpful for measuring short events.
> > That's what we used to have:
> > ------------------------------>8------------------------
> > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> >
> > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> >
> > 10.000000 task-clock (msec) # 2.832 CPUs utilized
> > 1 context-switches # 0.100 K/sec
> > 1 cpu-migrations # 0.100 K/sec
> > 63 page-faults # 0.006 M/sec
> > 3049480 cycles # 0.305 GHz
> > 1091259 instructions # 0.36 insn per cycle
> > 256828 branches # 25.683 M/sec
> > 27026 branch-misses # 10.52% of all branches
> >
> > 0.003530687 seconds time elapsed
> >
> > 0.000000000 seconds user
> > 0.010000000 seconds sys
> > ------------------------------>8------------------------
> >
> > And now we'll see:
> > ------------------------------>8------------------------
> > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> >
> > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> >
> > 3.004322 task-clock (msec) # 0.865 CPUs utilized
> > 1 context-switches # 0.333 K/sec
> > 1 cpu-migrations # 0.333 K/sec
> > 63 page-faults # 0.021 M/sec
> > 2986734 cycles # 0.994 GHz
> > 1087466 instructions # 0.36 insn per cycle
> > 255209 branches # 84.947 M/sec
> > 26002 branch-misses # 10.19% of all branches
> >
> > 0.003474829 seconds time elapsed
> >
> > 0.003519000 seconds user
> > 0.000000000 seconds sys
> > ------------------------------>8------------------------
> >
> > Note how much more meaningful is the second output - time spent for
> > execution pretty much matches number of cycles spent (we're runnign
> > @ 1GHz here).
> >
> > Signed-off-by: Alexey Brodkin <[email protected]>
> > Cc: Daniel Lezcano <[email protected]>
> > Cc: Vineet Gupta <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > ---
>
> Applied, thanks.

Maybe a bit too late but I completely forgot to add stable tag into Cc list.
Any chance to add it still?

-Alexey

2018-11-19 22:10:59

by Daniel Lezcano

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

On 19/11/2018 22:53, Alexey Brodkin wrote:
> Hi Daniel,
>
> On Mon, 2018-11-19 at 22:50 +0100, Daniel Lezcano wrote:
>> On 19/11/2018 12:29, Alexey Brodkin wrote:
>>> It turned out we used to use default implementation of sched_clock()
>>> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
>>> by default we had 10 msec granularity of time measurement.
>>>
>>> Now given ARC built-in timers are clocked with the same frequency as
>>> CPU cores we may get much higher precision of time tracking.
>>>
>>> Thus we switch to generic sched_clock which really reads ARC hardware
>>> counters.
>>>
>>> This is especially helpful for measuring short events.
>>> That's what we used to have:
>>> ------------------------------>8------------------------
>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>
>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>
>>> 10.000000 task-clock (msec) # 2.832 CPUs utilized
>>> 1 context-switches # 0.100 K/sec
>>> 1 cpu-migrations # 0.100 K/sec
>>> 63 page-faults # 0.006 M/sec
>>> 3049480 cycles # 0.305 GHz
>>> 1091259 instructions # 0.36 insn per cycle
>>> 256828 branches # 25.683 M/sec
>>> 27026 branch-misses # 10.52% of all branches
>>>
>>> 0.003530687 seconds time elapsed
>>>
>>> 0.000000000 seconds user
>>> 0.010000000 seconds sys
>>> ------------------------------>8------------------------
>>>
>>> And now we'll see:
>>> ------------------------------>8------------------------
>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>
>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>
>>> 3.004322 task-clock (msec) # 0.865 CPUs utilized
>>> 1 context-switches # 0.333 K/sec
>>> 1 cpu-migrations # 0.333 K/sec
>>> 63 page-faults # 0.021 M/sec
>>> 2986734 cycles # 0.994 GHz
>>> 1087466 instructions # 0.36 insn per cycle
>>> 255209 branches # 84.947 M/sec
>>> 26002 branch-misses # 10.19% of all branches
>>>
>>> 0.003474829 seconds time elapsed
>>>
>>> 0.003519000 seconds user
>>> 0.000000000 seconds sys
>>> ------------------------------>8------------------------
>>>
>>> Note how much more meaningful is the second output - time spent for
>>> execution pretty much matches number of cycles spent (we're runnign
>>> @ 1GHz here).
>>>
>>> Signed-off-by: Alexey Brodkin <[email protected]>
>>> Cc: Daniel Lezcano <[email protected]>
>>> Cc: Vineet Gupta <[email protected]>
>>> Cc: Thomas Gleixner <[email protected]>
>>> ---
>>
>> Applied, thanks.
>
> Maybe a bit too late but I completely forgot to add stable tag into Cc list.
> Any chance to add it still?

Yes, no problem, I've added it.



[1]
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/Documentation/process/stable-kernel-rules.rst?h=timers/core


--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog


2018-11-19 22:13:26

by Alexey Brodkin

[permalink] [raw]
Subject: Re: [PATCH v3] clocksource/drivers/arc_timer: Utilize generic sched_clock

On Mon, 2018-11-19 at 23:09 +0100, Daniel Lezcano wrote:
> On 19/11/2018 22:53, Alexey Brodkin wrote:
> > Hi Daniel,
> >
> > On Mon, 2018-11-19 at 22:50 +0100, Daniel Lezcano wrote:
> > > On 19/11/2018 12:29, Alexey Brodkin wrote:
> > > > It turned out we used to use default implementation of sched_clock()
> > > > from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> > > > by default we had 10 msec granularity of time measurement.
> > > >
> > > > Now given ARC built-in timers are clocked with the same frequency as
> > > > CPU cores we may get much higher precision of time tracking.
> > > >
> > > > Thus we switch to generic sched_clock which really reads ARC hardware
> > > > counters.
> > > >
> > > > This is especially helpful for measuring short events.
> > > > That's what we used to have:
> > > > ------------------------------>8------------------------
> > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > >
> > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > >
> > > > 10.000000 task-clock (msec) # 2.832 CPUs utilized
> > > > 1 context-switches # 0.100 K/sec
> > > > 1 cpu-migrations # 0.100 K/sec
> > > > 63 page-faults # 0.006 M/sec
> > > > 3049480 cycles # 0.305 GHz
> > > > 1091259 instructions # 0.36 insn per cycle
> > > > 256828 branches # 25.683 M/sec
> > > > 27026 branch-misses # 10.52% of all branches
> > > >
> > > > 0.003530687 seconds time elapsed
> > > >
> > > > 0.000000000 seconds user
> > > > 0.010000000 seconds sys
> > > > ------------------------------>8------------------------
> > > >
> > > > And now we'll see:
> > > > ------------------------------>8------------------------
> > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > >
> > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > >
> > > > 3.004322 task-clock (msec) # 0.865 CPUs utilized
> > > > 1 context-switches # 0.333 K/sec
> > > > 1 cpu-migrations # 0.333 K/sec
> > > > 63 page-faults # 0.021 M/sec
> > > > 2986734 cycles # 0.994 GHz
> > > > 1087466 instructions # 0.36 insn per cycle
> > > > 255209 branches # 84.947 M/sec
> > > > 26002 branch-misses # 10.19% of all branches
> > > >
> > > > 0.003474829 seconds time elapsed
> > > >
> > > > 0.003519000 seconds user
> > > > 0.000000000 seconds sys
> > > > ------------------------------>8------------------------
> > > >
> > > > Note how much more meaningful is the second output - time spent for
> > > > execution pretty much matches number of cycles spent (we're runnign
> > > > @ 1GHz here).
> > > >
> > > > Signed-off-by: Alexey Brodkin <[email protected]>
> > > > Cc: Daniel Lezcano <[email protected]>
> > > > Cc: Vineet Gupta <[email protected]>
> > > > Cc: Thomas Gleixner <[email protected]>
> > > > ---
> > >
> > > Applied, thanks.
> >
> > Maybe a bit too late but I completely forgot to add stable tag into Cc list.
> > Any chance to add it still?
>
> Yes, no problem, I've added it.

Cool, thanks!

-Alexey