It turned out we used to use the default implementation of sched_clock()
from kernel/sched/clock.c, which was only as precise as 1/HZ, i.e.
by default we had 10 msec granularity of time measurement.
Now given ARC built-in timers are clocked with the same frequency as
CPU cores we may get much higher precision of time tracking.
Thus we switch to generic sched_clock which really reads ARC hardware
counters.
This is especially helpful for measuring short events.
That's what we used to have:
------------------------------>8------------------------
$ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
10.000000 task-clock (msec) # 2.832 CPUs utilized
1 context-switches # 0.100 K/sec
1 cpu-migrations # 0.100 K/sec
63 page-faults # 0.006 M/sec
3049480 cycles # 0.305 GHz
1091259 instructions # 0.36 insn per cycle
256828 branches # 25.683 M/sec
27026 branch-misses # 10.52% of all branches
0.003530687 seconds time elapsed
0.000000000 seconds user
0.010000000 seconds sys
------------------------------>8------------------------
And now we'll see:
------------------------------>8------------------------
$ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
3.004322 task-clock (msec) # 0.865 CPUs utilized
1 context-switches # 0.333 K/sec
1 cpu-migrations # 0.333 K/sec
63 page-faults # 0.021 M/sec
2986734 cycles # 0.994 GHz
1087466 instructions # 0.36 insn per cycle
255209 branches # 84.947 M/sec
26002 branch-misses # 10.19% of all branches
0.003474829 seconds time elapsed
0.003519000 seconds user
0.000000000 seconds sys
------------------------------>8------------------------
Note how much more meaningful the second output is - the time spent on
execution pretty much matches the number of cycles spent (we're running
@ 1GHz here).
Signed-off-by: Alexey Brodkin <[email protected]>
Cc: Daniel Lezcano <[email protected]>
Cc: Vineet Gupta <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
Changes v1 -> v2:
* Timer read callbacks marked as "notrace"
* ARC Timer1 explicitly described as 32-bit one on
sched_clock_register() invocation
arch/arc/Kconfig | 1 +
drivers/clocksource/Kconfig | 1 +
drivers/clocksource/arc_timer.c | 22 ++++++++++++++++++++++
3 files changed, 24 insertions(+)
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 5151d81476a1..714f769389a4 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -9,6 +9,7 @@
config ARC
def_bool y
select ARC_TIMERS
+ select GENERIC_SCHED_CLOCK
select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select ARCH_HAS_SG_CHAIN
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index dec0dd88ec15..3268dad4effe 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -290,6 +290,7 @@ config CLKSRC_MPS2
config ARC_TIMERS
bool "Support for 32-bit TIMERn counters in ARC Cores" if COMPILE_TEST
+ depends on GENERIC_SCHED_CLOCK
select TIMER_OF
help
These are legacy 32-bit TIMER0 and TIMER1 counters found on all ARC cores
diff --git a/drivers/clocksource/arc_timer.c b/drivers/clocksource/arc_timer.c
index 20da9b1d7f7d..b28970ca4a7a 100644
--- a/drivers/clocksource/arc_timer.c
+++ b/drivers/clocksource/arc_timer.c
@@ -23,6 +23,7 @@
#include <linux/cpu.h>
#include <linux/of.h>
#include <linux/of_irq.h>
+#include <linux/sched_clock.h>
#include <soc/arc/timers.h>
#include <soc/arc/mcip.h>
@@ -88,6 +89,11 @@ static u64 arc_read_gfrc(struct clocksource *cs)
return (((u64)h) << 32) | l;
}
+static notrace u64 arc_gfrc_clock_read(void)
+{
+ return arc_read_gfrc(NULL);
+}
+
static struct clocksource arc_counter_gfrc = {
.name = "ARConnect GFRC",
.rating = 400,
@@ -111,6 +117,8 @@ static int __init arc_cs_setup_gfrc(struct device_node *node)
if (ret)
return ret;
+ sched_clock_register(arc_gfrc_clock_read, 64, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_gfrc, arc_timer_freq);
}
TIMER_OF_DECLARE(arc_gfrc, "snps,archs-timer-gfrc", arc_cs_setup_gfrc);
@@ -139,6 +147,11 @@ static u64 arc_read_rtc(struct clocksource *cs)
return (((u64)h) << 32) | l;
}
+static notrace u64 arc_rtc_clock_read(void)
+{
+ return arc_read_rtc(NULL);
+}
+
static struct clocksource arc_counter_rtc = {
.name = "ARCv2 RTC",
.rating = 350,
@@ -170,6 +183,8 @@ static int __init arc_cs_setup_rtc(struct device_node *node)
write_aux_reg(AUX_RTC_CTRL, 1);
+ sched_clock_register(arc_rtc_clock_read, 64, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_rtc, arc_timer_freq);
}
TIMER_OF_DECLARE(arc_rtc, "snps,archs-timer-rtc", arc_cs_setup_rtc);
@@ -185,6 +200,11 @@ static u64 arc_read_timer1(struct clocksource *cs)
return (u64) read_aux_reg(ARC_REG_TIMER1_CNT);
}
+static notrace u64 arc_timer1_clock_read(void)
+{
+ return arc_read_timer1(NULL);
+}
+
static struct clocksource arc_counter_timer1 = {
.name = "ARC Timer1",
.rating = 300,
@@ -209,6 +229,8 @@ static int __init arc_cs_setup_timer1(struct device_node *node)
write_aux_reg(ARC_REG_TIMER1_CNT, 0);
write_aux_reg(ARC_REG_TIMER1_CTRL, TIMER_CTRL_NH);
+ sched_clock_register(arc_timer1_clock_read, 32, arc_timer_freq);
+
return clocksource_register_hz(&arc_counter_timer1, arc_timer_freq);
}
--
2.17.2
On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
> It turned out we used to use default implementation of sched_clock()
> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> by default we had 10 msec granularity of time measurement.
>
> Now given ARC built-in timers are clocked with the same frequency as
> CPU cores we may get much higher precision of time tracking.
>
> Thus we switch to generic sched_clock which really reads ARC hardware
> counters.
>
> This is especially helpful for measuring short events.
> That's what we used to have:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 10.000000 task-clock (msec) # 2.832 CPUs utilized
> 1 context-switches # 0.100 K/sec
> 1 cpu-migrations # 0.100 K/sec
> 63 page-faults # 0.006 M/sec
> 3049480 cycles # 0.305 GHz
> 1091259 instructions # 0.36 insn per cycle
> 256828 branches # 25.683 M/sec
> 27026 branch-misses # 10.52% of all branches
>
> 0.003530687 seconds time elapsed
>
> 0.000000000 seconds user
> 0.010000000 seconds sys
> ------------------------------>8------------------------
>
> And now we'll see:
> ------------------------------>8------------------------
> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>
> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>
> 3.004322 task-clock (msec) # 0.865 CPUs utilized
> 1 context-switches # 0.333 K/sec
> 1 cpu-migrations # 0.333 K/sec
> 63 page-faults # 0.021 M/sec
> 2986734 cycles # 0.994 GHz
> 1087466 instructions # 0.36 insn per cycle
> 255209 branches # 84.947 M/sec
> 26002 branch-misses # 10.19% of all branches
>
> 0.003474829 seconds time elapsed
>
> 0.003519000 seconds user
> 0.000000000 seconds sys
> ------------------------------>8------------------------
>
> Note how much more meaningful is the second output - time spent for
> execution pretty much matches number of cycles spent (we're running
> @ 1GHz here).
>
> Signed-off-by: Alexey Brodkin <[email protected]>
> Cc: Daniel Lezcano <[email protected]>
> Cc: Vineet Gupta <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> ---
Acked-by: Vineet Gupta <[email protected]>
@Daniel is this going via timer tree or you want me to pick it up.
Thx,
-Vineet
On 24/10/2018 00:33, Vineet Gupta wrote:
> On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
>> It turned out we used to use default implementation of sched_clock()
>> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
>> by default we had 10 msec granularity of time measurement.
>>
>> Now given ARC built-in timers are clocked with the same frequency as
>> CPU cores we may get much higher precision of time tracking.
>>
>> Thus we switch to generic sched_clock which really reads ARC hardware
>> counters.
>>
>> This is especially helpful for measuring short events.
>> That's what we used to have:
>> ------------------------------>8------------------------
>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>
>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>
>> 10.000000 task-clock (msec) # 2.832 CPUs utilized
>> 1 context-switches # 0.100 K/sec
>> 1 cpu-migrations # 0.100 K/sec
>> 63 page-faults # 0.006 M/sec
>> 3049480 cycles # 0.305 GHz
>> 1091259 instructions # 0.36 insn per cycle
>> 256828 branches # 25.683 M/sec
>> 27026 branch-misses # 10.52% of all branches
>>
>> 0.003530687 seconds time elapsed
>>
>> 0.000000000 seconds user
>> 0.010000000 seconds sys
>> ------------------------------>8------------------------
>>
>> And now we'll see:
>> ------------------------------>8------------------------
>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>
>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>
>> 3.004322 task-clock (msec) # 0.865 CPUs utilized
>> 1 context-switches # 0.333 K/sec
>> 1 cpu-migrations # 0.333 K/sec
>> 63 page-faults # 0.021 M/sec
>> 2986734 cycles # 0.994 GHz
>> 1087466 instructions # 0.36 insn per cycle
>> 255209 branches # 84.947 M/sec
>> 26002 branch-misses # 10.19% of all branches
>>
>> 0.003474829 seconds time elapsed
>>
>> 0.003519000 seconds user
>> 0.000000000 seconds sys
>> ------------------------------>8------------------------
>>
>> Note how much more meaningful is the second output - time spent for
>> execution pretty much matches number of cycles spent (we're running
>> @ 1GHz here).
>>
>> Signed-off-by: Alexey Brodkin <[email protected]>
>> Cc: Daniel Lezcano <[email protected]>
>> Cc: Vineet Gupta <[email protected]>
>> Cc: Thomas Gleixner <[email protected]>
>> ---
>
> Acked-by: Vineet Gupta <[email protected]>
>
> @Daniel is this going via timer tree or you want me to pick it up.
I will take care of it.
-- Daniel
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On 24/10/2018 00:33, Vineet Gupta wrote:
> On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
>> It turned out we used to use default implementation of sched_clock()
>> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
>> by default we had 10 msec granularity of time measurement.
>>
>> Now given ARC built-in timers are clocked with the same frequency as
>> CPU cores we may get much higher precision of time tracking.
>>
>> Thus we switch to generic sched_clock which really reads ARC hardware
>> counters.
>>
>> This is especially helpful for measuring short events.
>> That's what we used to have:
>> ------------------------------>8------------------------
>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>
>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>
>> 10.000000 task-clock (msec) # 2.832 CPUs utilized
>> 1 context-switches # 0.100 K/sec
>> 1 cpu-migrations # 0.100 K/sec
>> 63 page-faults # 0.006 M/sec
>> 3049480 cycles # 0.305 GHz
>> 1091259 instructions # 0.36 insn per cycle
>> 256828 branches # 25.683 M/sec
>> 27026 branch-misses # 10.52% of all branches
>>
>> 0.003530687 seconds time elapsed
>>
>> 0.000000000 seconds user
>> 0.010000000 seconds sys
>> ------------------------------>8------------------------
>>
>> And now we'll see:
>> ------------------------------>8------------------------
>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>
>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>
>> 3.004322 task-clock (msec) # 0.865 CPUs utilized
>> 1 context-switches # 0.333 K/sec
>> 1 cpu-migrations # 0.333 K/sec
>> 63 page-faults # 0.021 M/sec
>> 2986734 cycles # 0.994 GHz
>> 1087466 instructions # 0.36 insn per cycle
>> 255209 branches # 84.947 M/sec
>> 26002 branch-misses # 10.19% of all branches
>>
>> 0.003474829 seconds time elapsed
>>
>> 0.003519000 seconds user
>> 0.000000000 seconds sys
>> ------------------------------>8------------------------
>>
>> Note how much more meaningful is the second output - time spent for
>> execution pretty much matches number of cycles spent (we're running
>> @ 1GHz here).
>>
>> Signed-off-by: Alexey Brodkin <[email protected]>
>> Cc: Daniel Lezcano <[email protected]>
>> Cc: Vineet Gupta <[email protected]>
>> Cc: Thomas Gleixner <[email protected]>
>> ---
>
> Acked-by: Vineet Gupta <[email protected]>
>
> @Daniel is this going via timer tree or you want me to pick it up.
Applied, thanks
-- Daniel
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
On 05/11/2018 15:39, Daniel Lezcano wrote:
> On 24/10/2018 00:33, Vineet Gupta wrote:
>> On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
>>> It turned out we used to use default implementation of sched_clock()
>>> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
>>> by default we had 10 msec granularity of time measurement.
>>>
>>> Now given ARC built-in timers are clocked with the same frequency as
>>> CPU cores we may get much higher precision of time tracking.
>>>
>>> Thus we switch to generic sched_clock which really reads ARC hardware
>>> counters.
>>>
>>> This is especially helpful for measuring short events.
>>> That's what we used to have:
>>> ------------------------------>8------------------------
>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>
>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>
>>> 10.000000 task-clock (msec) # 2.832 CPUs utilized
>>> 1 context-switches # 0.100 K/sec
>>> 1 cpu-migrations # 0.100 K/sec
>>> 63 page-faults # 0.006 M/sec
>>> 3049480 cycles # 0.305 GHz
>>> 1091259 instructions # 0.36 insn per cycle
>>> 256828 branches # 25.683 M/sec
>>> 27026 branch-misses # 10.52% of all branches
>>>
>>> 0.003530687 seconds time elapsed
>>>
>>> 0.000000000 seconds user
>>> 0.010000000 seconds sys
>>> ------------------------------>8------------------------
>>>
>>> And now we'll see:
>>> ------------------------------>8------------------------
>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>
>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>
>>> 3.004322 task-clock (msec) # 0.865 CPUs utilized
>>> 1 context-switches # 0.333 K/sec
>>> 1 cpu-migrations # 0.333 K/sec
>>> 63 page-faults # 0.021 M/sec
>>> 2986734 cycles # 0.994 GHz
>>> 1087466 instructions # 0.36 insn per cycle
>>> 255209 branches # 84.947 M/sec
>>> 26002 branch-misses # 10.19% of all branches
>>>
>>> 0.003474829 seconds time elapsed
>>>
>>> 0.003519000 seconds user
>>> 0.000000000 seconds sys
>>> ------------------------------>8------------------------
>>>
>>> Note how much more meaningful is the second output - time spent for
>>> execution pretty much matches number of cycles spent (we're running
>>> @ 1GHz here).
>>>
>>> Signed-off-by: Alexey Brodkin <[email protected]>
>>> Cc: Daniel Lezcano <[email protected]>
>>> Cc: Vineet Gupta <[email protected]>
>>> Cc: Thomas Gleixner <[email protected]>
>>> ---
>>
>> Acked-by: Vineet Gupta <[email protected]>
>>
>> @Daniel is this going via timer tree or you want me to pick it up.
>
> I will take care of it.
Please resend without the arch Kconfig change
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Hi Daniel,
On Sun, 2018-11-18 at 03:17 +0100, Daniel Lezcano wrote:
> On 05/11/2018 15:39, Daniel Lezcano wrote:
> > On 24/10/2018 00:33, Vineet Gupta wrote:
> > > On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
> > > > It turned out we used to use default implementation of sched_clock()
> > > > from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> > > > by default we had 10 msec granularity of time measurement.
> > > >
> > > > Now given ARC built-in timers are clocked with the same frequency as
> > > > CPU cores we may get much higher precision of time tracking.
> > > >
> > > > Thus we switch to generic sched_clock which really reads ARC hardware
> > > > counters.
> > > >
> > > > This is especially helpful for measuring short events.
> > > > That's what we used to have:
> > > > ------------------------------>8------------------------
> > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > >
> > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > >
> > > > 10.000000 task-clock (msec) # 2.832 CPUs utilized
> > > > 1 context-switches # 0.100 K/sec
> > > > 1 cpu-migrations # 0.100 K/sec
> > > > 63 page-faults # 0.006 M/sec
> > > > 3049480 cycles # 0.305 GHz
> > > > 1091259 instructions # 0.36 insn per cycle
> > > > 256828 branches # 25.683 M/sec
> > > > 27026 branch-misses # 10.52% of all branches
> > > >
> > > > 0.003530687 seconds time elapsed
> > > >
> > > > 0.000000000 seconds user
> > > > 0.010000000 seconds sys
> > > > ------------------------------>8------------------------
> > > >
> > > > And now we'll see:
> > > > ------------------------------>8------------------------
> > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > >
> > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > >
> > > > 3.004322 task-clock (msec) # 0.865 CPUs utilized
> > > > 1 context-switches # 0.333 K/sec
> > > > 1 cpu-migrations # 0.333 K/sec
> > > > 63 page-faults # 0.021 M/sec
> > > > 2986734 cycles # 0.994 GHz
> > > > 1087466 instructions # 0.36 insn per cycle
> > > > 255209 branches # 84.947 M/sec
> > > > 26002 branch-misses # 10.19% of all branches
> > > >
> > > > 0.003474829 seconds time elapsed
> > > >
> > > > 0.003519000 seconds user
> > > > 0.000000000 seconds sys
> > > > ------------------------------>8------------------------
> > > >
> > > > Note how much more meaningful is the second output - time spent for
> > > > execution pretty much matches number of cycles spent (we're running
> > > > @ 1GHz here).
> > > >
> > > > Signed-off-by: Alexey Brodkin <[email protected]>
> > > > Cc: Daniel Lezcano <[email protected]>
> > > > Cc: Vineet Gupta <[email protected]>
> > > > Cc: Thomas Gleixner <[email protected]>
> > > > ---
> > >
> > > Acked-by: Vineet Gupta <[email protected]>
> > >
> > > @Daniel is this going via timer tree or you want me to pick it up.
> >
> > I will take care of it.
>
> Please resend without the arch Kconfig change
I'm wondering if there's a problem with the arch/arc/Kconfig change going
through your tree? This way it will be a really atomic change and it will be
much easier to back-port (and that's what we'd really like to happen).
If Vineet is OK with that IMHO it's safe to keep it in the one and only commit.
Otherwise should I just split this patch in 2 and still submit them as a series, or
have 2 completely unrelated patches - one for you and one for Vineet?
In that case do I understand correctly that we may enable GENERIC_SCHED_CLOCK
for ARC even before the proposed change for arc_timer.c gets merged - i.e. with no
special GENERIC_SCHED_CLOCK driver we'll safely fall back to the jiffies-based
sched clock which we use now anyway when GENERIC_SCHED_CLOCK is disabled, right?
-Alexey
On 19/11/2018 10:31, Alexey Brodkin wrote:
> Hi Daniel,
>
> On Sun, 2018-11-18 at 03:17 +0100, Daniel Lezcano wrote:
>> On 05/11/2018 15:39, Daniel Lezcano wrote:
>>> On 24/10/2018 00:33, Vineet Gupta wrote:
>>>> On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
>>>>> It turned out we used to use default implementation of sched_clock()
>>>>> from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
>>>>> by default we had 10 msec granularity of time measurement.
>>>>>
>>>>> Now given ARC built-in timers are clocked with the same frequency as
>>>>> CPU cores we may get much higher precision of time tracking.
>>>>>
>>>>> Thus we switch to generic sched_clock which really reads ARC hardware
>>>>> counters.
>>>>>
>>>>> This is especially helpful for measuring short events.
>>>>> That's what we used to have:
>>>>> ------------------------------>8------------------------
>>>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>>>
>>>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>>>
>>>>> 10.000000 task-clock (msec) # 2.832 CPUs utilized
>>>>> 1 context-switches # 0.100 K/sec
>>>>> 1 cpu-migrations # 0.100 K/sec
>>>>> 63 page-faults # 0.006 M/sec
>>>>> 3049480 cycles # 0.305 GHz
>>>>> 1091259 instructions # 0.36 insn per cycle
>>>>> 256828 branches # 25.683 M/sec
>>>>> 27026 branch-misses # 10.52% of all branches
>>>>>
>>>>> 0.003530687 seconds time elapsed
>>>>>
>>>>> 0.000000000 seconds user
>>>>> 0.010000000 seconds sys
>>>>> ------------------------------>8------------------------
>>>>>
>>>>> And now we'll see:
>>>>> ------------------------------>8------------------------
>>>>> $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
>>>>>
>>>>> Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
>>>>>
>>>>> 3.004322 task-clock (msec) # 0.865 CPUs utilized
>>>>> 1 context-switches # 0.333 K/sec
>>>>> 1 cpu-migrations # 0.333 K/sec
>>>>> 63 page-faults # 0.021 M/sec
>>>>> 2986734 cycles # 0.994 GHz
>>>>> 1087466 instructions # 0.36 insn per cycle
>>>>> 255209 branches # 84.947 M/sec
>>>>> 26002 branch-misses # 10.19% of all branches
>>>>>
>>>>> 0.003474829 seconds time elapsed
>>>>>
>>>>> 0.003519000 seconds user
>>>>> 0.000000000 seconds sys
>>>>> ------------------------------>8------------------------
>>>>>
>>>>> Note how much more meaningful is the second output - time spent for
>>>>> execution pretty much matches number of cycles spent (we're running
>>>>> @ 1GHz here).
>>>>>
>>>>> Signed-off-by: Alexey Brodkin <[email protected]>
>>>>> Cc: Daniel Lezcano <[email protected]>
>>>>> Cc: Vineet Gupta <[email protected]>
>>>>> Cc: Thomas Gleixner <[email protected]>
>>>>> ---
>>>>
>>>> Acked-by: Vineet Gupta <[email protected]>
>>>>
>>>> @Daniel is this going via timer tree or you want me to pick it up.
>>>
>>> I will take care of it.
>>
>> Please resend without the arch Kconfig change
>
> I'm wondering if there's a problem with arc/arc/Kconfig change going
> through your tree? This way it will be really atomic change and it will be
> much easier to back-port (and that's what we'd really like to happen).
>
> If Vineet is OK with that IMHO it's safe to keep it in the one and only commit.
>
> Otherwise should I just split this patch in 2 and still submit them as series or
> have 2 completely not-related patches one for you and one for Vineet?
>
> In that case do I understand correctly that we may enable GENERIC_SCHED_CLOCK
> for ARC even before proposed change for arc_timer.c gets merged - i.e. with no
> special GENERIC_SCHED_CLOCK driver we'll safely fall-back to jiffie-based
> sched clock which we anyways use now when GENERIC_SCHED_CLOCK is disabled, right?
The ARC's Kconfig part does not apply on tip/timers/core.
As you described, sending a separate arc timer change is fine IMO.
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Hi Daniel,
On Mon, 2018-11-19 at 10:43 +0100, Daniel Lezcano wrote:
> On 19/11/2018 10:31, Alexey Brodkin wrote:
> > Hi Daniel,
> >
> > On Sun, 2018-11-18 at 03:17 +0100, Daniel Lezcano wrote:
> > > On 05/11/2018 15:39, Daniel Lezcano wrote:
> > > > On 24/10/2018 00:33, Vineet Gupta wrote:
> > > > > On 10/17/2018 04:30 AM, Alexey Brodkin wrote:
> > > > > > It turned out we used to use default implementation of sched_clock()
> > > > > > from kernel/sched/clock.c which was as precise as 1/HZ, i.e.
> > > > > > by default we had 10 msec granularity of time measurement.
> > > > > >
> > > > > > Now given ARC built-in timers are clocked with the same frequency as
> > > > > > CPU cores we may get much higher precision of time tracking.
> > > > > >
> > > > > > Thus we switch to generic sched_clock which really reads ARC hardware
> > > > > > counters.
> > > > > >
> > > > > > This is especially helpful for measuring short events.
> > > > > > That's what we used to have:
> > > > > > ------------------------------>8------------------------
> > > > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > > > >
> > > > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > > > >
> > > > > > 10.000000 task-clock (msec) # 2.832 CPUs utilized
> > > > > > 1 context-switches # 0.100 K/sec
> > > > > > 1 cpu-migrations # 0.100 K/sec
> > > > > > 63 page-faults # 0.006 M/sec
> > > > > > 3049480 cycles # 0.305 GHz
> > > > > > 1091259 instructions # 0.36 insn per cycle
> > > > > > 256828 branches # 25.683 M/sec
> > > > > > 27026 branch-misses # 10.52% of all branches
> > > > > >
> > > > > > 0.003530687 seconds time elapsed
> > > > > >
> > > > > > 0.000000000 seconds user
> > > > > > 0.010000000 seconds sys
> > > > > > ------------------------------>8------------------------
> > > > > >
> > > > > > And now we'll see:
> > > > > > ------------------------------>8------------------------
> > > > > > $ perf stat /bin/sh -c /root/lmbench-master/bin/arc/hello > /dev/null
> > > > > >
> > > > > > Performance counter stats for '/bin/sh -c /root/lmbench-master/bin/arc/hello':
> > > > > >
> > > > > > 3.004322 task-clock (msec) # 0.865 CPUs utilized
> > > > > > 1 context-switches # 0.333 K/sec
> > > > > > 1 cpu-migrations # 0.333 K/sec
> > > > > > 63 page-faults # 0.021 M/sec
> > > > > > 2986734 cycles # 0.994 GHz
> > > > > > 1087466 instructions # 0.36 insn per cycle
> > > > > > 255209 branches # 84.947 M/sec
> > > > > > 26002 branch-misses # 10.19% of all branches
> > > > > >
> > > > > > 0.003474829 seconds time elapsed
> > > > > >
> > > > > > 0.003519000 seconds user
> > > > > > 0.000000000 seconds sys
> > > > > > ------------------------------>8------------------------
> > > > > >
> > > > > > Note how much more meaningful is the second output - time spent for
> > > > > > execution pretty much matches number of cycles spent (we're running
> > > > > > @ 1GHz here).
> > > > > >
> > > > > > Signed-off-by: Alexey Brodkin <[email protected]>
> > > > > > Cc: Daniel Lezcano <[email protected]>
> > > > > > Cc: Vineet Gupta <[email protected]>
> > > > > > Cc: Thomas Gleixner <[email protected]>
> > > > > > ---
> > > > >
> > > > > Acked-by: Vineet Gupta <[email protected]>
> > > > >
> > > > > @Daniel is this going via timer tree or you want me to pick it up.
> > > >
> > > > I will take care of it.
> > >
> > > Please resend without the arch Kconfig change
> >
> > I'm wondering if there's a problem with arc/arc/Kconfig change going
> > through your tree? This way it will be really atomic change and it will be
> > much easier to back-port (and that's what we'd really like to happen).
> >
> > If Vineet is OK with that IMHO it's safe to keep it in the one and only commit.
> >
> > Otherwise should I just split this patch in 2 and still submit them as series or
> > have 2 completely not-related patches one for you and one for Vineet?
> >
> > In that case do I understand correctly that we may enable GENERIC_SCHED_CLOCK
> > for ARC even before proposed change for arc_timer.c gets merged - i.e. with no
> > special GENERIC_SCHED_CLOCK driver we'll safely fall-back to jiffie-based
> > sched clock which we anyways use now when GENERIC_SCHED_CLOCK is disabled, right?
>
> The ARC's Kconfig part does not apply on tip/timers/core.
May I have a reference to that tree so I may check what gets in the way there
and then I'll just move "select GENERIC_SCHED_CLOCK" a bit below so there're no
clashes any longer.
-Alexey
Hi Alexey,
On 19/11/2018 10:55, Alexey Brodkin wrote:
> Hi Daniel,
[ ... ]
>>> In that case do I understand correctly that we may enable GENERIC_SCHED_CLOCK
>>> for ARC even before proposed change for arc_timer.c gets merged - i.e. with no
>>> special GENERIC_SCHED_CLOCK driver we'll safely fall-back to jiffie-based
>>> sched clock which we anyways use now when GENERIC_SCHED_CLOCK is disabled, right?
>>
>> The ARC's Kconfig part does not apply on tip/timers/core.
>
> May I have a reference to that tree so I may check what gets in the way there
> and then I'll just move "select GENERIC_SCHED_CLOCK" a bit below so there're no
> clashes any longer.
sure, here it is:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=timers/core
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Hi Daniel,
On Mon, 2018-11-19 at 10:58 +0100, Daniel Lezcano wrote:
> Hi Alexey,
>
> On 19/11/2018 10:55, Alexey Brodkin wrote:
> > Hi Daniel,
>
> [ ... ]
>
> > > > In that case do I understand correctly that we may enable GENERIC_SCHED_CLOCK
> > > > for ARC even before proposed change for arc_timer.c gets merged - i.e. with no
> > > > special GENERIC_SCHED_CLOCK driver we'll safely fall-back to jiffie-based
> > > > sched clock which we anyways use now when GENERIC_SCHED_CLOCK is disabled, right?
> > >
> > > The ARC's Kconfig part does not apply on tip/timers/core.
> >
> > May I have a reference to that tree so I may check what gets in the way there
> > and then I'll just move "select GENERIC_SCHED_CLOCK" a bit below so there're no
> > clashes any longer.
>
> sure, here it is:
>
> https://urldefense.proofpoint.com/v2/url?u=https-3A__git.kernel.org_pub_scm_linux_kernel_git_tip_tip.git_log_-3Fh-3Dtimers_core&d=DwIDaQ&c=DPL6_X_6JkXFx7AXWqB0tg&r=lqdeeSSEes0GFDDl656eViXO7breS55ytWkhpk5R81I&m=r2hJtdjpPi2Y1yDo7wgo7nKu7MV8MU8gRPg3DD1mcyI&s=NOTZZBd25Vlk8-jmMUHxtpEhfxlkanXdvKN3-HQSc9w&e=
Well, I did have it based on 4.18 for some reason, hence those clashes.
Rebased on top of Linus' up-to-date tree and re-sent v3.
-Alexey