LinuxLists.cc - [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Mon, Jul 29, 2013 at 12:18:11PM +0200, Thomas Gleixner wrote:

> The reason why we want to do that is:
>
> timer expiry time
> A 100us -> programmed hardware event
> B 2000us
>
> Restart timer A with an expiry time of 3000us without reprogramming:
>
> timer expiry time
> NONE 100us -> programmed hardware event
> B 2000us
> A 3000us
>
> We take an extra wakeup for reprogramming the hardware, which is
> counterproductive for power saving. So disabling it blindly will cause
> a regresssion for other people. We need to be smarter than that.

So aside from the complete mess posted; how about something like this?

*completely untested etc..*

---
include/linux/hrtimer.h | 5 +++++
kernel/hrtimer.c | 60 +++++++++++++++++++++++--------------------------
2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d19a5c2..f2bcb9c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -185,6 +185,7 @@ struct hrtimer_cpu_base {
ktime_t expires_next;
int hres_active;
int hang_detected;
+ int timers_removed;
unsigned long nr_events;
unsigned long nr_retries;
unsigned long nr_hangs;
@@ -261,6 +262,8 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;

+extern void hrtimer_enter_idle(void);
+
extern void hrtimer_interrupt(struct clock_event_device *dev);

/*
@@ -296,6 +299,8 @@ extern void clock_was_set_delayed(void);
# define MONOTONIC_RES_NSEC LOW_RES_NSEC
# define KTIME_MONOTONIC_RES KTIME_LOW_RES

+static inline void hrtimer_enter_idle(void) { }
+
static inline void hrtimer_peek_ahead_timers(void) { }

/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f0f4fe2..ffb16d3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -673,6 +673,27 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
}

+static void __hrtimer_reprogram_all(struct hrtimer_cpu_base *base)
+{
+ if (!hrtimer_hres_active())
+ return;
+
+ raw_spin_lock(&base->lock);
+ hrtimer_update_base(base);
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+void hrtimer_enter_idle(void)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+
+ if (base->timers_removed) {
+ base->timers_removed = 0;
+ __hrtimer_reprogramm_all(base);
+ }
+}
+
/*
* Retrigger next event is called after clock was set
*
@@ -682,13 +703,7 @@ static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);

- if (!hrtimer_hres_active())
- return;
-
- raw_spin_lock(&base->lock);
- hrtimer_update_base(base);
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
+ __hrtimer_reprogram_all(base);
}

/*
@@ -907,7 +922,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ unsigned long newstate)
{
struct timerqueue_node *next_timer;
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
@@ -915,19 +930,10 @@ static void __remove_hrtimer(struct hrtimer *timer,

next_timer = timerqueue_getnext(&base->active);
timerqueue_del(&base->active, &timer->node);
- if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active()) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
- hrtimer_force_reprogram(base->cpu_base, 1);
- }
+ if (hrtimer_hres_active() && &timer->node == next_timer)
+ base->cpu_base->timers_removed++;
#endif
- }
if (!timerqueue_getnext(&base->active))
base->cpu_base->active_bases &= ~(1 << base->index);
out:
@@ -942,26 +948,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
unsigned long state;
- int reprogram;

- /*
- * Remove the timer and force reprogramming when high
- * resolution mode is active and the timer is on the current
- * CPU. If we remove a timer on another CPU, reprogramming is
- * skipped. The interrupt event on this CPU is fired and
- * reprogramming happens in the interrupt handler. This is a
- * rare case and less expensive than a smp call.
- */
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
- reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
/*
* We must preserve the CALLBACK state flag here,
* otherwise we could move the timer base in
* switch_hrtimer_base.
*/
state = timer->state & HRTIMER_STATE_CALLBACK;
- __remove_hrtimer(timer, base, state, reprogram);
+ __remove_hrtimer(timer, base, state);
return 1;
}
return 0;
@@ -1243,7 +1239,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
WARN_ON(!irqs_disabled());

debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK);
timer_stats_account_hrtimer(timer);
fn = timer->function;

@@ -1690,7 +1686,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not

2013-07-30 09:35:27

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Tue, Jul 30, 2013 at 05:12:49PM +0800, Ethan Zhao wrote:
> On Mon, Jul 29, 2013 at 6:18 PM, Thomas Gleixner <[email protected]> wrote:
> > The test case does not involve anything hrtimer related. Do you have
> > CONFIG_SCHED_HRTICK enabled?
> >
>
> Yes. it is default configured in stable release.
> CONFIG_SCHED_HRTICK=y

Should still be disabled by default even if supported:

# grep HRTICK kernel/sched/features.h
SCHED_FEAT(HRTICK, false)

> > First of all we want to know, which particular hrtimer is causing that
> > issue. If it is the hrtick one, then I really have to ask why you want
> > to use it at all in such a high performance scenario.
> >
> > Any advice about the HZ in high performance scenario ? hrtimer tick
> Is not fit for high performance ?

Hence why its disabled, programming the timer hardware is too expensive.
But since you didn't even know that I suspect you aren't in fact using
it.

It would be good if you could do what Thomas suggested and look at which
timer is actually active during your workload.

2013-07-30 11:44:05

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Tue, Jul 30, 2013 at 5:35 PM, Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Jul 30, 2013 at 05:12:49PM +0800, Ethan Zhao wrote:
> > On Mon, Jul 29, 2013 at 6:18 PM, Thomas Gleixner <[email protected]> wrote:
> > > The test case does not involve anything hrtimer related. Do you have
> > > CONFIG_SCHED_HRTICK enabled?
> > >
> >
> > Yes. it is default configured in stable release.
> > CONFIG_SCHED_HRTICK=y
>
> Should still be disabled by default even if supported:
>
> # grep HRTICK kernel/sched/features.h
> SCHED_FEAT(HRTICK, false)
>
>
> > > First of all we want to know, which particular hrtimer is causing that
> > > issue. If it is the hrtick one, then I really have to ask why you want
> > > to use it at all in such a high performance scenario.
> > >
> > > Any advice about the HZ in high performance scenario ? hrtimer tick
> > Is not fit for high performance ?
>
> Hence why its disabled, programming the timer hardware is too expensive.
> But since you didn't even know that I suspect you aren't in fact using
> it.
>
Got it.
what tglx and you mean

SCHED_FEAT(HRTICK, 0)

then no hrtimer operation in
void __sched __schedule(void)
{
? ?
if (sched_feat(HRTICK))
hrtick_clear(rq);
? ?

Yup, So what I am facing is not HRTICK.
But that doesn't move my eyes away from hrtimer and suspect
reprogramming delay the scheduling.
The call stack looks like following :

cpu_idle()
{
? ?
tick_nohz_stop_sched_tick()
--> hrtimer_start(); --> __hrtimer_start_range_ns() -- > remove_hrtimer()
-- > raise_softirq_irqoff(TIMER_SOFTIRQ);
---->run_timer_softirq() --> tick_sched_timer() -- >
hrtimer_start_expires
? ?

? ...
tick_nohz_restart_sched_tick()
? ...
schedule()
? ...
}

So the expensive thing maybe not inside the schedule(), but could
outside the scheduler(), the more bigger forever loop.

This is one part of what I am facing.

Thanks
Ethan

>
> It would be good if you could do what Thomas suggested and look at which
> timer is actually active during your workload.

2013-07-30 11:59:16

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Tue, Jul 30, 2013 at 07:44:03PM +0800, Ethan Zhao wrote:
> Got it.
> what tglx and you mean
>
>
> So the expensive thing maybe not inside the schedule(), but could
> outside the scheduler(), the more bigger forever loop.
>
> This is one part of what I am facing.

Right, so it would be good if you could further diagnose the problem so
we can come up with a solution that cures the problem while retaining
the current 'desired' properties.

The patch you pinpointed caused a regression in that it would wake from
NOHZ mode far too often. Could it be that the now longer idle sections
cause your CPU to go into deeper idle modes and you're suffering from
idle-exit latencies?

2013-08-03 06:56:13

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Peter and tglx,

About which hrtimer causes the performance issue, I did some test with my home server because I am on vacation, An ASUS PC server with 4 core Intel i5 cpu inside,
Running the Stable 3.11RC3+ (removed reschedule IPI), almost got the same result as I did with Sun Servers. I suspect it is the tick_sched_timer, the evidence as following:
While running some test tools such as http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c, the powertop
tool shows following result:

PowerTOP version 1.11 (C) 2007 Intel Corporation

Cn Avg residency P-states (frequencies)
C0 (cpu running) ( 2.2%) Turbo Mode 0.0%
polling 0.0ms ( 0.0%) 3.21 Ghz 0.0%
C1 mwait 0.6ms ( 0.9%) 3.10 Ghz 0.0%
C2 mwait 0.5ms ( 0.1%) 3.00 Ghz 0.0%
C3 mwait 0.9ms (96.7%) 1.60 Ghz 100.0%

Wakeups-from-idle per second : 1100.9 interval: 0.3s
no ACPI power usage estimate available

Top causes for wakeups:
23.5% ( inf) swapper/3 : hrtimer_start_range_ns (tick_sched_timer)
23.5% ( inf) swapper/0 : hrtimer_start_range_ns (tick_sched_timer)
23.5% ( inf) swapper/2 : hrtimer_start_range_ns (tick_sched_timer)
23.5% ( inf) swapper/1 : hrtimer_start_range_ns (tick_sched_timer)
2.4% ( inf) <interrupt> : xhci_hcd
2.4% ( inf) USB device 3-4 : Basic Optical Mouse (Microsoft)
0.3% ( inf) rcu_sched : rcu_gp_kthread (process_timeout)
0.2% ( inf) gnome-terminal : hrtimer_start_range_ns (hrtimer_wakeup)
0.2% ( inf) <interrupt> : PS/2 keyboard/mouse/touchpad
0.1% ( inf) swapper/0 : hrtimer_start (menu_hrtimer_notify)
0.1% ( inf) swapper/0 : clocksource_watchdog
(clocksource_watchdog)
0.1% ( inf) cimserver : hrtimer_start_range_ns (hrtimer_wakeup)
0.1% ( inf) avahi-daemon : hrtimer_start_range_ns (hrtimer_wakeup)
0.1% ( inf) kworker/0:1 : queue_delayed_work_on
(delayed_work_timer_fn)
0.1% ( inf) rtkit-daemon : hrtimer_start_range_ns (hrtimer_wakeup)

And the /proc/timers_list

[root@localhost proc]# cat timer_list
Timer List Version: v0.7
HRTIMER_MAX_CLOCK_BASES: 4
now at 1463485951666 nsecs

cpu: 0
clock 0:
.base: ffff88021ea0d2c0
.index: 0
.resolution: 1 nsecs
.get_time: ktime_get
.offset: 0 nsecs
active timers:
#0: <ffff88021ea0d780>, tick_sched_timer, S:01, hrtimer_start_range_ns, swapper/0/1
# expires at 1463486000000-1463486000000 nsecs [in 48507 to 48507 nsecs]
#1: <ffff88021ea0d900>, watchdog_timer_fn, S:01, hrtimer_start, watchdog/0/10
# expires at 1464038000000-1464038000000 nsecs [in 552048507 to 552048507 nsecs]
#2: <ffff880215b139d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, gnome-power-man/7341
# expires at 1467041141813-1467051131812 nsecs [in 3555190320 to 3565180319 nsecs]
#3: <ffff8802118eb8c8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, ntpd/6806
# expires at 1469048348291-1469061256427 nsecs [in 5562396798 to 5575304934 nsecs]
#4: <ffff88020f34b8c8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, cimservermain/6993
# expires at 1521114040522-1521214040522 nsecs [in 57628089029 to 57728089029 nsecs]
#5: <ffff880215b57d78>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, master/6886
# expires at 1521833602390-1521892602389 nsecs [in 58347650897 to 58406650896 nsecs]
#6: <ffff880213c9c238>, it_real_fn, S:01, hrtimer_start, master/6886
# expires at 1795833601062-1795833601062 nsecs [in 332347649569 to 332347649569 nsecs]
clock 1:
.base: ffff88021ea0d300
.index: 1
.resolution: 1 nsecs
.get_time: ktime_get_real
.offset: 1375508485789320552 nsecs
active timers:
#0: <ffff8802157e5e28>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, automount/6767
# expires at 1375509966602894000-1375509966602944000 nsecs [in 1375508503116942507 to 1375508503116992507 nsecs]
clock 2:
.base: ffff88021ea0d340
.index: 2
.resolution: 1 nsecs
.get_time: ktime_get_boottime
.offset: 0 nsecs
active timers:
clock 3:
.base: ffff88021ea0d380
.index: 3
.resolution: 1 nsecs
.get_time: ktime_get_clocktai
.offset: 1375508485789320552 nsecs
active timers:
.expires_next : 1463487000000 nsecs
.hres_active : 1
.nr_events : 1489177
.nr_retries : 145
.nr_hangs : 0
.max_hang_time : 0 nsecs
.nohz_mode : 0
.last_tick : 0 nsecs
.tick_stopped : 0
.idle_jiffies : 0
.idle_calls : 0
.idle_sleeps : 0
.idle_entrytime : 0 nsecs
.idle_waketime : 0 nsecs
.idle_exittime : 0 nsecs
.idle_sleeptime : 0 nsecs
.iowait_sleeptime: 0 nsecs
.last_jiffies : 0
.next_jiffies : 0
.idle_expires : 0 nsecs
jiffies: 4296130782

cpu: 1
clock 0:
.base: ffff88021ea8d2c0
.index: 0
.resolution: 1 nsecs
.get_time: ktime_get
.offset: 0 nsecs
active timers:
#0: <ffff88021ea8d780>, tick_sched_timer, S:01, hrtimer_start_range_ns, swapper/1/0
# expires at 1463487000000-1463487000000 nsecs [in 1048507 to 1048507 nsecs]
#1: <ffff88021ea8d900>, watchdog_timer_fn, S:01, hrtimer_start, watchdog/1/11
# expires at 1464038000000-1464038000000 nsecs [in 552048507 to 552048507 nsecs]
#2: <ffff88020e2739d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, hald-addon-stor/6726
# expires at 1464211071890-1464213064889 nsecs [in 725120397 to 727113396 nsecs]
#3: <ffff8802157c99d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, rtkit-daemon/7358
# expires at 1467525043210-1467530043209 nsecs [in 4039091717 to 4044091716 nsecs]
#4: <ffff880213ec19d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, rtkit-daemon/7359
# expires at 1472525038227-1472525038227 nsecs [in 9039086734 to 9039086734 nsecs]
#5: <ffff880211a29ea8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, crond/6941
# expires at 1475497502106-1475497552106 nsecs [in 12011550613 to 12011600613 nsecs]
#6: <ffff88021519d9d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, hald/6673
# expires at 1490211101222-1490241071221 nsecs [in 26725149729 to 26755119728 nsecs]
#7: <ffff880215055d78>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, certmonger/7048
# expires at 1492497024071-1492527024070 nsecs [in 29011072578 to 29041072577 nsecs]
#8: <ffff88021182d8c8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, rsyslogd/6412
# expires at 86416926181907-86417026181907 nsecs [in 84953440230414 to 84953540230414 nsecs]
clock 1:
.base: ffff88021ea8d300
.index: 1
.resolution: 1 nsecs
.get_time: ktime_get_real
.offset: 1375508485789320552 nsecs
active timers:
clock 2:
.base: ffff88021ea8d340
.index: 2
.resolution: 1 nsecs
.get_time: ktime_get_boottime
.offset: 0 nsecs
active timers:
clock 3:
.base: ffff88021ea8d380
.index: 3
.resolution: 1 nsecs
.get_time: ktime_get_clocktai
.offset: 1375508485789320552 nsecs
active timers:
.expires_next : 1463487000000 nsecs
.hres_active : 1
.nr_events : 1505848
.nr_retries : 273
.nr_hangs : 0
.max_hang_time : 0 nsecs
.nohz_mode : 0
.last_tick : 0 nsecs
.tick_stopped : 0
.idle_jiffies : 0
.idle_calls : 0
.idle_sleeps : 0
.idle_entrytime : 0 nsecs
.idle_waketime : 0 nsecs
.idle_exittime : 0 nsecs
.idle_sleeptime : 0 nsecs
.iowait_sleeptime: 0 nsecs
.last_jiffies : 0
.next_jiffies : 0
.idle_expires : 0 nsecs
jiffies: 4296130782

cpu: 2
clock 0:
.base: ffff88021eb0d2c0
.index: 0
.resolution: 1 nsecs
.get_time: ktime_get
.offset: 0 nsecs
active timers:
#0: <ffff88021eb0d780>, tick_sched_timer, S:01, hrtimer_start_range_ns, swapper/2/0
# expires at 1463487000000-1463487000000 nsecs [in 1048507 to 1048507 nsecs]
#1: <ffff880213f918c8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, cimserver/6992
# expires at 1463603820300-1463604070299 nsecs [in 117868807 to 118118806 nsecs]
#2: <ffff88021eb0d900>, watchdog_timer_fn, S:01, hrtimer_start, watchdog/2/16
# expires at 1464050000000-1464050000000 nsecs [in 564048507 to 564048507 nsecs]
#3: <ffff8802156939d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, gnome-settings-/7329
# expires at 1467041063935-1467045059934 nsecs [in 3555112442 to 3559108441 nsecs]
#4: <ffff880214db39d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, gconfd-2/7310
# expires at 1467041085986-1467071055985 nsecs [in 3555134493 to 3585104492 nsecs]
#5: <ffff88020fb41ea8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, irqbalance/6442
# expires at 1468210681330-1468210731330 nsecs [in 4724729837 to 4724779837 nsecs]
#6: <ffff880211a5f9d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, rpcbind/6461
# expires at 1488949026351-1488979026350 nsecs [in 25463074858 to 25493074857 nsecs]
#7: <ffff8802141ed9d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, NetworkManager/6495
# expires at 1518210833956-1518310833956 nsecs [in 54724882463 to 54824882463 nsecs]
#8: <ffff88021492bd78>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, qmgr/6899
# expires at 1521592237483-1521692237483 nsecs [in 58106285990 to 58206285990 nsecs]
#9: <ffff880214a929b8>, it_real_fn, S:01, hrtimer_start, qmgr/6899
# expires at 1554592221196-1554592221196 nsecs [in 91106269703 to 91106269703 nsecs]
#10: <ffff880212685ea8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, atd/6952
# expires at 3620649306705-3620649356705 nsecs [in 2157163355212 to 2157163405212 nsecs]
clock 1:
.base: ffff88021eb0d300
.index: 1
.resolution: 1 nsecs
.get_time: ktime_get_real
.offset: 1375508485789320552 nsecs
active timers:
clock 2:
.base: ffff88021eb0d340
.index: 2
.resolution: 1 nsecs
.get_time: ktime_get_boottime
.offset: 0 nsecs
active timers:
clock 3:
.base: ffff88021eb0d380
.index: 3
.resolution: 1 nsecs
.get_time: ktime_get_clocktai
.offset: 1375508485789320552 nsecs
active timers:
.expires_next : 1463487000000 nsecs
.hres_active : 1
.nr_events : 1499325
.nr_retries : 236
.nr_hangs : 0
.max_hang_time : 0 nsecs
.nohz_mode : 0
.last_tick : 0 nsecs
.tick_stopped : 0
.idle_jiffies : 0
.idle_calls : 0
.idle_sleeps : 0
.idle_entrytime : 0 nsecs
.idle_waketime : 0 nsecs
.idle_exittime : 0 nsecs
.idle_sleeptime : 0 nsecs
.iowait_sleeptime: 0 nsecs
.last_jiffies : 0
.next_jiffies : 0
.idle_expires : 0 nsecs
jiffies: 4296130782

cpu: 3
clock 0:
.base: ffff88021eb8d2c0
.index: 0
.resolution: 1 nsecs
.get_time: ktime_get
.offset: 0 nsecs
active timers:
#0: <ffff88021eb8d780>, tick_sched_timer, S:01, hrtimer_start_range_ns, swapper/3/0
# expires at 1463487000000-1463487000000 nsecs [in 1048507 to 1048507 nsecs]
#1: <ffff88021eb8d900>, watchdog_timer_fn, S:01, hrtimer_start, watchdog/3/21
# expires at 1464062000000-1464062000000 nsecs [in 576048507 to 576048507 nsecs]
#2: <ffff880215b199d8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, gdm-simple-gree/7340
# expires at 1474212089328-1474272025327 nsecs [in 10726137835 to 10786073834 nsecs]
#3: <ffff88021402dea8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, sleep/7669
# expires at 1520896517118-1520896567118 nsecs [in 57410565625 to 57410615625 nsecs]
#4: <ffff880214befd78>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, pickup/6898
# expires at 1562833597553-1562933597552 nsecs [in 99347646060 to 99447646059 nsecs]
#5: <ffff88021277d8c8>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, Xorg/7216
# expires at 1826170099331-1826270099331 nsecs [in 362684147838 to 362784147838 nsecs]
#6: <ffff880214cda9b8>, it_real_fn, S:01, hrtimer_start, pickup/6898
# expires at 7462833595405-7462833595405 nsecs [in 5999347643912 to 5999347643912 nsecs]
clock 1:
.base: ffff88021eb8d300
.index: 1
.resolution: 1 nsecs
.get_time: ktime_get_real
.offset: 1375508485789320552 nsecs
active timers:
#0: <ffff880212e1de28>, hrtimer_wakeup, S:01, hrtimer_start_range_ns, cimservermain/6997
# expires at 1375509967695245000-1375509967695295000 nsecs [in 1375508504209293507 to 1375508504209343507 nsecs]
clock 2:
.base: ffff88021eb8d340
.index: 2
.resolution: 1 nsecs
.get_time: ktime_get_boottime
.offset: 0 nsecs
active timers:
clock 3:
.base: ffff88021eb8d380
.index: 3
.resolution: 1 nsecs
.get_time: ktime_get_clocktai
.offset: 1375508485789320552 nsecs
active timers:
.expires_next : 1463487000000 nsecs
.hres_active : 1
.nr_events : 1490001
.nr_retries : 347
.nr_hangs : 0
.max_hang_time : 0 nsecs
.nohz_mode : 0
.last_tick : 0 nsecs
.tick_stopped : 0
.idle_jiffies : 0
.idle_calls : 0
.idle_sleeps : 0
.idle_entrytime : 0 nsecs
.idle_waketime : 0 nsecs
.idle_exittime : 0 nsecs
.idle_sleeptime : 0 nsecs
.iowait_sleeptime: 0 nsecs
.last_jiffies : 0
.next_jiffies : 0
.idle_expires : 0 nsecs
jiffies: 4296130782

Tick Device: mode: 1
Broadcast device
Clock Event Device: hpet
max_delta_ns: 149983013276
min_delta_ns: 13409
mult: 61496111
shift: 32
mode: 1
next_event: 9223372036854775807 nsecs
set_next_event: hpet_legacy_next_event
set_mode: hpet_legacy_set_mode
event_handler: tick_handle_oneshot_broadcast
retries: 0

tick_broadcast_mask: 00000000
tick_broadcast_oneshot_mask: 00000000

Tick Device: mode: 1
Per CPU device: 0
Clock Event Device: lapic
max_delta_ns: 1377930800842
min_delta_ns: 1000
mult: 13387279
shift: 27
mode: 3
next_event: 1463487000000 nsecs
set_next_event: lapic_next_deadline
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 0

Tick Device: mode: 1
Per CPU device: 1
Clock Event Device: lapic
max_delta_ns: 1377930800842
min_delta_ns: 1000
mult: 13387279
shift: 27
mode: 3
next_event: 1463487000000 nsecs
set_next_event: lapic_next_deadline
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 1

Tick Device: mode: 1
Per CPU device: 2
Clock Event Device: lapic
max_delta_ns: 1377930800842
min_delta_ns: 1000
mult: 13387279
shift: 27
mode: 3
next_event: 1463487000000 nsecs
set_next_event: lapic_next_deadline
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 0

Tick Device: mode: 1
Per CPU device: 3
Clock Event Device: lapic
max_delta_ns: 1377930800842
min_delta_ns: 1000
mult: 13387279
shift: 27
mode: 3
next_event: 1463487000000 nsecs
set_next_event: lapic_next_deadline
set_mode: lapic_timer_setup
event_handler: hrtimer_interrupt
retries: 0

Note, I removed the reshcedule IPI for test reason, or reschedule IPI is the first reason to wake-up (In another mail I will tell why I removed the reschedule IPI).
The tick_sched_timers are issued by:

cpu_idle_loop() => tick_nohz_idle_enter() = > tick_nohz_stop_sched_tick() = > raise_softirq_irqoff(TIMER_SOFTIRQ)----------------------------------->+
run_timer_softirq() => tick_sched_timer()=> tick_sched_handle()=>update_process_times()=> run_local_timers() =>raise_softirq_irqoff(TIMER_SOFTIRQ)-->+
^----------------------------------------------------------------------------------------------------------------------------------------------------<

Thanks,
Ethan

?? 2013-7-30??????7:59??Peter Zijlstra <[email protected]> д????

> On Tue, Jul 30, 2013 at 07:44:03PM +0800, Ethan Zhao wrote:
>> Got it.
>> what tglx and you mean
>>
>>
>> So the expensive thing maybe not inside the schedule(), but could
>> outside the scheduler(), the more bigger forever loop.
>>
>> This is one part of what I am facing.
>
> Right, so it would be good if you could further diagnose the problem so
> we can come up with a solution that cures the problem while retaining
> the current 'desired' properties.
>
> The patch you pinpointed caused a regression in that it would wake from
> NOHZ mode far too often. Could it be that the now longer idle sections
> cause your CPU to go into deeper idle modes and you're suffering from
> idle-exit latencies?

2013-08-03 07:38:01

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Peter and tglx,
Some other tough hacking and testing with result FYI,
With the default kernel 2.6.32-279.19.1.el6.x86_64 in CentOS 6.3 running on my ASUS 4 core Intel i5 server, almost got the best performance of
tool http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c

[root@localhost ~]# time ./pipe-test-1m

real 0m7.704s
user 0m0.047s
sys 0m4.815s
[root@localhost ~]# time ./pipe-test-1m

real 0m8.000s
user 0m0.071s
sys 0m5.035s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.386s
user 0m0.086s
sys 0m4.591s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.919s
user 0m0.064s
sys 0m4.912s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.949s
user 0m0.083s
sys 0m4.917s

[root@localhost ~]# time ./pipe-test-1m
rrr
real 0m7.913s
user 0m0.070s
sys 0m4.903s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.953s
user 0m0.092s
sys 0m4.881s
[root@localhost ~]# time ./pipe-test-1m

real 0m8.059s
user 0m0.108s
sys 0m5.037s
[root@localhost ~]#

Then compiled and boot stable 3.11.0-rc3 with default configuration, redid the same test. got very bad performance:
root@localhost ~]# uname -a
Linux localhost 3.11.0-rc3 #4 SMP Wed Jul 31 16:10:56 EDT 2013 x86_64 x86_64 x86_64 GNU/Linux

real 0m10.730s
user 0m0.245s
sys 0m6.596s
[root@localhost ~]# time ./pipe-test-1m

real 0m10.661s
user 0m0.218s
sys 0m6.520s
[root@localhost ~]# time ./pipe-test-1m

real 0m10.699s
user 0m0.233s
sys 0m6.534s
[root@localhost ~]# time ./pipe-test-1m

real 0m10.616s
user 0m0.191s
sys 0m6.505s
[root@localhost ~]# time ./pipe-test-1m

real 0m10.546s
user 0m0.214s
sys 0m6.441s

[root@localhost ~]# time ./pipe-test-1m

real 0m10.631s
user 0m0.204s
sys 0m6.509s

First 'tough' hacking is disable the reprogramming in _remove_hrtimer() within 3.11-rc3 code and redo the test.
much better.

root@localhost ~]# time ./pipe-test-1m

real 0m9.447s
user 0m0.227s
sys 0m5.900s
[root@localhost ~]# time ./pipe-test-1m

real 0m9.507s
user 0m0.226s
sys 0m5.922s
[root@localhost ~]# time ./pipe-test-1m

real 0m9.495s
user 0m0.228s
sys 0m5.916s
[root@localhost ~]# time ./pipe-test-1m

real 0m9.470s
user 0m0.229s
sys 0m5.938s
[root@localhost ~]# time ./pipe-test-1m

real 0m9.484s
user 0m0.269s
sys 0m5.875s
[root@localhost ~]# time ./pipe-test-1m

real 0m9.328s
user 0m0.242s
sys 0m5.767s

While I monitor the wake-up with powertop, got
Top causes for wakeups:
98.5% ( inf) <kernel IPI> : Rescheduling interrupts
0.5% ( inf) swapper/3 : hrtimer_start_range_ns (tick_sched_timer)
0.3% ( inf) swapper/2 : hrtimer_start_range_ns (tick_sched_timer)
0.2% ( inf) swapper/1 : hrtimer_start_range_ns (tick_sched_timer)
0.2% ( inf) swapper/0 : hrtimer_start_range_ns (tick_sched_timer)

So I did the second tough hacking, commented out the rescheduling IPI sending in following function and re-did the test.

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4137890..c27f04f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -137,7 +137,7 @@ static inline void play_dead(void)

static inline void smp_send_reschedule(int cpu)
{
- smp_ops.smp_send_reschedule(cpu);
+ /* smp_ops.smp_send_reschedule(cpu); */
}

Got the performance as best as 2.6.32 kernel and the scheduling seems also OK.

root@localhost ~]# time ./pipe-test-1m

real 0m7.661s
user 0m0.179s
sys 0m4.880s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.473s
user 0m0.189s
sys 0m4.782s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.658s
user 0m0.195s
sys 0m4.899s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.644s
user 0m0.194s
sys 0m4.941s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.694s
user 0m0.189s
sys 0m4.925s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.694s
user 0m0.197s
sys 0m4.915s
[root@localhost ~]# time ./pipe-test-1m

real 0m7.597s
user 0m0.190s
sys 0m4.886s

The the two processes of pipe-test-1m and its child seem could be balanced from cpu0 to cpu3 well,
#top
f J
14888 root 20 0 68 0 R 73.2 0.0 0:03.22 2 pip1m
14887 root 20 0 284 224 S 63.4 0.0 0:03.23 0 pip1m

And so the above tough hacking and test basicly show the No.1 expensive thing is the rescheduling IPI, and
the No.2 expensive thing is the extra hrtimer reprogramming/tick in Linux 3.11-rc3 code.
We need manage to do as less as possible rescheduling IPI and reprogramming to get better performance.
Does it(the tough hacking and the test) make sense ? and the result rational ?

Thanks,
Ethan

?? 2013-7-30??????7:59??Peter Zijlstra <[email protected]> д????

> On Tue, Jul 30, 2013 at 07:44:03PM +0800, Ethan Zhao wrote:
>> Got it.
>> what tglx and you mean
>>
>>
>> So the expensive thing maybe not inside the schedule(), but could
>> outside the scheduler(), the more bigger forever loop.
>>
>> This is one part of what I am facing.
>
> Right, so it would be good if you could further diagnose the problem so
> we can come up with a solution that cures the problem while retaining
> the current 'desired' properties.
>
> The patch you pinpointed caused a regression in that it would wake from
> NOHZ mode far too often. Could it be that the now longer idle sections
> cause your CPU to go into deeper idle modes and you're suffering from
> idle-exit latencies?

2013-08-06 07:29:13

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Tue, 2013-07-30 at 11:35 +0200, Peter Zijlstra wrote:

> It would be good if you could do what Thomas suggested and look at which
> timer is actually active during your workload.

Rebuilding regression test trees, some pipe-test results...

I'm missing mwait_idle() rather a lot on Q6600, and at 3.8, E5620 took a
severe NOHZ drubbing from the menu governor.

pipe-test, scheduling cross core

NOTE: nohz is throttled here (patchlet below), as to not eat horrible
microidle cost, see E5620 v3.7.10-nothrottle below.

Q6600
v3.8.13 500.6 KHz 1.000
v3.9.11 422.4 KHz .843
v3.10.4 420.2 KHz .839
v3.11-rc3-4-g36f571e 404.7 KHz .808

Q6600 3.9 regression:
guilty party is 69fb3676 x86 idle: remove mwait_idle() and "idle=mwait" cmdline param
halt sucks, HTH does one activate mwait_idle_with_hints() [processor_idle()] for core2 boxen?

E5620 +write 0 -> /dev/cpu_dma_latency, hold open
v3.7.10 578.5 KHz 1.000 675.4 KHz 1.000
v3.7.10-nothrottle 366.7 KHz .633 395.0 KHz .584
v3.8.13 468.3 KHz .809 690.0 KHz 1.021
v3.8.13 idle=mwait 595.1 KHz 1.028 NA
v3.9.11 462.0 KHz .798 691.1 KHz 1.023
v3.10.4 419.4 KHz .724 570.8 KHz .845
v3.11-rc3-4-g36f571e 400.1 KHz .691 538.5 KHz .797

E5620 3.8 regression:
guilty party: 69a37bea cpuidle: Quickly notice prediction failure for repeat mode

Q6600 (2.4 GHz core2 quad)
v3.11-rc3-4-g36f571e v3.8.13
7.97% [k] reschedule_interrupt 8.63% [k] __schedule
6.27% [k] __schedule 6.07% [k] native_sched_clock
4.74% [k] native_sched_clock 4.96% [k] system_call
4.23% [k] _raw_spin_lock_irqsave 4.30% [k] _raw_spin_lock_irqsave
3.39% [k] system_call 4.06% [k] resched_task
2.89% [k] sched_clock_local 3.44% [k] sched_clock_local
2.79% [k] mutex_lock 3.39% [k] pipe_read
2.57% [k] pipe_read 3.21% [k] mutex_lock
2.55% [k] __switch_to 2.98% [k] read_tsc
2.24% [k] read_tsc 2.87% [k] __switch_to

E5620 (2.4 GHz Westmere quad)
v3.7.10 v3.7.10-nothrottle v3.7.10-nothrottle
8.01% [k] __schedule 25.80% [k] _raw_spin_unlock_irqrestore 21.80% [k] _raw_spin_unlock_irqrestore
4.49% [k] resched_tas 4.64% [k] __hrtimer_start_range_ns - _raw_spin_unlock_irqrestore
3.94% [k] mutex_lock 4.62% [k] timerqueue_add + 37.94% __hrtimer_start_range_ns
3.44% [k] __switch_to 4.54% [k] __schedule 19.69% hrtimer_cancel
3.18% [k] menu_select 2.84% [k] enqueue_hrtimer tick_nohz_restart
3.05% [k] copy_user_generic_string 2.64% [k] resched_task tick_nohz_idle_exit
3.02% [k] task_waking_fair 2.29% [k] _raw_spin_lock_irqsave cpu_idle
2.91% [k] mutex_unlock 2.28% [k] mutex_lock start_secondary
2.82% [k] pipe_read 1.96% [k] __switch_to + 16.05% hrtimer_start_range_ns
2.32% [k] ktime_get_real 1.73% [k] menu_select 15.46% hrtimer_start
tick_nohz_stop_sched_tick
__tick_nohz_idle_enter
tick_nohz_idle_enter
cpu_idle
start_secondary
6.37% hrtimer_try_to_cancel
hrtimer_cancel
tick_nohz_restart
tick_nohz_idle_exit
cpu_idle
start_secondary

v3.8.13 v3.8.13 idle=mwait v3.8.13 (throttled, but menu gov bites.. HARD)
23.16% [k] _raw_spin_unlock_irqrestore 8.35% [k] __schedule - 22.91% [k] _raw_spin_unlock_irqrestore
4.93% [k] __schedule 6.49% [k] __switch_to - _raw_spin_unlock_irqrestore
3.42% [k] resched_task 5.71% [k] resched_task - 47.26% hrtimer_try_to_cancel
3.27% [k] __switch_to 4.64% [k] mutex_lock hrtimer_cancel
3.05% [k] mutex_lock 3.48% [k] copy_user_generic_string menu_hrtimer_cancel
2.32% [k] copy_user_generic_string 3.15% [k] task_waking_fair tick_nohz_idle_exit
2.30% [k] _raw_spin_lock_irqsave 3.13% [k] pipe_read cpu_idle
2.15% [k] pipe_read 2.61% [k] mutex_unlock start_secondary
2.15% [k] task_waking_fair 2.54% [k] finish_task_switch - 40.01% __hrtimer_start_range_ns
2.08% [k] ktime_get 2.29% [k] _raw_spin_lock_irqsave hrtimer_start
1.87% [k] mutex_unlock 1.91% [k] idle_cpu menu_select
1.76% [k] finish_task_switch 1.84% [k] __wake_up_common cpuidle_idle_call
cpu_idle
start_secondary

v3.9.11
18.67% [k] _raw_spin_unlock_irqrestore
4.36% [k] __schedule
3.66% [k] __switch_to
3.13% [k] mutex_lock
2.97% [k] __hrtimer_start_range_ns
2.69% [k] _raw_spin_lock_irqsave
2.38% [k] copy_user_generic_string
2.34% [k] hrtimer_reprogram.isra.32
2.34% [k] task_waking_fair
2.25% [k] ktime_get
2.14% [k] pipe_read
1.98% [k] menu_select

v3.10.4
20.42% [k] _raw_spin_unlock_irqrestore
4.75% [k] __schedule
4.42% [k] reschedule_interrupt <== appears in 3.10, guilty party as yet unknown
3.52% [k] __switch_to
3.27% [k] resched_task
2.64% [k] cpuidle_enter_state
2.63% [k] _raw_spin_lock_irqsave
2.04% [k] copy_user_generic_string
2.00% [k] cpu_idle_loop
1.97% [k] mutex_lock
1.90% [k] ktime_get
1.75% [k] task_waking_fair

v3.11-rc3-4-g36f571e
18.96% [k] _raw_spin_unlock_irqrestore
4.84% [k] __schedule
4.69% [k] reschedule_interrupt
3.75% [k] __switch_to
2.62% [k] _raw_spin_lock_irqsave
2.43% [k] cpuidle_enter_state
2.28% [k] resched_task
2.20% [k] cpu_idle_loop
1.97% [k] copy_user_generic_string
1.88% [k] ktime_get
1.81% [k] task_waking_fair
1.75% [k] mutex_lock

sched: ratelimit nohz

Entering nohz code on every micro-idle is too expensive to bear.

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 5 +++++
kernel/sched/core.c | 5 +++++
kernel/time/tick-sched.c | 2 +-
3 files changed, 11 insertions(+), 1 deletion(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -235,9 +235,14 @@ extern int runqueue_is_locked(int cpu);
extern void nohz_balance_enter_idle(int cpu);
extern void set_cpu_sd_state_idle(void);
extern int get_nohz_timer_target(void);
+extern int sched_needs_cpu(int cpu);
#else
static inline void nohz_balance_enter_idle(int cpu) { }
static inline void set_cpu_sd_state_idle(void) { }
+static inline int sched_needs_cpu(int cpu)
+{
+ return 0;
+}
#endif

/*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -650,6 +650,11 @@ static inline bool got_nohz_idle_kick(vo
return false;
}

+int sched_needs_cpu(int cpu)
+{
+ return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
+}
+
#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
time_delta = timekeeping_max_deferment();
} while (read_seqretry(&jiffies_lock, seq));

- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;

2013-08-06 07:47:03

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

(CCs Intel folks)

On Tue, 2013-08-06 at 09:29 +0200, Mike Galbraith wrote:
> On Tue, 2013-07-30 at 11:35 +0200, Peter Zijlstra wrote:
>
> > It would be good if you could do what Thomas suggested and look at which
> > timer is actually active during your workload.
>
> Rebuilding regression test trees, some pipe-test results...
>
> I'm missing mwait_idle() rather a lot on Q6600, and at 3.8, E5620 took a
> severe NOHZ drubbing from the menu governor.
>
> pipe-test, scheduling cross core
>
> NOTE: nohz is throttled here (patchlet below), as to not eat horrible
> microidle cost, see E5620 v3.7.10-nothrottle below.
>
> Q6600
> v3.8.13 500.6 KHz 1.000
> v3.9.11 422.4 KHz .843
> v3.10.4 420.2 KHz .839
> v3.11-rc3-4-g36f571e 404.7 KHz .808
>
> Q6600 3.9 regression:
> guilty party is 69fb3676 x86 idle: remove mwait_idle() and "idle=mwait" cmdline param
> halt sucks, HTH does one activate mwait_idle_with_hints() [processor_idle()] for core2 boxen?
>
> E5620 +write 0 -> /dev/cpu_dma_latency, hold open
> v3.7.10 578.5 KHz 1.000 675.4 KHz 1.000
> v3.7.10-nothrottle 366.7 KHz .633 395.0 KHz .584
> v3.8.13 468.3 KHz .809 690.0 KHz 1.021
> v3.8.13 idle=mwait 595.1 KHz 1.028 NA
> v3.9.11 462.0 KHz .798 691.1 KHz 1.023
> v3.10.4 419.4 KHz .724 570.8 KHz .845
> v3.11-rc3-4-g36f571e 400.1 KHz .691 538.5 KHz .797
>
> E5620 3.8 regression:
> guilty party: 69a37bea cpuidle: Quickly notice prediction failure for repeat mode
>
>
> Q6600 (2.4 GHz core2 quad)
> v3.11-rc3-4-g36f571e v3.8.13
> 7.97% [k] reschedule_interrupt 8.63% [k] __schedule
> 6.27% [k] __schedule 6.07% [k] native_sched_clock
> 4.74% [k] native_sched_clock 4.96% [k] system_call
> 4.23% [k] _raw_spin_lock_irqsave 4.30% [k] _raw_spin_lock_irqsave
> 3.39% [k] system_call 4.06% [k] resched_task
> 2.89% [k] sched_clock_local 3.44% [k] sched_clock_local
> 2.79% [k] mutex_lock 3.39% [k] pipe_read
> 2.57% [k] pipe_read 3.21% [k] mutex_lock
> 2.55% [k] __switch_to 2.98% [k] read_tsc
> 2.24% [k] read_tsc 2.87% [k] __switch_to
>
>
> E5620 (2.4 GHz Westmere quad)
> v3.7.10 v3.7.10-nothrottle v3.7.10-nothrottle
> 8.01% [k] __schedule 25.80% [k] _raw_spin_unlock_irqrestore 21.80% [k] _raw_spin_unlock_irqrestore
> 4.49% [k] resched_tas 4.64% [k] __hrtimer_start_range_ns - _raw_spin_unlock_irqrestore
> 3.94% [k] mutex_lock 4.62% [k] timerqueue_add + 37.94% __hrtimer_start_range_ns
> 3.44% [k] __switch_to 4.54% [k] __schedule 19.69% hrtimer_cancel
> 3.18% [k] menu_select 2.84% [k] enqueue_hrtimer tick_nohz_restart
> 3.05% [k] copy_user_generic_string 2.64% [k] resched_task tick_nohz_idle_exit
> 3.02% [k] task_waking_fair 2.29% [k] _raw_spin_lock_irqsave cpu_idle
> 2.91% [k] mutex_unlock 2.28% [k] mutex_lock start_secondary
> 2.82% [k] pipe_read 1.96% [k] __switch_to + 16.05% hrtimer_start_range_ns
> 2.32% [k] ktime_get_real 1.73% [k] menu_select 15.46% hrtimer_start
> tick_nohz_stop_sched_tick
> __tick_nohz_idle_enter
> tick_nohz_idle_enter
> cpu_idle
> start_secondary
> 6.37% hrtimer_try_to_cancel
> hrtimer_cancel
> tick_nohz_restart
> tick_nohz_idle_exit
> cpu_idle
> start_secondary
>
> v3.8.13 v3.8.13 idle=mwait v3.8.13 (throttled, but menu gov bites.. HARD)
> 23.16% [k] _raw_spin_unlock_irqrestore 8.35% [k] __schedule - 22.91% [k] _raw_spin_unlock_irqrestore
> 4.93% [k] __schedule 6.49% [k] __switch_to - _raw_spin_unlock_irqrestore
> 3.42% [k] resched_task 5.71% [k] resched_task - 47.26% hrtimer_try_to_cancel
> 3.27% [k] __switch_to 4.64% [k] mutex_lock hrtimer_cancel
> 3.05% [k] mutex_lock 3.48% [k] copy_user_generic_string menu_hrtimer_cancel
> 2.32% [k] copy_user_generic_string 3.15% [k] task_waking_fair tick_nohz_idle_exit
> 2.30% [k] _raw_spin_lock_irqsave 3.13% [k] pipe_read cpu_idle
> 2.15% [k] pipe_read 2.61% [k] mutex_unlock start_secondary
> 2.15% [k] task_waking_fair 2.54% [k] finish_task_switch - 40.01% __hrtimer_start_range_ns
> 2.08% [k] ktime_get 2.29% [k] _raw_spin_lock_irqsave hrtimer_start
> 1.87% [k] mutex_unlock 1.91% [k] idle_cpu menu_select
> 1.76% [k] finish_task_switch 1.84% [k] __wake_up_common cpuidle_idle_call
> cpu_idle
> start_secondary
>
> v3.9.11
> 18.67% [k] _raw_spin_unlock_irqrestore
> 4.36% [k] __schedule
> 3.66% [k] __switch_to
> 3.13% [k] mutex_lock
> 2.97% [k] __hrtimer_start_range_ns
> 2.69% [k] _raw_spin_lock_irqsave
> 2.38% [k] copy_user_generic_string
> 2.34% [k] hrtimer_reprogram.isra.32
> 2.34% [k] task_waking_fair
> 2.25% [k] ktime_get
> 2.14% [k] pipe_read
> 1.98% [k] menu_select
>
> v3.10.4
> 20.42% [k] _raw_spin_unlock_irqrestore
> 4.75% [k] __schedule
> 4.42% [k] reschedule_interrupt <== appears in 3.10, guilty party as yet unknown
> 3.52% [k] __switch_to
> 3.27% [k] resched_task
> 2.64% [k] cpuidle_enter_state
> 2.63% [k] _raw_spin_lock_irqsave
> 2.04% [k] copy_user_generic_string
> 2.00% [k] cpu_idle_loop
> 1.97% [k] mutex_lock
> 1.90% [k] ktime_get
> 1.75% [k] task_waking_fair
>
> v3.11-rc3-4-g36f571e
> 18.96% [k] _raw_spin_unlock_irqrestore
> 4.84% [k] __schedule
> 4.69% [k] reschedule_interrupt
> 3.75% [k] __switch_to
> 2.62% [k] _raw_spin_lock_irqsave
> 2.43% [k] cpuidle_enter_state
> 2.28% [k] resched_task
> 2.20% [k] cpu_idle_loop
> 1.97% [k] copy_user_generic_string
> 1.88% [k] ktime_get
> 1.81% [k] task_waking_fair
> 1.75% [k] mutex_lock
>
> sched: ratelimit nohz
>
> Entering nohz code on every micro-idle is too expensive to bear.
>
> Signed-off-by: Mike Galbraith <[email protected]>
>
> ---
> include/linux/sched.h | 5 +++++
> kernel/sched/core.c | 5 +++++
> kernel/time/tick-sched.c | 2 +-
> 3 files changed, 11 insertions(+), 1 deletion(-)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -235,9 +235,14 @@ extern int runqueue_is_locked(int cpu);
> extern void nohz_balance_enter_idle(int cpu);
> extern void set_cpu_sd_state_idle(void);
> extern int get_nohz_timer_target(void);
> +extern int sched_needs_cpu(int cpu);
> #else
> static inline void nohz_balance_enter_idle(int cpu) { }
> static inline void set_cpu_sd_state_idle(void) { }
> +static inline int sched_needs_cpu(int cpu)
> +{
> + return 0;
> +}
> #endif
>
> /*
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -650,6 +650,11 @@ static inline bool got_nohz_idle_kick(vo
> return false;
> }
>
> +int sched_needs_cpu(int cpu)
> +{
> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
> +}
> +
> #else /* CONFIG_NO_HZ_COMMON */
>
> static inline bool got_nohz_idle_kick(void)
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
> time_delta = timekeeping_max_deferment();
> } while (read_seqretry(&jiffies_lock, seq));
>
> - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
> next_jiffies = last_jiffies + 1;
> delta_jiffies = 1;
>
>

2013-08-07 08:25:51

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Tue, 2013-08-06 at 09:29 +0200, Mike Galbraith wrote:

...

> E5620 +write 0 -> /dev/cpu_dma_latency, hold open
> v3.7.10 578.5 KHz 1.000 675.4 KHz 1.000
> v3.7.10-nothrottle 366.7 KHz .633 395.0 KHz .584
> v3.8.13 468.3 KHz .809 690.0 KHz 1.021
> v3.8.13 idle=mwait 595.1 KHz 1.028 NA
> v3.9.11 462.0 KHz .798 691.1 KHz 1.023
> v3.10.4 419.4 KHz .724 570.8 KHz .845
> v3.11-rc3-4-g36f571e 400.1 KHz .691 538.5 KHz .797

Adds Peter's patch, re-tests master/E5620, nohz throttled/unthrottled..

v3.11-rc4-27-ge4ef108 481.9 KHz .833 1.000 throttle (400->481 36f571e..e4ef108? spinlock overhead gone)
v3.11-rc4-27-ge4ef108 496.7 KHz .858 1.030 throttle+peterz (spinlock overhead gone)

v3.11-rc4-27-ge4ef108 340.0 KHz .587 1.000 nothrottle (spinlock overhead present)
v3.11-rc4-27-ge4ef108 440.7 KHz .761 1.296 nothrottle+peterz (spinlock overhead gone)

E5620 (2.4 GHz Westmere) nothrottle

v3.11-rc4-27-ge4ef108 v3.11-rc4-27-ge4ef108+peterz
17.48% [k] _raw_spin_unlock_irqrestore 5.37% [k] __schedule
11.36% [k] __hrtimer_start_range_ns 4.96% [k] reschedule_interrupt
3.78% [k] __schedule 3.82% [k] _raw_spin_lock_irqsave
3.54% [k] reschedule_interrupt 3.62% [k] __switch_to
2.81% [k] __switch_to 3.05% [k] cpuidle_enter_state
2.64% [k] _raw_spin_lock_irqsave 2.99% [k] resched_task
2.14% [k] cpuidle_enter_state 2.39% [k] mutex_lock
1.97% [k] resched_task 2.31% [k] copy_user_generic_string
1.68% [k] copy_user_generic_string 2.23% [k] cpu_idle_loop
1.68% [k] cpu_idle_loop 2.20% [k] task_waking_fair

E5620 (2.4 GHz Westmere) throttle

v3.11-rc4-27-ge4ef108 v3.11-rc4-27-ge4ef108+peterz
6.76% [k] __schedule 7.68% [k] reschedule_interrupt (find this little bastard)
5.90% [k] reschedule_interrupt 7.18% [k] __schedule
4.12% [k] __switch_to 4.52% [k] __switch_to
3.58% [k] resched_tas 4.24% [k] cpuidle_enter_state
3.24% [k] cpuidle_enter_state 2.96% [k] cpu_idle_loop
3.01% [k] _raw_spin_lock_irqsave 2.96% [k] resched_task
2.67% [k] cpu_idle_loop 2.79% [k] _raw_spin_lock_irqsave
2.49% [k] task_waking_fair 2.45% [k] ktime_get
2.38% [k] copy_user_generic_string 2.34% [k] copy_user_generic_string
2.27% [k] mutex_lock 2.32% [k] get_typical_interval

And below is peterz..

On Mon, Jul 29, 2013 at 12:18:11PM +0200, Thomas Gleixner wrote:

> The reason why we want to do that is:
>
> timer expiry time
> A 100us -> programmed hardware event
> B 2000us
>
> Restart timer A with an expiry time of 3000us without reprogramming:
>
> timer expiry time
> NONE 100us -> programmed hardware event
> B 2000us
> A 3000us
>
> We take an extra wakeup for reprogramming the hardware, which is
> counterproductive for power saving. So disabling it blindly will cause
> a regresssion for other people. We need to be smarter than that.

So aside from the complete mess posted; how about something like this?

*completely untested etc..*

mike: Builds(now)/boots/makes E5620 happier.

---
include/linux/hrtimer.h | 5 ++++
kernel/hrtimer.c | 60 ++++++++++++++++++++++--------------------------
2 files changed, 33 insertions(+), 32 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -185,6 +185,7 @@ struct hrtimer_cpu_base {
ktime_t expires_next;
int hres_active;
int hang_detected;
+ int timers_removed;
unsigned long nr_events;
unsigned long nr_retries;
unsigned long nr_hangs;
@@ -261,6 +262,8 @@ static inline ktime_t hrtimer_expires_re
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;

+extern void hrtimer_enter_idle(void);
+
extern void hrtimer_interrupt(struct clock_event_device *dev);

/*
@@ -296,6 +299,8 @@ extern void clock_was_set_delayed(void);
# define MONOTONIC_RES_NSEC LOW_RES_NSEC
# define KTIME_MONOTONIC_RES KTIME_LOW_RES

+static inline void hrtimer_enter_idle(void) { }
+
static inline void hrtimer_peek_ahead_timers(void) { }

/*
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -673,6 +673,27 @@ static inline ktime_t hrtimer_update_bas
return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
}

+static void __hrtimer_reprogram_all(struct hrtimer_cpu_base *base)
+{
+ if (!hrtimer_hres_active())
+ return;
+
+ raw_spin_lock(&base->lock);
+ hrtimer_update_base(base);
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+void hrtimer_enter_idle(void)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+
+ if (base->timers_removed) {
+ base->timers_removed = 0;
+ __hrtimer_reprogram_all(base);
+ }
+}
+
/*
* Retrigger next event is called after clock was set
*
@@ -682,13 +703,7 @@ static void retrigger_next_event(void *a
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);

- if (!hrtimer_hres_active())
- return;
-
- raw_spin_lock(&base->lock);
- hrtimer_update_base(base);
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
+ __hrtimer_reprogram_all(base);
}

/*
@@ -907,7 +922,7 @@ static int enqueue_hrtimer(struct hrtime
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ unsigned long newstate)
{
struct timerqueue_node *next_timer;
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
@@ -915,19 +930,10 @@ static void __remove_hrtimer(struct hrti

next_timer = timerqueue_getnext(&base->active);
timerqueue_del(&base->active, &timer->node);
- if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
- /* Reprogram the clock event device. if enabled */
- if (reprogram && hrtimer_hres_active()) {
- ktime_t expires;
-
- expires = ktime_sub(hrtimer_get_expires(timer),
- base->offset);
- if (base->cpu_base->expires_next.tv64 == expires.tv64)
- hrtimer_force_reprogram(base->cpu_base, 1);
- }
+ if (hrtimer_hres_active() && &timer->node == next_timer)
+ base->cpu_base->timers_removed++;
#endif
- }
if (!timerqueue_getnext(&base->active))
base->cpu_base->active_bases &= ~(1 << base->index);
out:
@@ -942,26 +948,16 @@ remove_hrtimer(struct hrtimer *timer, st
{
if (hrtimer_is_queued(timer)) {
unsigned long state;
- int reprogram;

- /*
- * Remove the timer and force reprogramming when high
- * resolution mode is active and the timer is on the current
- * CPU. If we remove a timer on another CPU, reprogramming is
- * skipped. The interrupt event on this CPU is fired and
- * reprogramming happens in the interrupt handler. This is a
- * rare case and less expensive than a smp call.
- */
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
- reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
/*
* We must preserve the CALLBACK state flag here,
* otherwise we could move the timer base in
* switch_hrtimer_base.
*/
state = timer->state & HRTIMER_STATE_CALLBACK;
- __remove_hrtimer(timer, base, state, reprogram);
+ __remove_hrtimer(timer, base, state);
return 1;
}
return 0;
@@ -1243,7 +1239,7 @@ static void __run_hrtimer(struct hrtimer
WARN_ON(!irqs_disabled());

debug_deactivate(timer);
- __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+ __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK);
timer_stats_account_hrtimer(timer);
fn = timer->function;

@@ -1690,7 +1686,7 @@ static void migrate_hrtimer_list(struct
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not

2013-08-08 04:05:55

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Wed, 2013-08-07 at 10:25 +0200, Mike Galbraith wrote:

> E5620 (2.4 GHz Westmere) throttle
>
> v3.11-rc4-27-ge4ef108 v3.11-rc4-27-ge4ef108+peterz
> 6.76% [k] __schedule 7.68% [k] reschedule_interrupt (find this little bastard)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 5.90% [k] reschedule_interrupt 7.18% [k] __schedule
> 4.12% [k] __switch_to 4.52% [k] __switch_to
> 3.58% [k] resched_tas 4.24% [k] cpuidle_enter_state
> 3.24% [k] cpuidle_enter_state 2.96% [k] cpu_idle_loop
> 3.01% [k] _raw_spin_lock_irqsave 2.96% [k] resched_task
> 2.67% [k] cpu_idle_loop 2.79% [k] _raw_spin_lock_irqsave
> 2.49% [k] task_waking_fair 2.45% [k] ktime_get
> 2.38% [k] copy_user_generic_string 2.34% [k] copy_user_generic_string
> 2.27% [k] mutex_lock 2.32% [k] get_typical_interval

7d1a941731fabf27e5fb6edbebb79fe856edb4e5 is the first bad commit
commit 7d1a941731fabf27e5fb6edbebb79fe856edb4e5
Author: Thomas Gleixner <[email protected]>
Date: Thu Mar 21 22:50:03 2013 +0100

x86: Use generic idle loop

So, in summary, seems we need to forget all about scheduling anything
remotely fast cross core these days, even on core2 with its lovely
shared L2, as otherwise we'll beat it all up. Boxen with no nohz
throttle maybe don't notice how bad all this progress hurt because
they're used to being in agony. Your patch (modulo any bustage, haven't
run any testsuite, only ran for regression hunt/test purposes) should
make smiles for those who notice large chunk of that pain going away.

My beloved ole Q6600 is currently not particularly happy, and that's
best case, shared L2, where nearly any dinky overlap used to be
reclaimable. Even with your patch, we'll still be losers at network
fast movers across the board on any box methinks.

-Mike

2013-08-08 04:32:06

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Hi, perter and Mike,

Some other test to verify the regression causes etc.
On an 4 core intel i5 Asus pc.
The pipe test.

1. default Bios configuration and default 3.11-rc3 kernel.

[root@localhost ~]# time ./pip1m

real 0m10.683s
user 0m0.204s
sys 0m6.597s
[root@localhost ~]# time ./pip1m

real 0m10.629s
user 0m0.185s
sys 0m6.546s
[root@localhost ~]# uname -a
Linux localhost 3.11.0-rc3 #4 SMP Wed Jul 31 16:10:56 EDT 2013 x86_64 x86_64 x86_64 GNU/Linux

2. same as 1 and idle=halt command line parameter.
[root@localhost ~]# time ./pip1m

real 0m9.904s
user 0m0.200s
sys 0m6.209s
[root@localhost ~]# time ./pip1m

real 0m9.972s
user 0m0.201s
sys 0m6.200s

3. same as 1 and idle=nomwait command line parameter
real 0m13.634s
user 0m0.407s
sys 0m7.820s
[root@localhost ~]# time ./pip1m

real 0m13.684s
user 0m0.416s
sys 0m7.845s

4. Disable C1E C3 C6 C-states and SpeedStep in BIOS, default configuration of kernel 3.11-rc3.
[root@localhost ~]# time ./pip1m

real 0m5.371s
user 0m0.102s
sys 0m3.253s
[root@localhost ~]# time ./pip1m

real 0m5.329s
user 0m0.075s
sys 0m3.254s
[root@localhost ~]#

5. same as 4 and comment out reschedule IPI sending
[root@localhost ~]# time ./pip1m
real 0m3.883s
user 0m0.098s
sys 0m2.480s
[root@localhost ~]# time ./pip1m

real 0m3.907s
user 0m0.070s
sys 0m2.552s

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4137890..c27f04f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -137,7 +137,7 @@ static inline void play_dead(void)

static inline void smp_send_reschedule(int cpu)
{
- smp_ops.smp_send_reschedule(cpu);
+ /* smp_ops.smp_send_reschedule(cpu); */
}

6. same as 5 and don't reprogram clock device in remove_hrtimer.
got same result as 5.
real 0m3.915s
user 0m0.086s
sys 0m2.499s
[root@localhost ~]# time ./pip1m

real 0m3.919s
user 0m0.110s
sys 0m2.509s

So when C-states disabled, no reprogramming of hrtimer wouldn't gain better performance.
But will get more wakup chances while C-states enabled if no reprogramming clock device.

Thanks,
Ethan

?? 2013-8-6??????3:46??Mike Galbraith <[email protected]> д????

> (CCs Intel folks)
>
> On Tue, 2013-08-06 at 09:29 +0200, Mike Galbraith wrote:
>> On Tue, 2013-07-30 at 11:35 +0200, Peter Zijlstra wrote:
>>
>>> It would be good if you could do what Thomas suggested and look at which
>>> timer is actually active during your workload.
>>
>> Rebuilding regression test trees, some pipe-test results...
>>
>> I'm missing mwait_idle() rather a lot on Q6600, and at 3.8, E5620 took a
>> severe NOHZ drubbing from the menu governor.
>>
>> pipe-test, scheduling cross core
>>
>> NOTE: nohz is throttled here (patchlet below), as to not eat horrible
>> microidle cost, see E5620 v3.7.10-nothrottle below.
>>
>> Q6600
>> v3.8.13 500.6 KHz 1.000
>> v3.9.11 422.4 KHz .843
>> v3.10.4 420.2 KHz .839
>> v3.11-rc3-4-g36f571e 404.7 KHz .808
>>
>> Q6600 3.9 regression:
>> guilty party is 69fb3676 x86 idle: remove mwait_idle() and "idle=mwait" cmdline param
>> halt sucks, HTH does one activate mwait_idle_with_hints() [processor_idle()] for core2 boxen?
>>
>> E5620 +write 0 -> /dev/cpu_dma_latency, hold open
>> v3.7.10 578.5 KHz 1.000 675.4 KHz 1.000
>> v3.7.10-nothrottle 366.7 KHz .633 395.0 KHz .584
>> v3.8.13 468.3 KHz .809 690.0 KHz 1.021
>> v3.8.13 idle=mwait 595.1 KHz 1.028 NA
>> v3.9.11 462.0 KHz .798 691.1 KHz 1.023
>> v3.10.4 419.4 KHz .724 570.8 KHz .845
>> v3.11-rc3-4-g36f571e 400.1 KHz .691 538.5 KHz .797
>>
>> E5620 3.8 regression:
>> guilty party: 69a37bea cpuidle: Quickly notice prediction failure for repeat mode
>>
>>
>> Q6600 (2.4 GHz core2 quad)
>> v3.11-rc3-4-g36f571e v3.8.13
>> 7.97% [k] reschedule_interrupt 8.63% [k] __schedule
>> 6.27% [k] __schedule 6.07% [k] native_sched_clock
>> 4.74% [k] native_sched_clock 4.96% [k] system_call
>> 4.23% [k] _raw_spin_lock_irqsave 4.30% [k] _raw_spin_lock_irqsave
>> 3.39% [k] system_call 4.06% [k] resched_task
>> 2.89% [k] sched_clock_local 3.44% [k] sched_clock_local
>> 2.79% [k] mutex_lock 3.39% [k] pipe_read
>> 2.57% [k] pipe_read 3.21% [k] mutex_lock
>> 2.55% [k] __switch_to 2.98% [k] read_tsc
>> 2.24% [k] read_tsc 2.87% [k] __switch_to
>>
>>
>> E5620 (2.4 GHz Westmere quad)
>> v3.7.10 v3.7.10-nothrottle v3.7.10-nothrottle
>> 8.01% [k] __schedule 25.80% [k] _raw_spin_unlock_irqrestore 21.80% [k] _raw_spin_unlock_irqrestore
>> 4.49% [k] resched_tas 4.64% [k] __hrtimer_start_range_ns - _raw_spin_unlock_irqrestore
>> 3.94% [k] mutex_lock 4.62% [k] timerqueue_add + 37.94% __hrtimer_start_range_ns
>> 3.44% [k] __switch_to 4.54% [k] __schedule 19.69% hrtimer_cancel
>> 3.18% [k] menu_select 2.84% [k] enqueue_hrtimer tick_nohz_restart
>> 3.05% [k] copy_user_generic_string 2.64% [k] resched_task tick_nohz_idle_exit
>> 3.02% [k] task_waking_fair 2.29% [k] _raw_spin_lock_irqsave cpu_idle
>> 2.91% [k] mutex_unlock 2.28% [k] mutex_lock start_secondary
>> 2.82% [k] pipe_read 1.96% [k] __switch_to + 16.05% hrtimer_start_range_ns
>> 2.32% [k] ktime_get_real 1.73% [k] menu_select 15.46% hrtimer_start
>> tick_nohz_stop_sched_tick
>> __tick_nohz_idle_enter
>> tick_nohz_idle_enter
>> cpu_idle
>> start_secondary
>> 6.37% hrtimer_try_to_cancel
>> hrtimer_cancel
>> tick_nohz_restart
>> tick_nohz_idle_exit
>> cpu_idle
>> start_secondary
>>
>> v3.8.13 v3.8.13 idle=mwait v3.8.13 (throttled, but menu gov bites.. HARD)
>> 23.16% [k] _raw_spin_unlock_irqrestore 8.35% [k] __schedule - 22.91% [k] _raw_spin_unlock_irqrestore
>> 4.93% [k] __schedule 6.49% [k] __switch_to - _raw_spin_unlock_irqrestore
>> 3.42% [k] resched_task 5.71% [k] resched_task - 47.26% hrtimer_try_to_cancel
>> 3.27% [k] __switch_to 4.64% [k] mutex_lock hrtimer_cancel
>> 3.05% [k] mutex_lock 3.48% [k] copy_user_generic_string menu_hrtimer_cancel
>> 2.32% [k] copy_user_generic_string 3.15% [k] task_waking_fair tick_nohz_idle_exit
>> 2.30% [k] _raw_spin_lock_irqsave 3.13% [k] pipe_read cpu_idle
>> 2.15% [k] pipe_read 2.61% [k] mutex_unlock start_secondary
>> 2.15% [k] task_waking_fair 2.54% [k] finish_task_switch - 40.01% __hrtimer_start_range_ns
>> 2.08% [k] ktime_get 2.29% [k] _raw_spin_lock_irqsave hrtimer_start
>> 1.87% [k] mutex_unlock 1.91% [k] idle_cpu menu_select
>> 1.76% [k] finish_task_switch 1.84% [k] __wake_up_common cpuidle_idle_call
>> cpu_idle
>> start_secondary
>>
>> v3.9.11
>> 18.67% [k] _raw_spin_unlock_irqrestore
>> 4.36% [k] __schedule
>> 3.66% [k] __switch_to
>> 3.13% [k] mutex_lock
>> 2.97% [k] __hrtimer_start_range_ns
>> 2.69% [k] _raw_spin_lock_irqsave
>> 2.38% [k] copy_user_generic_string
>> 2.34% [k] hrtimer_reprogram.isra.32
>> 2.34% [k] task_waking_fair
>> 2.25% [k] ktime_get
>> 2.14% [k] pipe_read
>> 1.98% [k] menu_select
>>
>> v3.10.4
>> 20.42% [k] _raw_spin_unlock_irqrestore
>> 4.75% [k] __schedule
>> 4.42% [k] reschedule_interrupt <== appears in 3.10, guilty party as yet unknown
>> 3.52% [k] __switch_to
>> 3.27% [k] resched_task
>> 2.64% [k] cpuidle_enter_state
>> 2.63% [k] _raw_spin_lock_irqsave
>> 2.04% [k] copy_user_generic_string
>> 2.00% [k] cpu_idle_loop
>> 1.97% [k] mutex_lock
>> 1.90% [k] ktime_get
>> 1.75% [k] task_waking_fair
>>
>> v3.11-rc3-4-g36f571e
>> 18.96% [k] _raw_spin_unlock_irqrestore
>> 4.84% [k] __schedule
>> 4.69% [k] reschedule_interrupt
>> 3.75% [k] __switch_to
>> 2.62% [k] _raw_spin_lock_irqsave
>> 2.43% [k] cpuidle_enter_state
>> 2.28% [k] resched_task
>> 2.20% [k] cpu_idle_loop
>> 1.97% [k] copy_user_generic_string
>> 1.88% [k] ktime_get
>> 1.81% [k] task_waking_fair
>> 1.75% [k] mutex_lock
>>
>> sched: ratelimit nohz
>>
>> Entering nohz code on every micro-idle is too expensive to bear.
>>
>> Signed-off-by: Mike Galbraith <[email protected]>
>>
>> ---
>> include/linux/sched.h | 5 +++++
>> kernel/sched/core.c | 5 +++++
>> kernel/time/tick-sched.c | 2 +-
>> 3 files changed, 11 insertions(+), 1 deletion(-)
>>
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -235,9 +235,14 @@ extern int runqueue_is_locked(int cpu);
>> extern void nohz_balance_enter_idle(int cpu);
>> extern void set_cpu_sd_state_idle(void);
>> extern int get_nohz_timer_target(void);
>> +extern int sched_needs_cpu(int cpu);
>> #else
>> static inline void nohz_balance_enter_idle(int cpu) { }
>> static inline void set_cpu_sd_state_idle(void) { }
>> +static inline int sched_needs_cpu(int cpu)
>> +{
>> + return 0;
>> +}
>> #endif
>>
>> /*
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -650,6 +650,11 @@ static inline bool got_nohz_idle_kick(vo
>> return false;
>> }
>>
>> +int sched_needs_cpu(int cpu)
>> +{
>> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
>> +}
>> +
>> #else /* CONFIG_NO_HZ_COMMON */
>>
>> static inline bool got_nohz_idle_kick(void)
>> --- a/kernel/time/tick-sched.c
>> +++ b/kernel/time/tick-sched.c
>> @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
>> time_delta = timekeeping_max_deferment();
>> } while (read_seqretry(&jiffies_lock, seq));
>>
>> - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>> + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>> arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
>> next_jiffies = last_jiffies + 1;
>> delta_jiffies = 1;
>>
>>
>
>

2013-08-08 05:29:17

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

(I got several copies of that, one would have been plenty. Some people
get all grumpy when they get repeats, and at least one, who shall remain
unnamed, gets even grumpier when folks don't trim their replies, so be
careful;)

On Thu, 2013-08-08 at 12:31 +0800, ethan.zhao wrote:

> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index 4137890..c27f04f 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -137,7 +137,7 @@ static inline void play_dead(void)
>
> static inline void smp_send_reschedule(int cpu)
> {
> - smp_ops.smp_send_reschedule(cpu);
> + /* smp_ops.smp_send_reschedule(cpu); */
> }

Hm. You're much braver than I am.

-Mike

2013-08-08 05:51:44

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Thu, 2013-08-08 at 07:29 +0200, Mike Galbraith wrote:

> On Thu, 2013-08-08 at 12:31 +0800, ethan.zhao wrote:
>
> > diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> > index 4137890..c27f04f 100644
> > --- a/arch/x86/include/asm/smp.h
> > +++ b/arch/x86/include/asm/smp.h
> > @@ -137,7 +137,7 @@ static inline void play_dead(void)
> >
> > static inline void smp_send_reschedule(int cpu)
> > {
> > - smp_ops.smp_send_reschedule(cpu);
> > + /* smp_ops.smp_send_reschedule(cpu); */
> > }
>
> Hm. You're much braver than I am.

BTW, according to my testing, Peter's patch should make your box a lot
happier. It won't make reschedule_interrupt crawl back under its rock,
but should improve your pipe-test numbers quite a bit.

Also note that pipe-test is really kinda silly cross core, it's really
good for measuring fastpath weight gain when pinned. When scheduled
cross core (we do that to cut latency for stuff that does real work), it
will always suck rocks, as it's 100% synchronous, there's nada to gain
by occupying two cores. (fortunately, real programs tend to do more
than play ping-pong with a microscopic ball;)

-Mike

2013-08-08 07:32:38

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Kernel 3.11-rc3 With peter's patch and disable C-state in BIOS, got 1 second better performance !
[root@localhost ~]# time ./pip1m

real 0m4.399s
user 0m0.092s
sys 0m2.775s
[root@localhost ~]# time ./pip1m

real 0m4.352s
user 0m0.099s
sys 0m2.751s
[root@localhost ~]# time ./pip1m

real 0m4.328s
user 0m0.102s
sys 0m2.731s

Compared with default kernel 3.11-rc3 and disable C-states in BIOS, test case 4
4. Disable C1E C3 C6 C-states and SpeedStep in BIOS, default configuration of kernel 3.11-rc3.
[root@localhost ~]# time ./pip1m

real 0m5.371s
user 0m0.102s
sys 0m3.253s
[root@localhost ~]# time ./pip1m

real 0m5.329s
user 0m0.075s
sys 0m3.254s

That is great improvement, So it is worth to merge.

Thanks
Ethan

?? 2013-7-29??????7:57??Peter Zijlstra <[email protected]> д????
>
> So aside from the complete mess posted; how about something like this?
>
> *completely untested etc..*
>
> ---
> include/linux/hrtimer.h | 5 +++++
> kernel/hrtimer.c | 60 +++++++++++++++++++++++--------------------------
> 2 files changed, 33 insertions(+), 32 deletions(-)
>
> diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
> index d19a5c2..f2bcb9c 100644
> --- a/include/linux/hrtimer.h
> +++ b/include/linux/hrtimer.h
> @@ -185,6 +185,7 @@ struct hrtimer_cpu_base {
> ktime_t expires_next;
> int hres_active;
> int hang_detected;
> + int timers_removed;
> unsigned long nr_events;
> unsigned long nr_retries;
> unsigned long nr_hangs;
> @@ -261,6 +262,8 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
> #ifdef CONFIG_HIGH_RES_TIMERS
> struct clock_event_device;
>
> +extern void hrtimer_enter_idle(void);
> +
> extern void hrtimer_interrupt(struct clock_event_device *dev);
>
> /*
> @@ -296,6 +299,8 @@ extern void clock_was_set_delayed(void);
> # define MONOTONIC_RES_NSEC LOW_RES_NSEC
> # define KTIME_MONOTONIC_RES KTIME_LOW_RES
>
> +static inline void hrtimer_enter_idle(void) { }
> +
> static inline void hrtimer_peek_ahead_timers(void) { }
>
> /*
> diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
> index f0f4fe2..ffb16d3 100644
> --- a/kernel/hrtimer.c
> +++ b/kernel/hrtimer.c
> @@ -673,6 +673,27 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
> return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
> }
>
> +static void __hrtimer_reprogram_all(struct hrtimer_cpu_base *base)
> +{
> + if (!hrtimer_hres_active())
> + return;
> +
> + raw_spin_lock(&base->lock);
> + hrtimer_update_base(base);
> + hrtimer_force_reprogram(base, 0);
> + raw_spin_unlock(&base->lock);
> +}
> +
> +void hrtimer_enter_idle(void)
> +{
> + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
> +
> + if (base->timers_removed) {
> + base->timers_removed = 0;
> + __hrtimer_reprogramm_all(base);
> + }
> +}
> +
> /*
> * Retrigger next event is called after clock was set
> *
> @@ -682,13 +703,7 @@ static void retrigger_next_event(void *arg)
> {
> struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
>
> - if (!hrtimer_hres_active())
> - return;
> -
> - raw_spin_lock(&base->lock);
> - hrtimer_update_base(base);
> - hrtimer_force_reprogram(base, 0);
> - raw_spin_unlock(&base->lock);
> + __hrtimer_reprogram_all(base);
> }
>
> /*
> @@ -907,7 +922,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
> */
> static void __remove_hrtimer(struct hrtimer *timer,
> struct hrtimer_clock_base *base,
> - unsigned long newstate, int reprogram)
> + unsigned long newstate)
> {
> struct timerqueue_node *next_timer;
> if (!(timer->state & HRTIMER_STATE_ENQUEUED))
> @@ -915,19 +930,10 @@ static void __remove_hrtimer(struct hrtimer *timer,
>
> next_timer = timerqueue_getnext(&base->active);
> timerqueue_del(&base->active, &timer->node);
> - if (&timer->node == next_timer) {
> #ifdef CONFIG_HIGH_RES_TIMERS
> - /* Reprogram the clock event device. if enabled */
> - if (reprogram && hrtimer_hres_active()) {
> - ktime_t expires;
> -
> - expires = ktime_sub(hrtimer_get_expires(timer),
> - base->offset);
> - if (base->cpu_base->expires_next.tv64 == expires.tv64)
> - hrtimer_force_reprogram(base->cpu_base, 1);
> - }
> + if (hrtimer_hres_active() && &timer->node == next_timer)
> + base->cpu_base->timers_removed++;
> #endif
> - }
> if (!timerqueue_getnext(&base->active))
> base->cpu_base->active_bases &= ~(1 << base->index);
> out:
> @@ -942,26 +948,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
> {
> if (hrtimer_is_queued(timer)) {
> unsigned long state;
> - int reprogram;
>
> - /*
> - * Remove the timer and force reprogramming when high
> - * resolution mode is active and the timer is on the current
> - * CPU. If we remove a timer on another CPU, reprogramming is
> - * skipped. The interrupt event on this CPU is fired and
> - * reprogramming happens in the interrupt handler. This is a
> - * rare case and less expensive than a smp call.
> - */
> debug_deactivate(timer);
> timer_stats_hrtimer_clear_start_info(timer);
> - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
> /*
> * We must preserve the CALLBACK state flag here,
> * otherwise we could move the timer base in
> * switch_hrtimer_base.
> */
> state = timer->state & HRTIMER_STATE_CALLBACK;
> - __remove_hrtimer(timer, base, state, reprogram);
> + __remove_hrtimer(timer, base, state);
> return 1;
> }
> return 0;
> @@ -1243,7 +1239,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
> WARN_ON(!irqs_disabled());
>
> debug_deactivate(timer);
> - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
> + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK);
> timer_stats_account_hrtimer(timer);
> fn = timer->function;
>
> @@ -1690,7 +1686,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
> * timer could be seen as !active and just vanish away
> * under us on another CPU
> */
> - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
> + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE);
> timer->base = new_base;
> /*
> * Enqueue the timers on the new cpu. This does not

2013-08-08 09:04:56

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Kernel 3.11-rc1 +Peterz' patch+ Mike's patch, No C-states in BIOS, got the same result as only Peter's patch.
Of course , No C states to enter, make sense.
[root@localhost ~]# time ./pip1m

real 0m4.381s
user 0m0.099s
sys 0m2.784s
[root@localhost ~]# time ./pip1m

real 0m4.436s
user 0m0.093s
sys 0m2.809s
[root@localhost ~]#

Retest with C-states enabled in BIOS
[root@localhost ~]# time ./pip1m

real 0m8.670s
user 0m0.203s
sys 0m5.459s
[root@localhost ~]# time ./pip1m

real 0m8.489s
user 0m0.184s
sys 0m5.360s
[root@localhost ~]#

So the result is Peter's patch working or Mike's ? Compared with default 3.11-rc3
result of test case 1 as following, looks great.
[root@localhost ~]# time ./pip1m

real 0m10.683s
user 0m0.204s
sys 0m6.597s
[root@localhost ~]# time ./pip1m

real 0m10.629s
user 0m0.185s
sys 0m6.546s

So revert Mike's patch and retest, got
[root@localhost ~]# time ./pip1m

real 0m8.606s
user 0m0.193s
sys 0m5.449s
[root@localhost ~]# time ./pip1m

real 0m8.655s
user 0m0.198s
sys 0m5.519s
[root@localhost ~]#

So, it's Peter's patch working??????

The result of kernel 3.11-rc3 + Peter's patch + no rescheduling IPI and no C-states in BIOS is :
[root@localhost ~]# time ./pip1m

real 0m3.915s
user 0m0.088s
sys 0m2.487s
[root@localhost ~]# time ./pip1m

real 0m3.929s
user 0m0.082s
sys 0m2.560s
[root@localhost ~]# time ./pip1m

Got about 0.5 sec better than only Peter's patch, but it is strange, only no rescheduling IPI almost got the
same result.

Thanks,
Ethan

?? 2013-8-8??????12:31??ethan.zhao <[email protected]> д????
>>>
>>> sched: ratelimit nohz
>>>
>>> Entering nohz code on every micro-idle is too expensive to bear.
>>>
>>> Signed-off-by: Mike Galbraith <[email protected]>
>>>
>>> ---
>>> include/linux/sched.h | 5 +++++
>>> kernel/sched/core.c | 5 +++++
>>> kernel/time/tick-sched.c | 2 +-
>>> 3 files changed, 11 insertions(+), 1 deletion(-)
>>>
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -235,9 +235,14 @@ extern int runqueue_is_locked(int cpu);
>>> extern void nohz_balance_enter_idle(int cpu);
>>> extern void set_cpu_sd_state_idle(void);
>>> extern int get_nohz_timer_target(void);
>>> +extern int sched_needs_cpu(int cpu);
>>> #else
>>> static inline void nohz_balance_enter_idle(int cpu) { }
>>> static inline void set_cpu_sd_state_idle(void) { }
>>> +static inline int sched_needs_cpu(int cpu)
>>> +{
>>> + return 0;
>>> +}
>>> #endif
>>>
>>> /*
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -650,6 +650,11 @@ static inline bool got_nohz_idle_kick(vo
>>> return false;
>>> }
>>>
>>> +int sched_needs_cpu(int cpu)
>>> +{
>>> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
>>> +}
>>> +
>>> #else /* CONFIG_NO_HZ_COMMON */
>>>
>>> static inline bool got_nohz_idle_kick(void)
>>> --- a/kernel/time/tick-sched.c
>>> +++ b/kernel/time/tick-sched.c
>>> @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
>>> time_delta = timekeeping_max_deferment();
>>> } while (read_seqretry(&jiffies_lock, seq));
>>>
>>> - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>>> + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>>> arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
>>> next_jiffies = last_jiffies + 1;
>>> delta_jiffies = 1;
>>>
>>>
>>
>>
>

2013-08-08 09:06:09

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

?? 2013-8-8??????5:04??"ethan.zhao" <[email protected]> д????

> Kernel 3.11-rc1 +Peterz' patch+ Mike's patch, No C-states in BIOS, got the same result as only Peter's patch.

Typo 3.11-rc3
> Of course , No C states to enter, make sense.
> [root@localhost ~]# time ./pip1m
>
> real 0m4.381s
> user 0m0.099s
> sys 0m2.784s
> [root@localhost ~]# time ./pip1m
>
> real 0m4.436s
> user 0m0.093s
> sys 0m2.809s
> [root@localhost ~]#
>
> Retest with C-states enabled in BIOS
> [root@localhost ~]# time ./pip1m
>
> real 0m8.670s
> user 0m0.203s
> sys 0m5.459s
> [root@localhost ~]# time ./pip1m
>
> real 0m8.489s
> user 0m0.184s
> sys 0m5.360s
> [root@localhost ~]#
>
> So the result is Peter's patch working or Mike's ? Compared with default 3.11-rc3
> result of test case 1 as following, looks great.
> [root@localhost ~]# time ./pip1m
>
> real 0m10.683s
> user 0m0.204s
> sys 0m6.597s
> [root@localhost ~]# time ./pip1m
>
> real 0m10.629s
> user 0m0.185s
> sys 0m6.546s
>
> So revert Mike's patch and retest, got
> [root@localhost ~]# time ./pip1m
>
> real 0m8.606s
> user 0m0.193s
> sys 0m5.449s
> [root@localhost ~]# time ./pip1m
>
> real 0m8.655s
> user 0m0.198s
> sys 0m5.519s
> [root@localhost ~]#
>
> So, it's Peter's patch working??????
>
> The result of kernel 3.11-rc3 + Peter's patch + no rescheduling IPI and no C-states in BIOS is :
> [root@localhost ~]# time ./pip1m
>
> real 0m3.915s
> user 0m0.088s
> sys 0m2.487s
> [root@localhost ~]# time ./pip1m
>
> real 0m3.929s
> user 0m0.082s
> sys 0m2.560s
> [root@localhost ~]# time ./pip1m
>
> Got about 0.5 sec better than only Peter's patch, but it is strange, only no rescheduling IPI almost got the
> same result.
>
>
> Thanks,
> Ethan
>
>
> ?? 2013-8-8??????12:31??ethan.zhao <[email protected]> д????
>>>>
>>>> sched: ratelimit nohz
>>>>
>>>> Entering nohz code on every micro-idle is too expensive to bear.
>>>>
>>>> Signed-off-by: Mike Galbraith <[email protected]>
>>>>
>>>> ---
>>>> include/linux/sched.h | 5 +++++
>>>> kernel/sched/core.c | 5 +++++
>>>> kernel/time/tick-sched.c | 2 +-
>>>> 3 files changed, 11 insertions(+), 1 deletion(-)
>>>>
>>>> --- a/include/linux/sched.h
>>>> +++ b/include/linux/sched.h
>>>> @@ -235,9 +235,14 @@ extern int runqueue_is_locked(int cpu);
>>>> extern void nohz_balance_enter_idle(int cpu);
>>>> extern void set_cpu_sd_state_idle(void);
>>>> extern int get_nohz_timer_target(void);
>>>> +extern int sched_needs_cpu(int cpu);
>>>> #else
>>>> static inline void nohz_balance_enter_idle(int cpu) { }
>>>> static inline void set_cpu_sd_state_idle(void) { }
>>>> +static inline int sched_needs_cpu(int cpu)
>>>> +{
>>>> + return 0;
>>>> +}
>>>> #endif
>>>>
>>>> /*
>>>> --- a/kernel/sched/core.c
>>>> +++ b/kernel/sched/core.c
>>>> @@ -650,6 +650,11 @@ static inline bool got_nohz_idle_kick(vo
>>>> return false;
>>>> }
>>>>
>>>> +int sched_needs_cpu(int cpu)
>>>> +{
>>>> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
>>>> +}
>>>> +
>>>> #else /* CONFIG_NO_HZ_COMMON */
>>>>
>>>> static inline bool got_nohz_idle_kick(void)
>>>> --- a/kernel/time/tick-sched.c
>>>> +++ b/kernel/time/tick-sched.c
>>>> @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
>>>> time_delta = timekeeping_max_deferment();
>>>> } while (read_seqretry(&jiffies_lock, seq));
>>>>
>>>> - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>>>> + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
>>>> arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
>>>> next_jiffies = last_jiffies + 1;
>>>> delta_jiffies = 1;
>>>>
>>>>
>>>
>>>
>>
>

2013-08-08 12:15:19

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Thu, 2013-08-08 at 17:04 +0800, ethan.zhao wrote:

> So, it's Peter's patch working………

Yes. My patch killed the nohz tick start/stop overhead prior to 3.8,
but in 3.8, the menu governor changes added more high frequency timer
banging that negated its effect. In 3.9, mwait went away, making core2
boxen very sad, and in 3.10 all of x86 got generic idle, making my boxen
even sadder.

select_idle_sibling() (the thing that makes us schedule pipe-test and
many other things cross core if there's an idle core available) has
always been two-faced, doing both good and evil.. but with recent work,
it's leaning more toward evil than ever before.

-Mike

2013-08-08 15:02:22

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

?? 2013-8-6??????3:29??Mike Galbraith <[email protected]> д????

> +int sched_needs_cpu(int cpu)
> +{
> + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
> +}
> +
> #else /* CONFIG_NO_HZ_COMMON */
>
> static inline bool got_nohz_idle_kick(void)
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
> time_delta = timekeeping_max_deferment();
> } while (read_seqretry(&jiffies_lock, seq));
>
> - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
> next_jiffies = last_jiffies + 1;
> delta_jiffies = 1;

If the performace regression was caused by too much expensive clock device reprogramming and too frequent entering /exiting of C-states?? this patch should work.
except the following result is almost always false under 3.11-rc3 code.

> return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;

Ethan

2013-08-09 06:52:16

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Thu, 2013-08-08 at 23:02 +0800, ethan.zhao wrote:
> 在 2013-8-6，下午3:29，Mike Galbraith <[email protected]> 写道：
>
> > +int sched_needs_cpu(int cpu)
> > +{
> > + return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;
> > +}
> > +
> > #else /* CONFIG_NO_HZ_COMMON */
> >
> > static inline bool got_nohz_idle_kick(void)
> > --- a/kernel/time/tick-sched.c
> > +++ b/kernel/time/tick-sched.c
> > @@ -548,7 +548,7 @@ static ktime_t tick_nohz_stop_sched_tick
> > time_delta = timekeeping_max_deferment();
> > } while (read_seqretry(&jiffies_lock, seq));
> >
> > - if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> > + if (sched_needs_cpu(cpu) || rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
> > arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
> > next_jiffies = last_jiffies + 1;
> > delta_jiffies = 1;
>
> If the performace regression was caused by too much expensive clock device reprogramming and too frequent entering /exiting of C-states… this patch should work.
> except the following result is almost always false under 3.11-rc3 code.
>
> > return cpu_rq(cpu)->avg_idle < sysctl_sched_migration_cost;

On my E5620 box, avg_idle works fine. Patchlet doesn't save as much as
it used to, thanks to Peter's patch now killing the worst of the pain,
but it does still does save cycles. I have too much regression left to
say exactly what it can now save max, doesn't matter much either.

The pertinent numbers:

v3.11-rc4-27-ge4ef108 496.7 KHz .858 1.030 throttle+peterz
v3.11-rc4-27-ge4ef108 440.7 KHz .761 1.296 nothrottle+peterz

-Mike

2013-09-05 06:36:45

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Thu, 2013-08-08 at 15:32 +0800, ethan.zhao wrote:

...

> That is great improvement, So it is worth to merge.

It didn't swim upstream for some reason.

-Mike

2013-09-09 13:30:54

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Lets go back to the list with this..

On Mon, Sep 09, 2013 at 02:42:18PM +0200, Mike Galbraith wrote:
> On Mon, 2013-09-09 at 14:23 +0200, Peter Zijlstra wrote:
> > On Fri, Sep 06, 2013 at 07:39:02AM +0200, Mike Galbraith wrote:
> > > The patch takes a large bite out of regressions. What's left for a
> > > Westmere box is the introduction of reschedule_interrupt overhead
> > > introduced by 7d1a9417 x86: Use generic idle loop.
> >
> > How exactly does that commit cause extra IPIs? Did the entire TS_POLLING
> > stuff break or so?
>
> Seems so.
>
> > > Core2 eats that,
> > > plus Intel making mwait_idle() go away with no way for them to get to
> > > the remaining mwait_idle_with_hints().
> >
> > but but but drivers/idle/intel_idle.c still uses mwait.. what's the
> > exact complaint?
>
> reschedule_interrupt overhead for cross core pipe-test appeared in
> westmere box at 7d1a9417.

So that patch does indeed loose the TS_POLLING stuff for all of x86. I'm
not entirely sure where we want to add it back, but the best place to me
seems the idle loop implementations themselves.

Below a patch that does intel_idle.c which is what your WSM would be
using I suppose. We'll probably want to iterate all idle implementations
and do what needs doing.

---
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index fa6964d..486c0ba 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -359,6 +359,14 @@ static int intel_idle(struct cpuidle_device *dev,
if (!(lapic_timer_reliable_states & (1 << (cstate))))
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

+ current_thread_into()->status |= TS_POLLING;
+
+ /*
+ * Order against setting of TS_POLLING against the reading of
+ * NEED_RESCHED, matched by resched_task().
+ */
+ smp_mb();
+
if (!need_resched()) {

__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -367,6 +375,8 @@ static int intel_idle(struct cpuidle_device *dev,
__mwait(eax, ecx);
}

+ current_thread_into()->status &= ~TS_POLLING;
+
if (!(lapic_timer_reliable_states & (1 << (cstate))))
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

2013-09-09 13:46:48

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Mon, Sep 09, 2013 at 03:30:44PM +0200, Peter Zijlstra wrote:
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index fa6964d..486c0ba 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -359,6 +359,14 @@ static int intel_idle(struct cpuidle_device *dev,
> if (!(lapic_timer_reliable_states & (1 << (cstate))))
> clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
>
> + current_thread_into()->status |= TS_POLLING;
> +
> + /*
> + * Order against setting of TS_POLLING against the reading of
> + * NEED_RESCHED, matched by resched_task().
> + */
> + smp_mb();
> +
> if (!need_resched()) {
>
> __monitor((void *)&current_thread_info()->flags, 0, 0);
> @@ -367,6 +375,8 @@ static int intel_idle(struct cpuidle_device *dev,
> __mwait(eax, ecx);
> }
>
> + current_thread_into()->status &= ~TS_POLLING;
> +
> if (!(lapic_timer_reliable_states & (1 << (cstate))))
> clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

Hmm, arguably it would be better to set this from
intel_idle_cpuidle_driver_init() and clear it whenever this goes away.
Not sure how all that works, the cpuidle driver framework seems 'weird'.

2013-09-11 08:57:00

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Mon, Sep 09, 2013 at 03:46:35PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 09, 2013 at 03:30:44PM +0200, Peter Zijlstra wrote:
> > diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> > index fa6964d..486c0ba 100644
> > --- a/drivers/idle/intel_idle.c
> > +++ b/drivers/idle/intel_idle.c
> > @@ -359,6 +359,14 @@ static int intel_idle(struct cpuidle_device *dev,
> > if (!(lapic_timer_reliable_states & (1 << (cstate))))
> > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
> >
> > + current_thread_into()->status |= TS_POLLING;
> > +
> > + /*
> > + * Order against setting of TS_POLLING against the reading of
> > + * NEED_RESCHED, matched by resched_task().
> > + */
> > + smp_mb();
> > +
> > if (!need_resched()) {
> >
> > __monitor((void *)&current_thread_info()->flags, 0, 0);
> > @@ -367,6 +375,8 @@ static int intel_idle(struct cpuidle_device *dev,
> > __mwait(eax, ecx);
> > }
> >
> > + current_thread_into()->status &= ~TS_POLLING;
> > +
> > if (!(lapic_timer_reliable_states & (1 << (cstate))))
> > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
>
> Hmm, arguably it would be better to set this from
> intel_idle_cpuidle_driver_init() and clear it whenever this goes away.
> Not sure how all that works, the cpuidle driver framework seems 'weird'.

OK, so I went over the idle stuff again, and we do set TS_POLLING like
stuff, it got hidden in current_{clr,set}_polling().

Still if that patch causes extra IPIs its bound to be broken in some
creative way.. I'll prod.

2013-09-11 10:26:05

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Wed, 2013-09-11 at 10:56 +0200, Peter Zijlstra wrote:
> On Mon, Sep 09, 2013 at 03:46:35PM +0200, Peter Zijlstra wrote:
> > On Mon, Sep 09, 2013 at 03:30:44PM +0200, Peter Zijlstra wrote:
> > > diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> > > index fa6964d..486c0ba 100644
> > > --- a/drivers/idle/intel_idle.c
> > > +++ b/drivers/idle/intel_idle.c
> > > @@ -359,6 +359,14 @@ static int intel_idle(struct cpuidle_device *dev,
> > > if (!(lapic_timer_reliable_states & (1 << (cstate))))
> > > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
> > >
> > > + current_thread_into()->status |= TS_POLLING;
> > > +
> > > + /*
> > > + * Order against setting of TS_POLLING against the reading of
> > > + * NEED_RESCHED, matched by resched_task().
> > > + */
> > > + smp_mb();
> > > +
> > > if (!need_resched()) {
> > >
> > > __monitor((void *)&current_thread_info()->flags, 0, 0);
> > > @@ -367,6 +375,8 @@ static int intel_idle(struct cpuidle_device *dev,
> > > __mwait(eax, ecx);
> > > }
> > >
> > > + current_thread_into()->status &= ~TS_POLLING;
> > > +
> > > if (!(lapic_timer_reliable_states & (1 << (cstate))))
> > > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
> >
> > Hmm, arguably it would be better to set this from
> > intel_idle_cpuidle_driver_init() and clear it whenever this goes away.
> > Not sure how all that works, the cpuidle driver framework seems 'weird'.
>
> OK, so I went over the idle stuff again, and we do set TS_POLLING like
> stuff, it got hidden in current_{clr,set}_polling().
>
> Still if that patch causes extra IPIs its bound to be broken in some
> creative way.. I'll prod.

(thanks, when I get a break from testing/poking this'n'that, I'll take a
peek too... well, good_intentions++ anyway;)

2013-10-04 12:06:58

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

Mike, Peter,
Seems lots of work has been done these days, studious guys. those
patches merged in last stable/dev branch (fix performance regression
caused by extra rtimer programming and rescheduling IPI,confusing
idle... etc) ? So I could just do a lazy pull for test with my
environment. I need catch up with other mail loops with my vacation
again.

Thanks,
Ethan

On Wed, Sep 11, 2013 at 6:25 PM, Mike Galbraith <[email protected]> wrote:
> On Wed, 2013-09-11 at 10:56 +0200, Peter Zijlstra wrote:
>> On Mon, Sep 09, 2013 at 03:46:35PM +0200, Peter Zijlstra wrote:
>> > On Mon, Sep 09, 2013 at 03:30:44PM +0200, Peter Zijlstra wrote:
>> > > diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
>> > > index fa6964d..486c0ba 100644
>> > > --- a/drivers/idle/intel_idle.c
>> > > +++ b/drivers/idle/intel_idle.c
>> > > @@ -359,6 +359,14 @@ static int intel_idle(struct cpuidle_device *dev,
>> > > if (!(lapic_timer_reliable_states & (1 << (cstate))))
>> > > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
>> > >
>> > > + current_thread_into()->status |= TS_POLLING;
>> > > +
>> > > + /*
>> > > + * Order against setting of TS_POLLING against the reading of
>> > > + * NEED_RESCHED, matched by resched_task().
>> > > + */
>> > > + smp_mb();
>> > > +
>> > > if (!need_resched()) {
>> > >
>> > > __monitor((void *)&current_thread_info()->flags, 0, 0);
>> > > @@ -367,6 +375,8 @@ static int intel_idle(struct cpuidle_device *dev,
>> > > __mwait(eax, ecx);
>> > > }
>> > >
>> > > + current_thread_into()->status &= ~TS_POLLING;
>> > > +
>> > > if (!(lapic_timer_reliable_states & (1 << (cstate))))
>> > > clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
>> >
>> > Hmm, arguably it would be better to set this from
>> > intel_idle_cpuidle_driver_init() and clear it whenever this goes away.
>> > Not sure how all that works, the cpuidle driver framework seems 'weird'.
>>
>> OK, so I went over the idle stuff again, and we do set TS_POLLING like
>> stuff, it got hidden in current_{clr,set}_polling().
>>
>> Still if that patch causes extra IPIs its bound to be broken in some
>> creative way.. I'll prod.
>
> (thanks, when I get a break from testing/poking this'n'that, I'll take a
> peek too... well, good_intentions++ anyway;)
>

2013-10-07 04:42:41

[permalink] [raw]

Subject: Re: [PATCH V3]hrtimer: Fix a performance regression by disable reprogramming in remove_hrtimer

On Fri, 2013-10-04 at 20:06 +0800, Ethan Zhao wrote:
> Mike, Peter,
> Seems lots of work has been done these days, studious guys. those
> patches merged in last stable/dev branch (fix performance regression
> caused by extra rtimer programming and rescheduling IPI,confusing
> idle... etc) ? So I could just do a lazy pull for test with my
> environment. I need catch up with other mail loops with my vacation
> again.

Massive timer overhead seems to have crawled off and died while I wasn't
looking. Peter's fix for IPI woes..

tip commit ea811747 sched, idle: Fix the idle polling state logic

..hasn't yet swum upstream.

-Mike

2013-10-07 04:58:06