smp_call_function_single and ttwu_queue_remote sends unconditional IPI
to target CPU. However, if the target CPU is in mwait based idle, we can
do IPI-less wakeups using the magical powers of monitor-mwait.
Doing this has certain advantages:
* Lower overhead on Async IPI send path. Measurements on Westmere based
systems show savings on "no wait" smp_call_function_single with idle
target CPU (as measured on the sender side).
local socket smp_call_func cost goes from ~1600 to ~1200 cycles
remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
* Avoiding actual interrupts shows a measurable reduction (10%) in system
non-idle cycles and cache-references with micro-benchmark sending IPI from
one CPU to all the other mostly idle CPUs in the system.
* On a mostly idle system, turbostat shows a tiny decrease in C0(active) time
and a corresponding increase in C6 state (Each row being 10min avg)
%c0 %c1 %c6
Before
Run 1 1.51 2.93 95.55
Run 2 1.48 2.86 95.65
Run 3 1.46 2.78 95.74
After
Run 1 1.35 2.63 96.00
Run 2 1.46 2.78 95.74
Run 3 1.37 2.63 95.98
* As a bonus, we can avoid sched/call IPI overhead altogether in a special case.
When CPU Y has woken up CPU X (which can take 50-100us to actually wakeup
from a deep idle state) and CPU Z wants to send IPI to CPU X in this period.
It can get it for free.
We started looking at this with one of our workloads where system is partially
busy and we noticed some kernel hotspots in find_next_bit and
default_send_IPI_mask_sequence_phys coming from sched wakeup (futex wakeups)
and networking call functions. So, this change addresses those two specific
IPI types. This could be extended to nohz_kick, etc.
Note:
* This only helps when target CPU is idle. When it is busy we will still send
IPI as before.
* Only for X86_64 and mwait_idle_with_hints for now, with limited testing.
* Will need some accounting for these wakeups exported for powertop and friends.
Comments?
Signed-off-by: Venkatesh Pallipadi <[email protected]>
---
arch/x86/Kconfig | 5 ++++
arch/x86/include/asm/thread_info.h | 1 +
arch/x86/kernel/acpi/cstate.c | 22 ++++++++++++++++
arch/x86/kernel/process_64.c | 12 +++++++++
include/linux/sched.h | 47 ++++++++++++++++++++++++++++++++++++
include/linux/thread_info.h | 2 +
kernel/sched/core.c | 40 +++++++++++++++++++++++++++++-
kernel/smp.c | 4 +++
8 files changed, 132 insertions(+), 1 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e..6a56cb0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -170,6 +170,11 @@ config GENERIC_TIME_VSYSCALL
bool
default X86_64
+config AVOID_IPI_ON_IDLE
+ bool
+ depends on X86_64
+ default X86_64
+
config ARCH_HAS_CPU_RELAX
def_bool y
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index bc817cd..24e4a56 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
+#define TIF_IPI_PENDING 29 /* pending IPI work on idle exit */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index f50e7fb..340c4e9 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -161,6 +161,14 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
+#ifdef CONFIG_AVOID_IPI_ON_IDLE
+ int oldval, cpu = smp_processor_id();
+
+ if (!system_using_cpu_idle_sync)
+ system_using_cpu_idle_sync = 1;
+
+ atomic_set(&per_cpu(cpu_idle_sync, cpu), CPU_STATE_IDLE);
+#endif
if (!need_resched()) {
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)¤t_thread_info()->flags);
@@ -170,6 +178,20 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
if (!need_resched())
__mwait(ax, cx);
}
+#ifdef CONFIG_AVOID_IPI_ON_IDLE
+ oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
+ CPU_STATE_IDLE, CPU_STATE_RUNNING);
+
+ if (oldval != CPU_STATE_IDLE) {
+ oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
+ CPU_STATE_WOKENUP, CPU_STATE_RUNNING);
+ while (oldval != CPU_STATE_WOKENUP) {
+ rep_nop();
+ oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
+ CPU_STATE_WOKENUP, CPU_STATE_RUNNING);
+ }
+ }
+#endif
}
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9b9fe4a..4c6bb4a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -153,6 +153,18 @@ void cpu_idle(void)
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
__exit_idle();
+
+#ifdef CONFIG_AVOID_IPI_ON_IDLE
+ if (ipi_pending()) {
+ clear_ipi_pending();
+ local_bh_disable();
+ local_irq_disable();
+ generic_smp_call_function_single_interrupt();
+ scheduler_wakeup_self_check();
+ local_irq_enable();
+ local_bh_enable();
+ }
+#endif
}
tick_nohz_idle_exit();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2234985..a612e79 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,6 +249,43 @@ extern char ___assert_task_state[1 - 2*!!(
#include <linux/spinlock.h>
/*
+ * cpu_idle_sync is used to check CPU idle state from remote CPUs. State
+ * transitions go as:
+ * CPU X idle entry
+ * - Prev state CPU_STATE_RUNNING, Next state CPU_STATE_IDLE.
+ * CPU X idle exit
+ * - If Prev state CPU_STATE_IDLE, Next state CPU_STATE_RUNNING
+ * - If Prev state is CPU_STATE_WAKING, wait until state is CPU_STATE_WOKENUP,
+ * change the state to CPU_STATE_RUNNING and check for pending IPI activity.
+ * - If Prev state is CPU_STATE_WOKENUP, change the state to CPU_STATE_RUNNING
+ * and check for pending IPI activity.
+ *
+ * CPU Y wants to send IPI to CPU X
+ * - If Prev state is CPU_STATE_RUNNING, send ipi.
+ * - If Prev state is CPU_STATE_IDLE, atomic change to CPU_STATE_WAKING,
+ * set the flag for pending IPI activity on corresponding idle task and then
+ * change the state to CPU_STATE_WOKENUP. Continue without IPI.
+ * - If Prev state is CPU_STATE_WAKING or CPU_STATE_WOKENUP, then some CPU Z
+ * is doing the wakeup of X and Y gets it for free. Continue without IPI.
+ */
+enum cpu_idle_state {
+ CPU_STATE_RUNNING,
+ CPU_STATE_IDLE,
+ CPU_STATE_WAKING,
+ CPU_STATE_WOKENUP
+};
+DECLARE_PER_CPU(atomic_t, cpu_idle_sync);
+extern int system_using_cpu_idle_sync;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_AVOID_IPI_ON_IDLE)
+extern int smp_need_ipi(int cpu);
+extern void scheduler_wakeup_self_check(void);
+#else
+static inline int smp_need_ipi(int cpu) { return 1; }
+static inline void scheduler_wakeup_self_check(void) { }
+#endif
+
+/*
* This serializes "schedule()" and also protects
* the run-queue from deletions/modifications (but
* _adding_ to the beginning of the run-queue has
@@ -2574,6 +2611,16 @@ static inline int need_resched(void)
return unlikely(test_thread_flag(TIF_NEED_RESCHED));
}
+static inline void set_tsk_ipi_pending(struct task_struct *tsk)
+{
+ set_tsk_thread_flag(tsk, TIF_IPI_PENDING);
+}
+
+static inline int ipi_pending(void)
+{
+ return unlikely(test_thread_flag(TIF_IPI_PENDING));
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 8d03f07..9a458e2 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -98,6 +98,8 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
#define set_need_resched() set_thread_flag(TIF_NEED_RESCHED)
#define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED)
+#define clear_ipi_pending() clear_thread_flag(TIF_IPI_PENDING)
+
#if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
/*
* An arch can define its own version of set_restore_sigmask() to get the
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index df00cb0..14c9fc8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -492,6 +492,9 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
+DEFINE_PER_CPU(atomic_t, cpu_idle_sync);
+__read_mostly int system_using_cpu_idle_sync;
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -505,6 +508,29 @@ static inline void init_hrtick(void)
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
+#ifdef CONFIG_AVOID_IPI_ON_IDLE
+int smp_need_ipi(int cpu)
+{
+ int oldval;
+
+ if (!system_using_cpu_idle_sync || cpu == smp_processor_id())
+ return 1;
+
+ oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
+ CPU_STATE_IDLE, CPU_STATE_WAKING);
+
+ if (oldval == CPU_STATE_RUNNING)
+ return 1;
+
+ if (oldval == CPU_STATE_IDLE) {
+ set_tsk_ipi_pending(idle_task(cpu));
+ atomic_set(&per_cpu(cpu_idle_sync, cpu), CPU_STATE_WOKENUP);
+ }
+
+ return 0;
+}
+#endif
+
void resched_task(struct task_struct *p)
{
int cpu;
@@ -1456,6 +1482,16 @@ static void sched_ttwu_pending(void)
raw_spin_unlock(&rq->lock);
}
+#ifdef CONFIG_AVOID_IPI_ON_IDLE
+void scheduler_wakeup_self_check(void)
+{
+ if (llist_empty(&this_rq()->wake_list))
+ return;
+
+ sched_ttwu_pending();
+}
+#endif
+
void scheduler_ipi(void)
{
if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
@@ -1490,7 +1526,8 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ if (smp_need_ipi(cpu))
+ smp_send_reschedule(cpu);
}
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -6943,6 +6980,7 @@ void __init sched_init(void)
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
+ atomic_set(&per_cpu(cpu_idle_sync, i), CPU_STATE_RUNNING);
}
set_load_weight(&init_task);
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d6..48a9c4a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
+#include <linux/sched.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/smp.h>
@@ -145,6 +146,9 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
list_add_tail(&data->list, &dst->list);
raw_spin_unlock_irqrestore(&dst->lock, flags);
+ if (ipi)
+ ipi = smp_need_ipi(cpu);
+
/*
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
--
1.7.7.3
On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
> to target CPU. However, if the target CPU is in mwait based idle, we can
> do IPI-less wakeups using the magical powers of monitor-mwait.
> Doing this has certain advantages:
> * Lower overhead on Async IPI send path. Measurements on Westmere based
> systems show savings on "no wait" smp_call_function_single with idle
> target CPU (as measured on the sender side).
> local socket smp_call_func cost goes from ~1600 to ~1200 cycles
> remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
> * Avoiding actual interrupts shows a measurable reduction (10%) in system
> non-idle cycles and cache-references with micro-benchmark sending IPI from
> one CPU to all the other mostly idle CPUs in the system.
> * On a mostly idle system, turbostat shows a tiny decrease in C0(active) time
> and a corresponding increase in C6 state (Each row being 10min avg)
> %c0 %c1 %c6
> Before
> Run 1 1.51 2.93 95.55
> Run 2 1.48 2.86 95.65
> Run 3 1.46 2.78 95.74
> After
> Run 1 1.35 2.63 96.00
> Run 2 1.46 2.78 95.74
> Run 3 1.37 2.63 95.98
>
> * As a bonus, we can avoid sched/call IPI overhead altogether in a special case.
> When CPU Y has woken up CPU X (which can take 50-100us to actually wakeup
> from a deep idle state) and CPU Z wants to send IPI to CPU X in this period.
> It can get it for free.
>
> We started looking at this with one of our workloads where system is partially
> busy and we noticed some kernel hotspots in find_next_bit and
> default_send_IPI_mask_sequence_phys coming from sched wakeup (futex wakeups)
> and networking call functions. So, this change addresses those two specific
> IPI types. This could be extended to nohz_kick, etc.
>
> Note:
> * This only helps when target CPU is idle. When it is busy we will still send
> IPI as before.
> * Only for X86_64 and mwait_idle_with_hints for now, with limited testing.
> * Will need some accounting for these wakeups exported for powertop and friends.
>
> Comments?
Curiously you avoided the existing tsk_is_polling() magic, which IIRC is
doing something similar for waking from the idle loop.
On Mon, Feb 6, 2012 at 1:02 PM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
>> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
>> to target CPU. However, if the target CPU is in mwait based idle, we can
>> do IPI-less wakeups using the magical powers of monitor-mwait.
>> Doing this has certain advantages:
>> * Lower overhead on Async IPI send path. Measurements on Westmere based
>> ? systems show savings on "no wait" smp_call_function_single with idle
>> ? target CPU (as measured on the sender side).
>> ? local socket smp_call_func cost goes from ~1600 to ~1200 cycles
>> ? remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
>> * Avoiding actual interrupts shows a measurable reduction (10%) in system
>> ? non-idle cycles and cache-references with micro-benchmark sending IPI from
>> ? one CPU to all the other mostly idle CPUs in the system.
>> * On a mostly idle system, turbostat shows a tiny decrease in C0(active) time
>> ? and a corresponding increase in C6 state (Each row being 10min avg)
>> ? ? ? ? ? %c0 ? %c1 ? %c6
>> ? Before
>> ? Run 1 ?1.51 ?2.93 95.55
>> ? Run 2 ?1.48 ?2.86 95.65
>> ? Run 3 ?1.46 ?2.78 95.74
>> ? After
>> ? Run 1 ?1.35 ?2.63 96.00
>> ? Run 2 ?1.46 ?2.78 95.74
>> ? Run 3 ?1.37 ?2.63 95.98
>>
>> * As a bonus, we can avoid sched/call IPI overhead altogether in a special case.
>> ? When CPU Y has woken up CPU X (which can take 50-100us to actually wakeup
>> ? from a deep idle state) and CPU Z wants to send IPI to CPU X in this period.
>> ? It can get it for free.
>>
>> We started looking at this with one of our workloads where system is partially
>> busy and we noticed some kernel hotspots in find_next_bit and
>> default_send_IPI_mask_sequence_phys coming from sched wakeup (futex wakeups)
>> and networking call functions. So, this change addresses those two specific
>> IPI types. This could be extended to nohz_kick, etc.
>>
>> Note:
>> * This only helps when target CPU is idle. When it is busy we will still send
>> ? IPI as before.
>> * Only for X86_64 and mwait_idle_with_hints for now, with limited testing.
>> * Will need some accounting for these wakeups exported for powertop and friends.
>>
>> Comments?
>
> Curiously you avoided the existing tsk_is_polling() magic, which IIRC is
> doing something similar for waking from the idle loop.
>
Yes. That needs remote CPU's current task, which extends onto rq lock,
which I was trying to avoid. So, I went with conditional waiting on
idle exit for the small window of WAKING to WOKEN state change, as we
know we are always polling in the mwait loop.
On 02/06/2012 12:42 PM, Venkatesh Pallipadi wrote:
[...]
> index 5bed94e..6a56cb0 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -170,6 +170,11 @@ config GENERIC_TIME_VSYSCALL
> bool
> default X86_64
>
> +config AVOID_IPI_ON_IDLE
> + bool
> + depends on X86_64
> + default X86_64
> +
Can you get rid of this part...
> config ARCH_HAS_CPU_RELAX
> def_bool y
>
[...]
> diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
> index f50e7fb..340c4e9 100644
> --- a/arch/x86/kernel/acpi/cstate.c
> +++ b/arch/x86/kernel/acpi/cstate.c
> @@ -161,6 +161,14 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
> */
> void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
> {
> +#ifdef CONFIG_AVOID_IPI_ON_IDLE
... and change all of these to #ifdef CONFIG_64BIT
> + int oldval, cpu = smp_processor_id();
> +
.
.
.
Thanks,
David Daney
On 02/06/2012 04:26 PM, David Daney wrote:
> On 02/06/2012 12:42 PM, Venkatesh Pallipadi wrote:
> [...]
>> index 5bed94e..6a56cb0 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -170,6 +170,11 @@ config GENERIC_TIME_VSYSCALL
>> bool
>> default X86_64
>>
>> +config AVOID_IPI_ON_IDLE
>> + bool
>> + depends on X86_64
>> + default X86_64
>> +
>
> Can you get rid of this part...
>
>> config ARCH_HAS_CPU_RELAX
>> def_bool y
>>
> [...]
>> diff --git a/arch/x86/kernel/acpi/cstate.c
>> b/arch/x86/kernel/acpi/cstate.c
>> index f50e7fb..340c4e9 100644
>> --- a/arch/x86/kernel/acpi/cstate.c
>> +++ b/arch/x86/kernel/acpi/cstate.c
>> @@ -161,6 +161,14 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
>> */
>> void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
>> {
>> +#ifdef CONFIG_AVOID_IPI_ON_IDLE
>
>
> ... and change all of these to #ifdef CONFIG_64BIT
>
>> + int oldval, cpu = smp_processor_id();
>> +
Why should this be 64-bit specific??
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
On 02/06/2012 05:24 PM, H. Peter Anvin wrote:
> On 02/06/2012 04:26 PM, David Daney wrote:
>> On 02/06/2012 12:42 PM, Venkatesh Pallipadi wrote:
>> [...]
>>> index 5bed94e..6a56cb0 100644
>>> --- a/arch/x86/Kconfig
>>> +++ b/arch/x86/Kconfig
>>> @@ -170,6 +170,11 @@ config GENERIC_TIME_VSYSCALL
>>> bool
>>> default X86_64
>>>
>>> +config AVOID_IPI_ON_IDLE
>>> + bool
>>> + depends on X86_64
>>> + default X86_64
>>> +
>>
>> Can you get rid of this part...
>>
>>> config ARCH_HAS_CPU_RELAX
>>> def_bool y
>>>
>> [...]
>>> diff --git a/arch/x86/kernel/acpi/cstate.c
>>> b/arch/x86/kernel/acpi/cstate.c
>>> index f50e7fb..340c4e9 100644
>>> --- a/arch/x86/kernel/acpi/cstate.c
>>> +++ b/arch/x86/kernel/acpi/cstate.c
>>> @@ -161,6 +161,14 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
>>> */
>>> void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
>>> {
>>> +#ifdef CONFIG_AVOID_IPI_ON_IDLE
>>
>>
>> ... and change all of these to #ifdef CONFIG_64BIT
>>
>>> + int oldval, cpu = smp_processor_id();
>>> +
>
> Why should this be 64-bit specific??
>
Well I don't know that part. The gist of my comment was to get rid of
an unneeded new config variable.
If this works just as well for 32-bit, then by all means do it there too.
In any event, I think you want the kernel to automatically do it if the
CPU is capable, and not have a manual config setting you have to fiddle
with.
David Daney
On 02/06/2012 05:34 PM, David Daney wrote:
>
> Well I don't know that part. The gist of my comment was to get rid of an
> unneeded new config variable.
>
> If this works just as well for 32-bit, then by all means do it there too.
>
> In any event, I think you want the kernel to automatically do it if the
> CPU is capable, and not have a manual config setting you have to fiddle
> with.
>
If this is a transitional thing then it might justify it.
-hpa
On Mon, Feb 6, 2012 at 5:45 PM, H. Peter Anvin <[email protected]> wrote:
> On 02/06/2012 05:34 PM, David Daney wrote:
>>
>>
>> Well I don't know that part. The gist of my comment was to get rid of an
>> unneeded new config variable.
>>
>> If this works just as well for 32-bit, then by all means do it there too.
>>
>> In any event, I think you want the kernel to automatically do it if the
>> CPU is capable, and not have a manual config setting you have to fiddle
>> with.
>>
>
> If this is a transitional thing then it might justify it.
>
There is nothing specific to 64 bit here. Its just that I had only
tested this on 64 bit right now and wanted to get early feedback. So,
in the spirit of RFC, I sent this out as x86_64 only for the time
being. If and when this will be sent for inclusion, this will should
work on both 32 bit and 64 bit on x86. Though we will need a config
option to not add any run time overhead in generic code (and to not
have #ifdef CONFIG_X86 in kernel/).
Thanks,
Venki
On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
> * Lower overhead on Async IPI send path. Measurements on Westmere based
> systems show savings on "no wait" smp_call_function_single with idle
> target CPU (as measured on the sender side).
> local socket smp_call_func cost goes from ~1600 to ~1200 cycles
> remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
Interesting that savings in the remote socket is less compared to the
local socket.
> +int smp_need_ipi(int cpu)
> +{
> + int oldval;
> +
> + if (!system_using_cpu_idle_sync || cpu == smp_processor_id())
> + return 1;
> +
> + oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
> + CPU_STATE_IDLE, CPU_STATE_WAKING);
To avoid too many cache line bounces for the case when the cpu is in the
running state, we should do a read to check if the state is in idle
before going ahead with the locked operation?
> +
> + if (oldval == CPU_STATE_RUNNING)
> + return 1;
> +
> + if (oldval == CPU_STATE_IDLE) {
> + set_tsk_ipi_pending(idle_task(cpu));
> + atomic_set(&per_cpu(cpu_idle_sync, cpu), CPU_STATE_WOKENUP);
> + }
> +
> + return 0;
We should probably disable interrupts around this, otherwise any delay
in transitioning to wokenup from waking will cause the idle cpu to be
stuck for similar amount of time.
thanks,
suresh
On Mon, Feb 6, 2012 at 6:24 PM, Suresh Siddha <[email protected]> wrote:
> On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
>> * Lower overhead on Async IPI send path. Measurements on Westmere based
>> ? systems show savings on "no wait" smp_call_function_single with idle
>> ? target CPU (as measured on the sender side).
>> ? local socket smp_call_func cost goes from ~1600 to ~1200 cycles
>> ? remote socket smp_call_func cost goes from ~2000 to ~1800 cycles
>
> Interesting that savings in the remote socket is less compared to the
> local socket.
Yes. I was not sure whether it has something to do with mwait/IPI
wakeups work in hardware or it has something to do with C-state the
target CPU is woken out of.
>
>> +int smp_need_ipi(int cpu)
>> +{
>> + ? ? int oldval;
>> +
>> + ? ? if (!system_using_cpu_idle_sync || cpu == smp_processor_id())
>> + ? ? ? ? ? ? return 1;
>> +
>> + ? ? oldval = atomic_cmpxchg(&per_cpu(cpu_idle_sync, cpu),
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? CPU_STATE_IDLE, CPU_STATE_WAKING);
>
> To avoid too many cache line bounces for the case when the cpu is in the
> running state, we should do a read to check if the state is in idle
> before going ahead with the locked operation?
>
Agree. Will add it on patch refresh.
>> +
>> + ? ? if (oldval == CPU_STATE_RUNNING)
>> + ? ? ? ? ? ? return 1;
>> +
>> + ? ? if (oldval == CPU_STATE_IDLE) {
>> + ? ? ? ? ? ? set_tsk_ipi_pending(idle_task(cpu));
>> + ? ? ? ? ? ? atomic_set(&per_cpu(cpu_idle_sync, cpu), CPU_STATE_WOKENUP);
>> + ? ? }
>> +
>> + ? ? return 0;
>
> We should probably disable interrupts around this, otherwise any delay
> in transitioning to wokenup from waking will cause the idle cpu to be
> stuck for similar amount of time.
>
Makes sense. Will add that too.
Thanks,
Venki
> thanks,
> suresh
>
On Mon, Feb 06, 2012 at 12:42:13PM -0800, Venkatesh Pallipadi wrote:
> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
> to target CPU. However, if the target CPU is in mwait based idle, we can
> do IPI-less wakeups using the magical powers of monitor-mwait.
> Doing this has certain advantages:
Actually I'm trying to do the similar thing on MIPS.
The difference is that I want task_is_polling() to do something. The basic
idea is:
> + if (ipi_pending()) {
> + clear_ipi_pending();
> + local_bh_disable();
> + local_irq_disable();
> + generic_smp_call_function_single_interrupt();
> + scheduler_wakeup_self_check();
> + local_irq_enable();
> + local_bh_enable();
I let cpu_idle() check if there is anything to do as your above code.
And task_is_polling() handle the others with below patch:
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5255c9d..09f633d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -527,15 +527,16 @@ void resched_task(struct task_struct *p)
smp_send_reschedule(cpu);
}
-void resched_cpu(int cpu)
+int resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ return 0;
resched_task(cpu_curr(cpu));
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ return 1;
}
#ifdef CONFIG_NO_HZ
@@ -1484,7 +1485,8 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list) &&
+ !resched_cpu(cpu))
smp_send_reschedule(cpu);
}
Thought?
Thanks,
Yong
On Tue, Feb 7, 2012 at 10:51 PM, Yong Zhang <[email protected]> wrote:
> On Mon, Feb 06, 2012 at 12:42:13PM -0800, Venkatesh Pallipadi wrote:
>> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
>> to target CPU. However, if the target CPU is in mwait based idle, we can
>> do IPI-less wakeups using the magical powers of monitor-mwait.
>> Doing this has certain advantages:
>
> Actually I'm trying to do the similar thing on MIPS.
>
> The difference is that I want task_is_polling() to do something. The basic
> idea is:
>
>> + ? ? ? ? ? ? ? ? ? ? if (ipi_pending()) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? clear_ipi_pending();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_disable();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_disable();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? generic_smp_call_function_single_interrupt();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? scheduler_wakeup_self_check();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_enable();
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_enable();
>
> I let cpu_idle() check if there is anything to do as your above code.
>
> And task_is_polling() handle the others with below patch:
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5255c9d..09f633d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -527,15 +527,16 @@ void resched_task(struct task_struct *p)
> ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
> ?}
>
> -void resched_cpu(int cpu)
> +int resched_cpu(int cpu)
> ?{
> ? ? ? ?struct rq *rq = cpu_rq(cpu);
> ? ? ? ?unsigned long flags;
>
> ? ? ? ?if (!raw_spin_trylock_irqsave(&rq->lock, flags))
> - ? ? ? ? ? ? ? return;
> + ? ? ? ? ? ? ? return 0;
> ? ? ? ?resched_task(cpu_curr(cpu));
> ? ? ? ?raw_spin_unlock_irqrestore(&rq->lock, flags);
> + ? ? ? return 1;
> ?}
Two points -
rq->lock: I tried something similar first. One hurdle with checking
task_is_polling() is that you need rq->lock to check it. And adding
lock+unlock (without wait) in wakeup path ended up being no net gain
compared to IPI. And when we actually end up spinning on that lock,
thats going to add overhead in the common path. That is the reason I
switched to atomic compare exchange and moving any wait onto the
target CPU coming out of idle.
resched_task: ttwu_queue_remote() does not imply that the remote CPU
will do a resched. Today there is a IPI and IPI handler calls onto
check_preempt_wakeup() and if the current task has higher precedence
than the waking up task, then there will be just an activation of new
task and no resched. Using resched_task above breaks
check_preempt_wakeup() and always calls a resched on remote CPU after
the IPI, which would be change in behavior.
Thanks,
Venki
>
> ?#ifdef CONFIG_NO_HZ
> @@ -1484,7 +1485,8 @@ void scheduler_ipi(void)
>
> ?static void ttwu_queue_remote(struct task_struct *p, int cpu)
> ?{
> - ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
> + ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list) &&
> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? !resched_cpu(cpu))
> ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
> ?}
>
> Thought?
>
> Thanks,
> Yong
On Wed, Feb 08, 2012 at 03:28:45PM -0800, Venki Pallipadi wrote:
> On Tue, Feb 7, 2012 at 10:51 PM, Yong Zhang <[email protected]> wrote:
> > On Mon, Feb 06, 2012 at 12:42:13PM -0800, Venkatesh Pallipadi wrote:
> >> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
> >> to target CPU. However, if the target CPU is in mwait based idle, we can
> >> do IPI-less wakeups using the magical powers of monitor-mwait.
> >> Doing this has certain advantages:
> >
> > Actually I'm trying to do the similar thing on MIPS.
> >
> > The difference is that I want task_is_polling() to do something. The basic
> > idea is:
> >
> >> + ? ? ? ? ? ? ? ? ? ? if (ipi_pending()) {
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? clear_ipi_pending();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_disable();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_disable();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? generic_smp_call_function_single_interrupt();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? scheduler_wakeup_self_check();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_enable();
> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_enable();
> >
> > I let cpu_idle() check if there is anything to do as your above code.
> >
> > And task_is_polling() handle the others with below patch:
> > ---
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 5255c9d..09f633d 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -527,15 +527,16 @@ void resched_task(struct task_struct *p)
> > ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
> > ?}
> >
> > -void resched_cpu(int cpu)
> > +int resched_cpu(int cpu)
> > ?{
> > ? ? ? ?struct rq *rq = cpu_rq(cpu);
> > ? ? ? ?unsigned long flags;
> >
> > ? ? ? ?if (!raw_spin_trylock_irqsave(&rq->lock, flags))
> > - ? ? ? ? ? ? ? return;
> > + ? ? ? ? ? ? ? return 0;
> > ? ? ? ?resched_task(cpu_curr(cpu));
> > ? ? ? ?raw_spin_unlock_irqrestore(&rq->lock, flags);
> > + ? ? ? return 1;
> > ?}
>
I assume we are talking about 'return from idle' but seems I don't
make it clear.
> Two points -
> rq->lock: I tried something similar first. One hurdle with checking
> task_is_polling() is that you need rq->lock to check it. And adding
> lock+unlock (without wait) in wakeup path ended up being no net gain
> compared to IPI. And when we actually end up spinning on that lock,
> thats going to add overhead in the common path. That is the reason I
> switched to atomic compare exchange and moving any wait onto the
> target CPU coming out of idle.
I see. But actually we will not spinning on that lock because we
use 'trylock' in resched_cpu(). And you are right there is indeed a
little overhead (resched_task()) if we hold the lock but it can be
tolerated IMHO.
BTW, mind showing you test case thus we can collect some common data?
>
> resched_task: ttwu_queue_remote() does not imply that the remote CPU
> will do a resched. Today there is a IPI and IPI handler calls onto
> check_preempt_wakeup() and if the current task has higher precedence
> than the waking up task, then there will be just an activation of new
> task and no resched. Using resched_task above breaks
> check_preempt_wakeup() and always calls a resched on remote CPU after
> the IPI, which would be change in behavior.
Yeah, if the remote cpu is not idle, mine will change the behavior; but
if the remote cpu is idle, it will always rescheduled, right?
So maybe we could introduce resched_idle_cpu() to make things more clear:
int resched_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
int ret = 0;
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
goto out;
if (!idle_cpu(cpu))
goto out_unlock;
resched_task(cpu_curr(cpu));
ret = 1;
out_unlock:
raw_spin_unlock_irqrestore(&rq->lock, flags);
out:
return ret;
}
Thanks,
Yong
>
> Thanks,
> Venki
>
> >
> > ?#ifdef CONFIG_NO_HZ
> > @@ -1484,7 +1485,8 @@ void scheduler_ipi(void)
> >
> > ?static void ttwu_queue_remote(struct task_struct *p, int cpu)
> > ?{
> > - ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
> > + ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list) &&
> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? !resched_cpu(cpu))
> > ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
> > ?}
> >
> > Thought?
> >
> > Thanks,
> > Yong
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Only stand for myself
On Wed, Feb 8, 2012 at 6:18 PM, Yong Zhang <[email protected]> wrote:
> On Wed, Feb 08, 2012 at 03:28:45PM -0800, Venki Pallipadi wrote:
>> On Tue, Feb 7, 2012 at 10:51 PM, Yong Zhang <[email protected]> wrote:
>> > On Mon, Feb 06, 2012 at 12:42:13PM -0800, Venkatesh Pallipadi wrote:
>> >> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
>> >> to target CPU. However, if the target CPU is in mwait based idle, we can
>> >> do IPI-less wakeups using the magical powers of monitor-mwait.
>> >> Doing this has certain advantages:
>> >
>> > Actually I'm trying to do the similar thing on MIPS.
>> >
>> > The difference is that I want task_is_polling() to do something. The basic
>> > idea is:
>> >
>> >> + ? ? ? ? ? ? ? ? ? ? if (ipi_pending()) {
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? clear_ipi_pending();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_disable();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_disable();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? generic_smp_call_function_single_interrupt();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? scheduler_wakeup_self_check();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_irq_enable();
>> >> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? local_bh_enable();
>> >
>> > I let cpu_idle() check if there is anything to do as your above code.
>> >
>> > And task_is_polling() handle the others with below patch:
>> > ---
>> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> > index 5255c9d..09f633d 100644
>> > --- a/kernel/sched/core.c
>> > +++ b/kernel/sched/core.c
>> > @@ -527,15 +527,16 @@ void resched_task(struct task_struct *p)
>> > ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
>> > ?}
>> >
>> > -void resched_cpu(int cpu)
>> > +int resched_cpu(int cpu)
>> > ?{
>> > ? ? ? ?struct rq *rq = cpu_rq(cpu);
>> > ? ? ? ?unsigned long flags;
>> >
>> > ? ? ? ?if (!raw_spin_trylock_irqsave(&rq->lock, flags))
>> > - ? ? ? ? ? ? ? return;
>> > + ? ? ? ? ? ? ? return 0;
>> > ? ? ? ?resched_task(cpu_curr(cpu));
>> > ? ? ? ?raw_spin_unlock_irqrestore(&rq->lock, flags);
>> > + ? ? ? return 1;
>> > ?}
>>
>
> I assume we are talking about 'return from idle' but seems I don't
> make it clear.
>
>> Two points -
>> rq->lock: I tried something similar first. One hurdle with checking
>> task_is_polling() is that you need rq->lock to check it. And adding
>> lock+unlock (without wait) in wakeup path ended up being no net gain
>> compared to IPI. And when we actually end up spinning on that lock,
>> thats going to add overhead in the common path. That is the reason I
>> switched to atomic compare exchange and moving any wait onto the
>> target CPU coming out of idle.
>
> I see. But actually we will not spinning on that lock because we
> use 'trylock' in resched_cpu().
Ahh. Sorry I missed the trylock in there...
> And you are right there is indeed a
> little overhead (resched_task()) if we hold the lock but it can be
> tolerated IMHO.
One advantage I got by using atomic stuff instead of rq->lock was as I
mentioned in the patch description, if 2 CPUs are trying to send IPI
to same target CPU around same time (50-100 us if CPU is in deep
C-state in x86).
>
> BTW, mind showing you test case thus we can collect some common data?
Test case was a silly clock measure around
__smp_call_function_single() with optimization I had in
generic_exec_single(). Attaching the patch I had..
>>
>> resched_task: ttwu_queue_remote() does not imply that the remote CPU
>> will do a resched. Today there is a IPI and IPI handler calls onto
>> check_preempt_wakeup() and if the current task has higher precedence
>> than the waking up task, then there will be just an activation of new
>> task and no resched. Using resched_task above breaks
>> check_preempt_wakeup() and always calls a resched on remote CPU after
>> the IPI, which would be change in behavior.
>
> Yeah, if the remote cpu is not idle, mine will change the behavior; but
> if the remote cpu is idle, it will always rescheduled, right?
>
> So maybe we could introduce resched_idle_cpu() to make things more clear:
>
> int resched_idle_cpu(int cpu)
> {
> ? ? ? ?struct rq *rq = cpu_rq(cpu);
> ? ? ? ?unsigned long flags;
> ? ? ? ?int ret = 0;
>
> ? ? ? ?if (!raw_spin_trylock_irqsave(&rq->lock, flags))
> ? ? ? ? ? ? ? ?goto out;
> ? ? ? ?if (!idle_cpu(cpu))
> ? ? ? ? ? ? ? ?goto out_unlock;
> ? ? ? ?resched_task(cpu_curr(cpu));
> ? ? ? ? ? ? ? ?ret = 1;
> out_unlock:
> ? ? ? ?raw_spin_unlock_irqrestore(&rq->lock, flags);
> out:
> ? ? ? ?return ret;
> }
>
This should likely work. But, if you do want to use similar logic in
smp_call_function() or idle load balance kick etc, you need additional
bit other than need_resched() as there we only need irq+softirq and
not necessarily a resched.
At this time I am not sure how poll wakeup logic works in MIPS. But,
if it is something that is similar to x86 mwait and we can wakeup with
a bit other than TIF_NEED_RESCHED, we can generalize most of the
changes in my RFC and share it across archs.
-Venki
>>
>> >
>> > ?#ifdef CONFIG_NO_HZ
>> > @@ -1484,7 +1485,8 @@ void scheduler_ipi(void)
>> >
>> > ?static void ttwu_queue_remote(struct task_struct *p, int cpu)
>> > ?{
>> > - ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
>> > + ? ? ? if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list) &&
>> > + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? !resched_cpu(cpu))
>> > ? ? ? ? ? ? ? ?smp_send_reschedule(cpu);
>> > ?}
>> >
>> > Thought?
>> >
>> > Thanks,
>> > Yong
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at ?http://www.tux.org/lkml/
>
> --
> Only stand for myself
On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
> to target CPU. However, if the target CPU is in mwait based idle, we can
> do IPI-less wakeups using the magical powers of monitor-mwait.
So I was thinking, why not change native_smp_send_reschedule() and
native_send_call_func_single_ipi() and keep the change entirely inside
the arch?
Ideally its only APIC/idle that know about this detail, the scheduler
(or other consumers) really don't care about how the other cpu comes to
run the callback.
On Fri, Feb 10, 2012 at 11:19 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
>> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
>> to target CPU. However, if the target CPU is in mwait based idle, we can
>> do IPI-less wakeups using the magical powers of monitor-mwait.
>
> So I was thinking, why not change native_smp_send_reschedule() and
> native_send_call_func_single_ipi() and keep the change entirely inside
> the arch?
>
> Ideally its only APIC/idle that know about this detail, the scheduler
> (or other consumers) really don't care about how the other cpu comes to
> run the callback.
>
OK. Moving most of this into arch code will be cleaner. But, Yong
mentioned in this thread that he was looking to do something similar
on MIPS. So, we may end up with some code duplication though..
On Fri, 2012-02-10 at 18:11 -0800, Venki Pallipadi wrote:
> OK. Moving most of this into arch code will be cleaner. But, Yong
> mentioned in this thread that he was looking to do something similar
> on MIPS. So, we may end up with some code duplication though..
If there's significant duplication you can always move a few helpers
into lib/, but I think we shouldn't let this get outside of the PIC and
idle code (both clearly arch specific), all we're doing is optimizing
the IPI delivery, conceptually we're still sending one.
On Thu, Feb 09, 2012 at 06:17:06PM -0800, Venki Pallipadi wrote:
>
> This should likely work. But, if you do want to use similar logic in
> smp_call_function() or idle load balance kick etc, you need additional
> bit other than need_resched() as there we only need irq+softirq and
> not necessarily a resched.
Yeah, putting resched_idle_cpu() into smp_call_function_*() is a bit ugly
though resched could work in the scenario because you have
generic_smp_call_function_single_interrupt() called in cpu_idle();
> At this time I am not sure how poll wakeup logic works in MIPS. But,
A typical example is busy loop:
void __noreturn cpu_idle(void)
{
...
while (1) {
tick_nohz_idle_enter();
rcu_idle_enter();
while (!need_resched() && cpu_online(cpu)) {
;
}
rcu_idle_exit();
tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
}
...
}
> if it is something that is similar to x86 mwait and we can wakeup with
> a bit other than TIF_NEED_RESCHED, we can generalize most of the
> changes in my RFC and share it across archs.
Yeah, there are many things we could share IMHO, especially the hook
in cpu_idle().
Thanks,
Yong
On Fri, Feb 10, 2012 at 06:11:58PM -0800, Venki Pallipadi wrote:
> On Fri, Feb 10, 2012 at 11:19 AM, Peter Zijlstra <[email protected]> wrote:
> > On Mon, 2012-02-06 at 12:42 -0800, Venkatesh Pallipadi wrote:
> >> smp_call_function_single and ttwu_queue_remote sends unconditional IPI
> >> to target CPU. However, if the target CPU is in mwait based idle, we can
> >> do IPI-less wakeups using the magical powers of monitor-mwait.
> >
> > So I was thinking, why not change native_smp_send_reschedule() and
> > native_send_call_func_single_ipi() and keep the change entirely inside
> > the arch?
> >
> > Ideally its only APIC/idle that know about this detail, the scheduler
> > (or other consumers) really don't care about how the other cpu comes to
> > run the callback.
> >
>
> OK. Moving most of this into arch code will be cleaner. But, Yong
> mentioned in this thread that he was looking to do something similar
> on MIPS. So, we may end up with some code duplication though..
Yeah, most of the things could be shared. Such as the added logic when
return from idle and the hooks into ttwu_queue_remote() and
generic_exec_single().
BTW, I think we should also let the arch provide a wrapper for
TIF_IPI_PENDING, such as tsk_ipi_pending().
Thanks,
Yong
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Only stand for myself
On Mon, 2012-02-13 at 13:34 +0800, Yong Zhang wrote:
> Yeah, most of the things could be shared. Such as the added logic when
> return from idle and the hooks into ttwu_queue_remote() and
> generic_exec_single().
>
The thing is, those last two should not have hooks. Nor am I convinced
anything outside of the arch idle loop should have any.
On Tue, Feb 14, 2012 at 02:52:17PM +0100, Peter Zijlstra wrote:
> On Mon, 2012-02-13 at 13:34 +0800, Yong Zhang wrote:
> > Yeah, most of the things could be shared. Such as the added logic when
> > return from idle and the hooks into ttwu_queue_remote() and
> > generic_exec_single().
> >
>
> The thing is, those last two should not have hooks.
Yeah, it's general place; and smp_need_ipi() plays well.
> Nor am I convinced
> anything outside of the arch idle loop should have any.
'idle loop' is the key point :)
Thanks,
Yong
On Tue, Feb 14, 2012 at 5:52 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2012-02-13 at 13:34 +0800, Yong Zhang wrote:
>> Yeah, most of the things could be shared. Such as the added logic when
>> return from idle and the hooks into ttwu_queue_remote() and
>> generic_exec_single().
>>
>
> The thing is, those last two should not have hooks. Nor am I convinced
> anything outside of the arch idle loop should have any.
>
OK. Will put stuff in arch and respin soon..
-Venki