This is the second version of my earlier series [1], which aims at
fixing (or papering over, depending on how you look at things) a
performance regression seen on arm64 for resched IPI heavy workloads
(such as "perf bench sched pipe").
As eloquently described by Thomas in his earlier replies [2], the
current situation is less than ideal on most architectures except x86,
and my conclusion is that what was broken in 5.9 wouldn't be more
broken in 5.10 with these patches (and addresses the performance
regression).
Needless to say, I intend to try and help fixing the issues Thomas
mentioned, and I believe that Mark (cc'd) already has something that
could be used as a healthy starting point (Mark, do correct me if I
misrepresented your work).
Thanks,
M.
* From v1:
- Added a new __irq_modify_status() helper
- Renamed IRQ_NAKED to IRQ_RAW
- Renamed IRQ_HIDDEN to IRQ_IPI
- Applied the same workaround to 32bit ARM for completeness
[1] https://lore.kernel.org/r/[email protected]/
[2] https://lore.kernel.org/r/[email protected]/
Marc Zyngier (6):
genirq: Add __irq_modify_status() helper to clear/set special flags
genirq: Allow an interrupt to be marked as 'raw'
arm64: Mark the rescheduling IPI as raw interrupt
arm: Mark the rescheduling IPI as raw interrupt
genirq: Drop IRQ_HIDDEN from IRQF_MODIFY_MASK
genirq: Rename IRQ_HIDDEN to IRQ_IPI
arch/arm/Kconfig | 1 +
arch/arm/kernel/smp.c | 6 +++++-
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/smp.c | 6 +++++-
include/linux/irq.h | 11 ++++++++---
kernel/irq/Kconfig | 3 +++
kernel/irq/chip.c | 12 ++++++++++--
kernel/irq/debugfs.c | 3 ++-
kernel/irq/irqdesc.c | 17 ++++++++++++-----
kernel/irq/proc.c | 2 +-
kernel/irq/settings.h | 33 +++++++++++++++++++++++++++------
11 files changed, 75 insertions(+), 20 deletions(-)
--
2.28.0
Some arch-specific flags need to be set/cleared, but not exposed to
random device drivers. Introduce a new helper (__irq_modify_status())
that takes an arbitrary mask, and rewrite irq_modify_status() to use
this new helper.
No functional change.
Signed-off-by: Marc Zyngier <[email protected]>
---
include/linux/irq.h | 3 +++
kernel/irq/chip.c | 12 ++++++++++--
kernel/irq/settings.h | 10 ++++++++--
3 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c54365309e97..c55f218d5b61 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -751,6 +751,9 @@ void
irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
void *data);
+void __irq_modify_status(unsigned int irq, unsigned long clr,
+ unsigned long set, unsigned long mask);
+
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b9b9618e1aca..85176712a484 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1107,7 +1107,8 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
}
EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
-void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
+void __irq_modify_status(unsigned int irq, unsigned long clr,
+ unsigned long set, unsigned long mask)
{
unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -1121,7 +1122,9 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
*/
WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
- irq_settings_clr_and_set(desc, clr, set);
+ /* Warn when trying to clear or set a bit disallowed by the mask */
+ WARN_ON((clr | set) & ~mask);
+ __irq_settings_clr_and_set(desc, clr, set, mask);
trigger = irqd_get_trigger_type(&desc->irq_data);
@@ -1144,6 +1147,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_put_desc_unlock(desc, flags);
}
+
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
+{
+ __irq_modify_status(irq, clr, set, _IRQF_MODIFY_MASK);
+}
EXPORT_SYMBOL_GPL(irq_modify_status);
/**
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 403378b9947b..51acdf43eadc 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -36,11 +36,17 @@ enum {
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
+static inline void
+__irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set, u32 mask)
+{
+ desc->status_use_accessors &= ~(clr & mask);
+ desc->status_use_accessors |= (set & mask);
+}
+
static inline void
irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
{
- desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
- desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
+ __irq_settings_clr_and_set(desc, clr, set, _IRQF_MODIFY_MASK);
}
static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
--
2.28.0
IRQ_HIDDEN is hardly a flag generic code should use, so let's
drop it from IRQF_MODIFY_MASK. In turn, update both arm and arm64
to use __irq_modify_status() when setting this flag.
Signed-off-by: Marc Zyngier <[email protected]>
---
arch/arm/kernel/smp.c | 2 +-
arch/arm64/kernel/smp.c | 2 +-
include/linux/irq.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 0e09c8320caf..dc746f808400 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -737,7 +737,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
WARN_ON(err);
ipi_desc[i] = irq_to_desc(ipi_base + i);
- irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
+ __irq_modify_status(ipi_base + i, 0, IRQ_HIDDEN, ~0);
/* The recheduling IPI is special... */
if (i == IPI_RESCHEDULE)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index bad51f7f7ffe..684f41a3ba58 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -994,7 +994,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
WARN_ON(err);
ipi_desc[i] = irq_to_desc(ipi_base + i);
- irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
+ __irq_modify_status(ipi_base + i, 0, IRQ_HIDDEN, ~0);
/* The recheduling IPI is special... */
if (i == IPI_RESCHEDULE)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 605ba5949255..0e71227fd3ec 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -107,7 +107,7 @@ enum {
(IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
- IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN)
+ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
#define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
--
2.28.0
Some interrupts (such as the rescheduling IPI) rely on not going through
the irq_enter()/irq_exit() calls. To distinguish such interrupts, add
a new IRQ flag that allows the low-level handling code to sidestep the
enter()/exit() calls.
Only the architecture code is expected to use this. It will do the wrong
thing on normal interrupts. Note that this is a band-aid until we can
move to some more correct infrastructure (such as kernel/entry/common.c).
Signed-off-by: Marc Zyngier <[email protected]>
---
include/linux/irq.h | 2 ++
kernel/irq/Kconfig | 3 +++
kernel/irq/debugfs.c | 1 +
kernel/irq/irqdesc.c | 17 ++++++++++++-----
kernel/irq/settings.h | 15 +++++++++++++++
5 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c55f218d5b61..605ba5949255 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -72,6 +72,7 @@ enum irqchip_irq_state;
* mechanism and from core side polling.
* IRQ_DISABLE_UNLAZY - Disable lazy irq disable
* IRQ_HIDDEN - Don't show up in /proc/interrupts
+ * IRQ_RAW - Skip tick management and irqtime accounting
*/
enum {
IRQ_TYPE_NONE = 0x00000000,
@@ -99,6 +100,7 @@ enum {
IRQ_IS_POLLED = (1 << 18),
IRQ_DISABLE_UNLAZY = (1 << 19),
IRQ_HIDDEN = (1 << 20),
+ IRQ_RAW = (1 << 21),
};
#define IRQF_MODIFY_MASK \
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 164a031cfdb6..ae9b13d5ee91 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -109,6 +109,9 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
config GENERIC_IRQ_RESERVATION_MODE
bool
+config ARCH_WANTS_IRQ_RAW
+ bool
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..f53475d88072 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -140,6 +140,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
BIT_MASK_DESCR(_IRQ_IS_POLLED),
BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
BIT_MASK_DESCR(_IRQ_HIDDEN),
+ BIT_MASK_DESCR(_IRQ_RAW),
};
static const struct irq_bit_descr irqdesc_istates[] = {
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..f5beee546a6f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -667,10 +667,9 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
{
struct pt_regs *old_regs = set_irq_regs(regs);
unsigned int irq = hwirq;
+ struct irq_desc *desc;
int ret = 0;
- irq_enter();
-
#ifdef CONFIG_IRQ_DOMAIN
if (lookup)
irq = irq_find_mapping(domain, hwirq);
@@ -680,14 +679,22 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
* Some hardware gives randomly wrong interrupts. Rather
* than crashing, do something sensible.
*/
- if (unlikely(!irq || irq >= nr_irqs)) {
+ if (unlikely(!irq || irq >= nr_irqs || !(desc = irq_to_desc(irq)))) {
ack_bad_irq(irq);
ret = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW) &&
+ unlikely(irq_settings_is_raw(desc))) {
+ generic_handle_irq_desc(desc);
} else {
- generic_handle_irq(irq);
+ irq_enter();
+ generic_handle_irq_desc(desc);
+ irq_exit();
}
- irq_exit();
+out:
set_irq_regs(old_regs);
return ret;
}
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 51acdf43eadc..0033d459fdac 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -18,6 +18,7 @@ enum {
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
_IRQ_HIDDEN = IRQ_HIDDEN,
+ _IRQ_RAW = IRQ_RAW,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -33,6 +34,7 @@ enum {
#define IRQ_IS_POLLED GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
#define IRQ_HIDDEN GOT_YOU_MORON
+#define IRQ_RAW GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -180,3 +182,16 @@ static inline bool irq_settings_is_hidden(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_HIDDEN;
}
+
+static inline bool irq_settings_is_raw(struct irq_desc *desc)
+{
+ if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW))
+ return desc->status_use_accessors & _IRQ_RAW;
+
+ /*
+ * Using IRQ_RAW on architectures that don't expect it is
+ * likely to be wrong.
+ */
+ WARN_ON_ONCE(1);
+ return false;
+}
--
2.28.0
On Tue, Nov 24, 2020 at 02:14:45PM +0000, Marc Zyngier wrote:
> Some interrupts (such as the rescheduling IPI) rely on not going through
> the irq_enter()/irq_exit() calls. To distinguish such interrupts, add
> a new IRQ flag that allows the low-level handling code to sidestep the
> enter()/exit() calls.
Well, not quite. The scheduler_ipi() function is perfectly fine being
called with irq_enter/irq_exit. As per this very series, that's your
current reality.
The function just doesn't need it.
Hi Marc,
On 24/11/20 14:14, Marc Zyngier wrote:
> @@ -680,14 +679,22 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
> * Some hardware gives randomly wrong interrupts. Rather
> * than crashing, do something sensible.
> */
> - if (unlikely(!irq || irq >= nr_irqs)) {
> + if (unlikely(!irq || irq >= nr_irqs || !(desc = irq_to_desc(irq)))) {
> ack_bad_irq(irq);
> ret = -EINVAL;
> + goto out;
> + }
> +
> + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW) &&
> + unlikely(irq_settings_is_raw(desc))) {
> + generic_handle_irq_desc(desc);
If I got the RCU bits right from what Thomas mentioned in
https://lore.kernel.org/r/[email protected]
https://lore.kernel.org/r/[email protected]
then we're still missing something to notify RCU in the case the IRQ hits
the idle task. All I see on our entry path is
trace_hardirqs_off();
...
irq_handler()
handle_domain_irq();
...
trace_hardirqs_on();
so we do currently rely on handle_domain_irq()'s irq_enter() + irq_exit()
for that. rcu_irq_enter() says CONFIG_RCU_EQS_DEBUG=y can detect missing
bits, but I don't get any warnings with your series on my Juno.
Now, irq_enter() gives us:
rcu_irq_enter();
irq_enter_rcu()
raise_softirq faffery;
__irq_enter()
irqtime accounting;
preempt count + lockdep; } __irq_enter_raw()
Looking at irqentry_enter() + DEFINE_IDTENTRY_SYSVEC_SIMPLE(), I *think* we
would be fine with just:
rcu_irq_enter();
__irq_enter_raw();
generic_handle_irq_desc()
__irq_exit_raw();
rcu_irq_exit();
I tested that and it didn't explode (though I haven't managed to make
CONFIG_RCU_EQS_DEBUG squeal). Also please note RCU isn't my forte, so the
above may contain traces of bullcrap.
> } else {
> - generic_handle_irq(irq);
> + irq_enter();
> + generic_handle_irq_desc(desc);
> + irq_exit();
> }
>
> - irq_exit();
> +out:
> set_irq_regs(old_regs);
> return ret;
> }
[...]
> @@ -180,3 +182,16 @@ static inline bool irq_settings_is_hidden(struct irq_desc *desc)
> {
> return desc->status_use_accessors & _IRQ_HIDDEN;
> }
> +
> +static inline bool irq_settings_is_raw(struct irq_desc *desc)
> +{
> + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW))
> + return desc->status_use_accessors & _IRQ_RAW;
> +
> + /*
> + * Using IRQ_RAW on architectures that don't expect it is
> + * likely to be wrong.
> + */
> + WARN_ON_ONCE(1);
Per __handle_domain_irq()'s short-circuit evaluation, this is only entered
when the above config is enabled. Perhaps a better place to check for this
would be in __irq_settings_clr_and_set().
> + return false;
> +}
On Thu, Nov 26, 2020 at 06:18:33PM +0000, Valentin Schneider wrote:
> If I got the RCU bits right from what Thomas mentioned in
>
> https://lore.kernel.org/r/[email protected]
> https://lore.kernel.org/r/[email protected]
>
> then we're still missing something to notify RCU in the case the IRQ hits
> the idle task. All I see on our entry path is
>
> trace_hardirqs_off();
> ...
> irq_handler()
> handle_domain_irq();
> ...
> trace_hardirqs_on();
>
> so we do currently rely on handle_domain_irq()'s irq_enter() + irq_exit()
> for that. rcu_irq_enter() says CONFIG_RCU_EQS_DEBUG=y can detect missing
> bits, but I don't get any warnings with your series on my Juno.
The scheduler IPI really doesn't need RCU either ;-)
On 03/12/20 13:03, Peter Zijlstra wrote:
> On Thu, Nov 26, 2020 at 06:18:33PM +0000, Valentin Schneider wrote:
>> If I got the RCU bits right from what Thomas mentioned in
>>
>> https://lore.kernel.org/r/[email protected]
>> https://lore.kernel.org/r/[email protected]
>>
>> then we're still missing something to notify RCU in the case the IRQ hits
>> the idle task. All I see on our entry path is
>>
>> trace_hardirqs_off();
>> ...
>> irq_handler()
>> handle_domain_irq();
>> ...
>> trace_hardirqs_on();
>>
>> so we do currently rely on handle_domain_irq()'s irq_enter() + irq_exit()
>> for that. rcu_irq_enter() says CONFIG_RCU_EQS_DEBUG=y can detect missing
>> bits, but I don't get any warnings with your series on my Juno.
>
> The scheduler IPI really doesn't need RCU either ;-)
Because it doesn't enter any new read-side section, right?
But as with any other interrupt, we could then go through:
preempt_schedule_irq() ~> pick_next_task_fair() -> newidle_balance()
which does enter a read-side section, so RCU would need to be
watching. Looking at kernel/entry/common.c:irqentry_exit_cond_resched(), it
seems we do check for this via rcu_irq_exit_check_preempt().
I however cannot grok why irqentry_exit() *doesn't* call into
preempt_schedule_irq() if RCU wasn't watching on IRQ entry, so I'm starting
to question everything (again).
On 03/12/20 15:52, Valentin Schneider wrote:
> On 03/12/20 13:03, Peter Zijlstra wrote:
[...]
>> The scheduler IPI really doesn't need RCU either ;-)
[...]
> But as with any other interrupt, we could then go through:
>
> preempt_schedule_irq() ~> pick_next_task_fair() -> newidle_balance()
>
> which does enter a read-side section, so RCU would need to be
> watching. Looking at kernel/entry/common.c:irqentry_exit_cond_resched(), it
> seems we do check for this via rcu_irq_exit_check_preempt().
>
> I however cannot grok why irqentry_exit() *doesn't* call into
> preempt_schedule_irq() if RCU wasn't watching on IRQ entry
RCU wasn't watching on IRQ entry:
-> we should be on the idle task
-> no involuntary preemption for the idle task, scheduling always happens
at the tail of the idle loop
-> ignore what I've been saying, current patch is fine
Hi Marc,
On Tue, Nov 24, 2020 at 02:14:45PM +0000, Marc Zyngier wrote:
> Some interrupts (such as the rescheduling IPI) rely on not going through
> the irq_enter()/irq_exit() calls. To distinguish such interrupts, add
> a new IRQ flag that allows the low-level handling code to sidestep the
> enter()/exit() calls.
>
> Only the architecture code is expected to use this. It will do the wrong
> thing on normal interrupts. Note that this is a band-aid until we can
> move to some more correct infrastructure (such as kernel/entry/common.c).
>
> Signed-off-by: Marc Zyngier <[email protected]>
> ---
> include/linux/irq.h | 2 ++
> kernel/irq/Kconfig | 3 +++
> kernel/irq/debugfs.c | 1 +
> kernel/irq/irqdesc.c | 17 ++++++++++++-----
> kernel/irq/settings.h | 15 +++++++++++++++
> 5 files changed, 33 insertions(+), 5 deletions(-)
[...]
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 1a7723604399..f5beee546a6f 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -667,10 +667,9 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
> {
> struct pt_regs *old_regs = set_irq_regs(regs);
> unsigned int irq = hwirq;
> + struct irq_desc *desc;
> int ret = 0;
>
> - irq_enter();
> -
> #ifdef CONFIG_IRQ_DOMAIN
> if (lookup)
> irq = irq_find_mapping(domain, hwirq);
> @@ -680,14 +679,22 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
> * Some hardware gives randomly wrong interrupts. Rather
> * than crashing, do something sensible.
> */
> - if (unlikely(!irq || irq >= nr_irqs)) {
> + if (unlikely(!irq || irq >= nr_irqs || !(desc = irq_to_desc(irq)))) {
> ack_bad_irq(irq);
> ret = -EINVAL;
> + goto out;
> + }
> +
> + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW) &&
> + unlikely(irq_settings_is_raw(desc))) {
> + generic_handle_irq_desc(desc);
Based on tglx's previous comments, I was expecting to see calls to
__irq_{enter,exit}_raw() around this. Are they hiding somewhere else or
are they not needed?
Will
Hi Marc,
I plan to add NMI patches which enables IPI_CPU_CRASH_STOP IPI as pseudo-NMI[1].
But I know I need to resolve the instrumentation issues before that. I think I need to move the arm64 entry code over to the generic entry code (kernel/entry/common.c) for that; is this right?
Can you tell me the current status?
Let me know if there's anything I can do to help.
[1]https://lore.kernel.org/lkml/[email protected]/
Thanks,
Yuichi Ito
On 2021-03-01 00:39, [email protected] wrote:
> Hi Marc,
>
> I plan to add NMI patches which enables IPI_CPU_CRASH_STOP IPI as
> pseudo-NMI[1].
> But I know need to resolve the instrumentation issues before that. I
> think need to moving arm64 entry code over to the generic entry
> code(kernel/entry/common.c) for that, is this right?
>
> Can you tell me current status?
> Let me know if there's anything I can do to help.
Mark is working on this, I believe. I'll let him comment on the current
state of things.
Thanks,
M.
--
Jazz is not dead. It just smells funny...
Hi Marc, Mark
> Mark is working on this, I believe. I'll let him comment on the current
> state of things.
I understand.
Mark, could you tell me the current state?
Thanks,
Yuichi Ito
Hello All,
We are seeing significant improvements in the time it takes for a task to be
woken up on an idle cpu with these patches.
A trace output without the patches:
<<< 96uS total cost: cpu 1 wakes up rt-app task on cpu 2 >>>
rt-app-955 [001] 149.387611: sched_wakeup_new:
comm=rt-app pid=957 prio=120 target_cpu=002
rt-app-955 [001] 149.387616: ipi_raise:
target_mask=00000000,00000004 (Rescheduling interrupts)
<idle>-0 [002] 149.387622: cpu_idle:
state=4294967295 cpu_id=2
<idle>-0 [002] 149.387640: irq_handler_entry: irq=1
name=IPI
<idle>-0 [002] 149.387643: ipi_entry: (Rescheduling
interrupts)
<idle>-0 [002] 149.387646: ipi_exit: (Rescheduling
interrupts)
<idle>-0 [002] 149.387648: irq_handler_exit: irq=1
ret=handled
<idle>-0 [002] 149.387707: sched_switch:
prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=R ==>
next_comm=rt-app next_pid=957 next_prio=120
With the patches.
<<< 68uS total cost: cpu 1 wakes up T0 on cpu 3 >>>
rt-app-956 [001] 28.034953: sched_wakeup_new:
comm=rt-app pid=958 prio=120 target_cpu=003
rt-app-956 [001] 28.034958: ipi_raise:
target_mask=00000000,00000008 (Rescheduling interrupts)
<idle>-0 [003] 28.034964: cpu_idle:
state=4294967295 cpu_id=3
<idle>-0 [003] 28.034970: irq_handler_entry: irq=1
name=IPI
<idle>-0 [003] 28.034974: ipi_entry: (Rescheduling
interrupts)
<idle>-0 [003] 28.034977: ipi_exit: (Rescheduling
interrupts)
<idle>-0 [003] 28.034979: irq_handler_exit: irq=1
ret=handled
<idle>-0 [003] 28.035021: sched_switch:
prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==>
next_comm=rt-app next_pid=958 next_prio=120
This was taken on a snapdragon device similar to 8350. This patch
series helps in reducing the wakeup latency on idle cpus and thereby
increases performance KPIs on various benchmarks.
Sent this data in hopes that we resurrect the discussion and get these
fixes in.
Thanks,
Abhijeet
On 11/24/2020 6:14 AM, Marc Zyngier wrote:
> This is the second version of my earlier series [1], which aims at
> fixing (or papering over, depending on how you look at things) a
> performance regression seen on arm64 for reched IPI heavy workloads
> (such as "perf bench sched pipe").
>
> As eloquently described by Thomas in his earlier replies [2], the
> current situation is less than ideal on most architecture except x86,
> and my conclusion is that what was broken in 5.9 wouldn't be more
> broken in 5.10 with these patches (and addresses the performance
> regression).
>
> Needless to say, I intend to try and help fixing the issues Thomas
> mentioned, and I believe that Mark (cc'd) already has something that
> could be used as a healthy starting point (Mark, do correct me if I
> misrepresented your work).
>
> Thanks,
>
> M.
>
> * From v1:
> - Added a new __irq_modify_status() helper
> - Renamed IRQ_NAKED to IRQ_RAW
> - Renamed IRQ_HIDDEN to IRQ_IPI
> - Applied the same workaround to 32bit ARM for completeness
>
> [1] https://lore.kernel.org/r/[email protected]/
> [2] https://lore.kernel.org/r/[email protected]/
>
> Marc Zyngier (6):
> genirq: Add __irq_modify_status() helper to clear/set special flags
> genirq: Allow an interrupt to be marked as 'raw'
> arm64: Mark the recheduling IPI as raw interrupt
> arm: Mark the recheduling IPI as raw interrupt
> genirq: Drop IRQ_HIDDEN from IRQF_MODIFY_MASK
> genirq: Rename IRQ_HIDDEN to IRQ_IPI
>
> arch/arm/Kconfig | 1 +
> arch/arm/kernel/smp.c | 6 +++++-
> arch/arm64/Kconfig | 1 +
> arch/arm64/kernel/smp.c | 6 +++++-
> include/linux/irq.h | 11 ++++++++---
> kernel/irq/Kconfig | 3 +++
> kernel/irq/chip.c | 12 ++++++++++--
> kernel/irq/debugfs.c | 3 ++-
> kernel/irq/irqdesc.c | 17 ++++++++++++-----
> kernel/irq/proc.c | 2 +-
> kernel/irq/settings.h | 33 +++++++++++++++++++++++++++------
> 11 files changed, 75 insertions(+), 20 deletions(-)
>
On Tue, Nov 24, 2020 at 6:15 AM Marc Zyngier <[email protected]> wrote:
>
> Some interrupts (such as the rescheduling IPI) rely on not going through
> the irq_enter()/irq_exit() calls. To distinguish such interrupts, add
> a new IRQ flag that allows the low-level handling code to sidestep the
> enter()/exit() calls.
>
> Only the architecture code is expected to use this. It will do the wrong
> thing on normal interrupts. Note that this is a band-aid until we can
> move to some more correct infrastructure (such as kernel/entry/common.c).
>
> Signed-off-by: Marc Zyngier <[email protected]>
> ---
> include/linux/irq.h | 2 ++
> kernel/irq/Kconfig | 3 +++
> kernel/irq/debugfs.c | 1 +
> kernel/irq/irqdesc.c | 17 ++++++++++++-----
> kernel/irq/settings.h | 15 +++++++++++++++
> 5 files changed, 33 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c55f218d5b61..605ba5949255 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -72,6 +72,7 @@ enum irqchip_irq_state;
> * mechanism and from core side polling.
> * IRQ_DISABLE_UNLAZY - Disable lazy irq disable
> * IRQ_HIDDEN - Don't show up in /proc/interrupts
> + * IRQ_RAW - Skip tick management and irqtime accounting
> */
> enum {
> IRQ_TYPE_NONE = 0x00000000,
> @@ -99,6 +100,7 @@ enum {
> IRQ_IS_POLLED = (1 << 18),
> IRQ_DISABLE_UNLAZY = (1 << 19),
> IRQ_HIDDEN = (1 << 20),
> + IRQ_RAW = (1 << 21),
> };
>
> #define IRQF_MODIFY_MASK \
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index 164a031cfdb6..ae9b13d5ee91 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -109,6 +109,9 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
> config GENERIC_IRQ_RESERVATION_MODE
> bool
>
> +config ARCH_WANTS_IRQ_RAW
> + bool
> +
> # Support forced irq threading
> config IRQ_FORCED_THREADING
> bool
> diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
> index e4cff358b437..f53475d88072 100644
> --- a/kernel/irq/debugfs.c
> +++ b/kernel/irq/debugfs.c
> @@ -140,6 +140,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
> BIT_MASK_DESCR(_IRQ_IS_POLLED),
> BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
> BIT_MASK_DESCR(_IRQ_HIDDEN),
> + BIT_MASK_DESCR(_IRQ_RAW),
> };
>
> static const struct irq_bit_descr irqdesc_istates[] = {
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 1a7723604399..f5beee546a6f 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -667,10 +667,9 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
> {
> struct pt_regs *old_regs = set_irq_regs(regs);
> unsigned int irq = hwirq;
> + struct irq_desc *desc;
> int ret = 0;
>
> - irq_enter();
> -
> #ifdef CONFIG_IRQ_DOMAIN
> if (lookup)
> irq = irq_find_mapping(domain, hwirq);
> @@ -680,14 +679,22 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
> * Some hardware gives randomly wrong interrupts. Rather
> * than crashing, do something sensible.
> */
> - if (unlikely(!irq || irq >= nr_irqs)) {
> + if (unlikely(!irq || irq >= nr_irqs || !(desc = irq_to_desc(irq)))) {
I see a checkpatch error here:
ERROR:ASSIGN_IN_IF: do not use assignment in if condition
#96: FILE: kernel/irq/irqdesc.c:682:
> ack_bad_irq(irq);
> ret = -EINVAL;
> + goto out;
> + }
> +
> + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW) &&
> + unlikely(irq_settings_is_raw(desc))) {
> + generic_handle_irq_desc(desc);
> } else {
> - generic_handle_irq(irq);
> + irq_enter();
> + generic_handle_irq_desc(desc);
> + irq_exit();
> }
>
> - irq_exit();
> +out:
> set_irq_regs(old_regs);
> return ret;
> }
> diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
> index 51acdf43eadc..0033d459fdac 100644
> --- a/kernel/irq/settings.h
> +++ b/kernel/irq/settings.h
> @@ -18,6 +18,7 @@ enum {
> _IRQ_IS_POLLED = IRQ_IS_POLLED,
> _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
> _IRQ_HIDDEN = IRQ_HIDDEN,
> + _IRQ_RAW = IRQ_RAW,
> _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
> };
>
> @@ -33,6 +34,7 @@ enum {
> #define IRQ_IS_POLLED GOT_YOU_MORON
> #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
> #define IRQ_HIDDEN GOT_YOU_MORON
> +#define IRQ_RAW GOT_YOU_MORON
> #undef IRQF_MODIFY_MASK
> #define IRQF_MODIFY_MASK GOT_YOU_MORON
>
> @@ -180,3 +182,16 @@ static inline bool irq_settings_is_hidden(struct irq_desc *desc)
> {
> return desc->status_use_accessors & _IRQ_HIDDEN;
> }
> +
> +static inline bool irq_settings_is_raw(struct irq_desc *desc)
> +{
> + if (IS_ENABLED(CONFIG_ARCH_WANTS_IRQ_RAW))
> + return desc->status_use_accessors & _IRQ_RAW;
> +
> + /*
> + * Using IRQ_RAW on architectures that don't expect it is
> + * likely to be wrong.
> + */
> + WARN_ON_ONCE(1);
> + return false;
> +}
> --
> 2.28.0
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>