Patches 1 and 2 are the fix, patch 3 is a cleanup, and patch 4 is an extra
optimization (RFC).
Thanks to Paul, who confirmed my doubts about ordering on cmpxchg() failure.
Frederic Weisbecker (4):
irq_work: Convert flags to atomic_t
irq_work: Fix irq_work_claim() ordering
irq_work: Slightly simplify IRQ_WORK_PENDING clearing
irq_work: Weaken ordering in irq_work_run_list()
include/linux/irq_work.h | 10 +++++++---
kernel/irq_work.c | 35 ++++++++++++++---------------------
kernel/printk/printk.c | 2 +-
3 files changed, 22 insertions(+), 25 deletions(-)
--
2.23.0
(This patch is an RFC because it makes the code less clear in favour of an
ordering optimization, and that ordering has yet to pass under careful eyes.
I'm not sure it's worth it.)
Clearing IRQ_WORK_PENDING with atomic_fetch_andnot() gives us the old value
of flags, which we can reuse in the later cmpxchg() to clear IRQ_WORK_BUSY
if necessary.
However, there is no need to read flags atomically here: its value can't be
concurrently changed until we clear the IRQ_WORK_PENDING bit. So we can
safely read it before the call to atomic_fetch_andnot(), which can then
become atomic_andnot() followed by smp_mb__after_atomic(). This preserves
the ordering that makes sure we see the latest updates preceding an
irq_work_claim() that didn't raise a new IPI because it observed the work
already queued.
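To illustrate, here is a rough sketch of the two paths whose pairing is at
stake; "work_data" is a made-up stand-in for whatever the queueing side
publishes before claiming the work, this is not the actual kernel code:

	/* Queueing CPU: irq_work_claim() */
	work_data = ...;				/* stores the handler must see */
	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* fully ordered */
	if (oflags & IRQ_WORK_PENDING)
		return false;				/* already queued, no new IPI */

	/* Destination CPU: irq_work_run_list(), after this patch */
	atomic_andnot(IRQ_WORK_PENDING, &work->flags);
	smp_mb__after_atomic();				/* pairs with the fully ordered RMW above */
	work->func(work);				/* expected to observe work_data */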
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
kernel/irq_work.c | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 49c53f80a13a..b709ab05cbfd 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -34,8 +34,8 @@ static bool irq_work_claim(struct irq_work *work)
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
- * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
- * everything we did before is visible.
+ * The pairing atomic_andnot() followed by a barrier in irq_work_run()
+ * makes sure everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
return false;
@@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
- int flags;
+ int flags = atomic_read(&work->flags);
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -151,14 +151,16 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+ atomic_andnot(IRQ_WORK_PENDING, &work->flags);
+ smp_mb__after_atomic();
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->flags, flags & ~IRQ_WORK_PENDING,
+ flags & ~IRQ_WORK_CLAIMED);
}
}
--
2.23.0
When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just
return and don't raise a new IPI. We expect the destination to see
and handle our latest updates thanks to the pairing atomic_xchg()
in irq_work_run_list().
But cmpxchg() doesn't guarantee a full memory barrier upon failure. So
it's possible that the destination misses our latest updates.
So use atomic_fetch_or() instead: it is unconditionally fully ordered,
does exactly what we want here, and simplifies the code.
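To illustrate the problem, a simplified sketch of the old claiming sequence;
"work_data" is a made-up placeholder for whatever the claimer writes before
queueing, not actual kernel code:

	/* Queueing CPU, old irq_work_claim() */
	work_data = ...;			/* must become visible to the handler */
	oflags = atomic_cmpxchg(&work->flags, flags, nflags);
	if (oflags & IRQ_WORK_PENDING)
		return false;			/* cmpxchg() failed: no full barrier is
						   implied on this path, so the running
						   CPU may execute work->func() without
						   seeing work_data */

atomic_fetch_or() always performs the fully ordered RMW, so the ordering
holds on both the "raise the IPI" and "already pending" paths.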
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
kernel/irq_work.c | 22 +++++++---------------
1 file changed, 7 insertions(+), 15 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index df0dbf4d859b..255454a48346 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
*/
static bool irq_work_claim(struct irq_work *work)
{
- int flags, oflags, nflags;
+ int oflags;
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
- * Start with our best wish as a premise but only trust any
- * flag value after cmpxchg() result.
+ * If the work is already pending, no need to raise the IPI.
+ * The pairing atomic_xchg() in irq_work_run() makes sure
+ * everything we did before is visible.
*/
- flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
- for (;;) {
- nflags = flags | IRQ_WORK_CLAIMED;
- oflags = atomic_cmpxchg(&work->flags, flags, nflags);
- if (oflags == flags)
- break;
- if (oflags & IRQ_WORK_PENDING)
- return false;
- flags = oflags;
- cpu_relax();
- }
-
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
return true;
}
--
2.23.0
We need to convert flags to atomic_t in order to later fix an ordering
issue on cmpxchg() failure. This will allow us to use atomic_fetch_or().
Also, this clarifies the nature of those flags.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
include/linux/irq_work.h | 10 +++++++---
kernel/irq_work.c | 18 +++++++++---------
kernel/printk/printk.c | 2 +-
3 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index b11fcdfd0770..02da997ad12c 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -22,7 +22,7 @@
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
struct irq_work {
- unsigned long flags;
+ atomic_t flags;
struct llist_node llnode;
void (*func)(struct irq_work *);
};
@@ -30,11 +30,15 @@ struct irq_work {
static inline
void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
- work->flags = 0;
+ atomic_set(&work->flags, 0);
work->func = func;
}
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
+#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \
+ .flags = ATOMIC_INIT(0), \
+ .func = (_f) \
+}
+
bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index d42acaf81886..df0dbf4d859b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, oflags, nflags;
+ int flags, oflags, nflags;
/*
* Start with our best wish as a premise but only trust any
* flag value after cmpxchg() result.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
+ flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
for (;;) {
nflags = flags | IRQ_WORK_CLAIMED;
- oflags = cmpxchg(&work->flags, flags, nflags);
+ oflags = atomic_cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
if (oflags & IRQ_WORK_PENDING)
@@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void)
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
+ if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
@@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
- unsigned long flags;
+ int flags;
BUG_ON(!irqs_disabled());
@@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
- xchg(&work->flags, flags);
+ flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
+ atomic_xchg(&work->flags, flags);
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
@@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
- while (work->flags & IRQ_WORK_BUSY)
+ while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327a6de8..865727373a3b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
+ .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
};
void wake_up_klogd(void)
--
2.23.0
Instead of fetching the value of flags and performing an xchg() to clear
a bit, just use atomic_fetch_andnot(), which is better suited to doing that
job in one operation while keeping full ordering.
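In other words, the loop body goes from two operations to one (a sketch only,
see the diff below for the actual change):

	/* Before: read, mask, then a separate fully ordered xchg() */
	flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
	atomic_xchg(&work->flags, flags);

	/* After: one fully ordered RMW; note that flags now holds the old
	 * value, i.e. with IRQ_WORK_PENDING still set */
	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);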
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
kernel/irq_work.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 255454a48346..49c53f80a13a 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work)
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
- * The pairing atomic_xchg() in irq_work_run() makes sure
+ * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
@@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
- int flags;
BUG_ON(!irqs_disabled());
@@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
+ int flags;
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
- atomic_xchg(&work->flags, flags);
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
work->func(work);
/*
--
2.23.0
* Frederic Weisbecker <[email protected]> wrote:
> When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just
> return and don't raise a new IPI. We expect the destination to see
> and handle our latest updates thanks to the pairing atomic_xchg()
> in irq_work_run_list().
>
> But cmpxchg() doesn't guarantee a full memory barrier upon failure. So
> it's possible that the destination misses our latest updates.
>
> So use atomic_fetch_or() instead: it is unconditionally fully ordered,
> does exactly what we want here, and simplifies the code.
Just curious, how was this bug found - in the wild, or via code review?
Thanks,
Ingo
* Frederic Weisbecker <[email protected]> wrote:
> We need to convert flags to atomic_t in order to later fix an ordering
> issue on cmpxchg() failure. This will allow us to use atomic_fetch_or().
> Also, this clarifies the nature of those flags.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> ---
> include/linux/irq_work.h | 10 +++++++---
> kernel/irq_work.c | 18 +++++++++---------
> kernel/printk/printk.c | 2 +-
> 3 files changed, 17 insertions(+), 13 deletions(-)
You missed the irq_work use in kernel/bpf/stackmap.c - see the fix below.
Thanks,
Ingo
===>
kernel/bpf/stackmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index dcfe2d37ad15..23cf27ce0491 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
if (in_nmi()) {
work = this_cpu_ptr(&up_read_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
On Fri, Nov 08, 2019 at 05:08:58PM +0100, Frederic Weisbecker wrote:
> diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> index 49c53f80a13a..b709ab05cbfd 100644
> --- a/kernel/irq_work.c
> +++ b/kernel/irq_work.c
> @@ -34,8 +34,8 @@ static bool irq_work_claim(struct irq_work *work)
> oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
> /*
> * If the work is already pending, no need to raise the IPI.
> + * The pairing atomic_andnot() followed by a barrier in irq_work_run()
> + * makes sure everything we did before is visible.
> */
> if (oflags & IRQ_WORK_PENDING)
> return false;
> @@ -151,14 +151,16 @@ static void irq_work_run_list(struct llist_head *list)
> * to claim that work don't rely on us to handle their data
> * while we are in the middle of the func.
> */
> - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
> + atomic_andnot(IRQ_WORK_PENDING, &work->flags);
> + smp_mb__after_atomic();
I think I'd prefer you use:
flags = atomic_fetch_andnot_acquire(IRQ_WORK_PENDING, &work->flags);
Also, I'm cursing at myself for the horrible comments here.
> work->func(work);
> /*
> * Clear the BUSY bit and return to the free state if
> * no-one else claimed it meanwhile.
> */
> - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
> + (void)atomic_cmpxchg(&work->flags, flags & ~IRQ_WORK_PENDING,
> + flags & ~IRQ_WORK_CLAIMED);
> }
> }
>
> --
> 2.23.0
>
The following commit has been merged into the irq/core branch of tip:
Commit-ID: 153bedbac2ebd475e1c7c2d2fa0c042f5525927d
Gitweb: https://git.kernel.org/tip/153bedbac2ebd475e1c7c2d2fa0c042f5525927d
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Fri, 08 Nov 2019 17:08:55 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 11 Nov 2019 09:02:56 +01:00
irq_work: Convert flags to atomic_t
We need to convert flags to atomic_t in order to later fix an ordering
issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or().
Also clarify the nature of those flags.
[ mingo: Converted two more usage sites the original patch missed. ]
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/irq_work.h | 10 +++++++---
kernel/bpf/stackmap.c | 2 +-
kernel/irq_work.c | 18 +++++++++---------
kernel/printk/printk.c | 2 +-
kernel/trace/bpf_trace.c | 2 +-
5 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index b11fcdf..02da997 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -22,7 +22,7 @@
#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
struct irq_work {
- unsigned long flags;
+ atomic_t flags;
struct llist_node llnode;
void (*func)(struct irq_work *);
};
@@ -30,11 +30,15 @@ struct irq_work {
static inline
void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
{
- work->flags = 0;
+ atomic_set(&work->flags, 0);
work->func = func;
}
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
+#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \
+ .flags = ATOMIC_INIT(0), \
+ .func = (_f) \
+}
+
bool irq_work_queue(struct irq_work *work);
bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c..4d31284 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
if (in_nmi()) {
work = this_cpu_ptr(&up_read_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index d42acaf..df0dbf4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, oflags, nflags;
+ int flags, oflags, nflags;
/*
* Start with our best wish as a premise but only trust any
* flag value after cmpxchg() result.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
+ flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
for (;;) {
nflags = flags | IRQ_WORK_CLAIMED;
- oflags = cmpxchg(&work->flags, flags, nflags);
+ oflags = atomic_cmpxchg(&work->flags, flags, nflags);
if (oflags == flags)
break;
if (oflags & IRQ_WORK_PENDING)
@@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void)
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
+ if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
@@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
- unsigned long flags;
+ int flags;
BUG_ON(!irqs_disabled());
@@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = work->flags & ~IRQ_WORK_PENDING;
- xchg(&work->flags, flags);
+ flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
+ atomic_xchg(&work->flags, flags);
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
@@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
- while (work->flags & IRQ_WORK_BUSY)
+ while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ca65327..8657273 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
.func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
+ .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
};
void wake_up_klogd(void)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f..ff467a4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
return -EINVAL;
work = this_cpu_ptr(&send_signal_work);
- if (work->irq_work.flags & IRQ_WORK_BUSY)
+ if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
return -EBUSY;
/* Add the current task, which is the target of sending signal,
The following commit has been merged into the irq/core branch of tip:
Commit-ID: 25269871db1ad0cbbaafd5098cbdb40c8db4ccb9
Gitweb: https://git.kernel.org/tip/25269871db1ad0cbbaafd5098cbdb40c8db4ccb9
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Fri, 08 Nov 2019 17:08:56 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 11 Nov 2019 09:03:31 +01:00
irq_work: Fix irq_work_claim() memory ordering
When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just
return and don't raise a new IPI. We expect the destination to see
and handle our latest updates thanks to the pairing atomic_xchg()
in irq_work_run_list().
But cmpxchg() doesn't guarantee a full memory barrier upon failure. So
it's possible that the destination misses our latest updates.
So use atomic_fetch_or() instead: it is unconditionally fully ordered,
does exactly what we want here, and simplifies the code.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/irq_work.c | 22 +++++++---------------
1 file changed, 7 insertions(+), 15 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index df0dbf4..255454a 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list);
*/
static bool irq_work_claim(struct irq_work *work)
{
- int flags, oflags, nflags;
+ int oflags;
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
- * Start with our best wish as a premise but only trust any
- * flag value after cmpxchg() result.
+ * If the work is already pending, no need to raise the IPI.
+ * The pairing atomic_xchg() in irq_work_run() makes sure
+ * everything we did before is visible.
*/
- flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
- for (;;) {
- nflags = flags | IRQ_WORK_CLAIMED;
- oflags = atomic_cmpxchg(&work->flags, flags, nflags);
- if (oflags == flags)
- break;
- if (oflags & IRQ_WORK_PENDING)
- return false;
- flags = oflags;
- cpu_relax();
- }
-
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
return true;
}
The following commit has been merged into the irq/core branch of tip:
Commit-ID: feb4a51323babe13315c3b783ea7f1cf25368918
Gitweb: https://git.kernel.org/tip/feb4a51323babe13315c3b783ea7f1cf25368918
Author: Frederic Weisbecker <[email protected]>
AuthorDate: Fri, 08 Nov 2019 17:08:57 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Mon, 11 Nov 2019 09:03:31 +01:00
irq_work: Slightly simplify IRQ_WORK_PENDING clearing
Instead of fetching the value of flags and performing an xchg() to clear
a bit, just use atomic_fetch_andnot(), which is better suited to doing that
job in one operation while keeping full ordering.
Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/irq_work.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 255454a..49c53f8 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work)
oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
/*
* If the work is already pending, no need to raise the IPI.
- * The pairing atomic_xchg() in irq_work_run() makes sure
+ * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
@@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list)
{
struct irq_work *work, *tmp;
struct llist_node *llnode;
- int flags;
BUG_ON(!irqs_disabled());
@@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
llnode = llist_del_all(list);
llist_for_each_entry_safe(work, tmp, llnode, llnode) {
+ int flags;
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
@@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list)
* to claim that work don't rely on us to handle their data
* while we are in the middle of the func.
*/
- flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
- atomic_xchg(&work->flags, flags);
+ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
work->func(work);
/*
On Mon, Nov 11, 2019 at 09:43:13AM +0100, Peter Zijlstra wrote:
> On Fri, Nov 08, 2019 at 05:08:58PM +0100, Frederic Weisbecker wrote:
>
> > diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> > index 49c53f80a13a..b709ab05cbfd 100644
> > --- a/kernel/irq_work.c
> > +++ b/kernel/irq_work.c
> > @@ -34,8 +34,8 @@ static bool irq_work_claim(struct irq_work *work)
> > oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
> > /*
> > * If the work is already pending, no need to raise the IPI.
> > + * The pairing atomic_andnot() followed by a barrier in irq_work_run()
> > + * makes sure everything we did before is visible.
> > */
> > if (oflags & IRQ_WORK_PENDING)
> > return false;
>
> > @@ -151,14 +151,16 @@ static void irq_work_run_list(struct llist_head *list)
> > * to claim that work don't rely on us to handle their data
> > * while we are in the middle of the func.
> > */
> > - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
> > + atomic_andnot(IRQ_WORK_PENDING, &work->flags);
> > + smp_mb__after_atomic();
>
> I think I'd prefer you use:
>
> flags = atomic_fetch_andnot_acquire(IRQ_WORK_PENDING, &work->flags);
Ah good point. Preparing that.
>
> Also, I'm cursing at myself for the horrible comments here.
Hmm, I wrote many of those, which one? :o)
Thanks.
>
> > work->func(work);
> > /*
> > * Clear the BUSY bit and return to the free state if
> > * no-one else claimed it meanwhile.
> > */
> > - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
> > + (void)atomic_cmpxchg(&work->flags, flags & ~IRQ_WORK_PENDING,
> > + flags & ~IRQ_WORK_CLAIMED);
> > }
> > }
> >
> > --
> > 2.23.0
> >
On Mon, Nov 11, 2019 at 09:00:58AM +0100, Ingo Molnar wrote:
>
> * Frederic Weisbecker <[email protected]> wrote:
>
> > We need to convert flags to atomic_t in order to later fix an ordering
> > issue on cmpxchg() failure. This will allow us to use atomic_fetch_or().
> > Also, this clarifies the nature of those flags.
> >
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > Cc: Paul E. McKenney <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Ingo Molnar <[email protected]>
> > ---
> > include/linux/irq_work.h | 10 +++++++---
> > kernel/irq_work.c | 18 +++++++++---------
> > kernel/printk/printk.c | 2 +-
> > 3 files changed, 17 insertions(+), 13 deletions(-)
>
> You missed the irq_work use in kernel/bpf/stackmap.c - see the fix below.
>
> Thanks,
>
> Ingo
Oh thanks. Strange that I haven't seen a 0-day warning about those.
On Mon, Nov 11, 2019 at 08:20:05AM +0100, Ingo Molnar wrote:
>
> * Frederic Weisbecker <[email protected]> wrote:
>
> > When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just
> > return and don't raise a new IPI. We expect the destination to see
> > and handle our latest updates thanks to the pairing atomic_xchg()
> > in irq_work_run_list().
> >
> > But cmpxchg() doesn't guarantee a full memory barrier upon failure. So
> > it's possible that the destination misses our latest updates.
> >
> > So use atomic_fetch_or() instead: it is unconditionally fully ordered,
> > does exactly what we want here, and simplifies the code.
>
> Just curious, how was this bug found - in the wild, or via code review?
Well, I wanted to make sure the nohz kcpustat patches are safe and I had
a last-minute doubt about that irq work scenario. So I would say code
review :)
On 08.11.2019 18:09, Frederic Weisbecker wrote:
> Instead of fetching the value of flags and performing an xchg() to clear
> a bit, just use atomic_fetch_andnot(), which is better suited to doing that
> job in one operation while keeping full ordering.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> ---
> kernel/irq_work.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> index 255454a48346..49c53f80a13a 100644
> --- a/kernel/irq_work.c
> +++ b/kernel/irq_work.c
> @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work)
> oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
> /*
> * If the work is already pending, no need to raise the IPI.
> - * The pairing atomic_xchg() in irq_work_run() makes sure
> + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
> * everything we did before is visible.
> */
> if (oflags & IRQ_WORK_PENDING)
> @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list)
> {
> struct irq_work *work, *tmp;
> struct llist_node *llnode;
> - int flags;
>
> BUG_ON(!irqs_disabled());
>
> @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
>
> llnode = llist_del_all(list);
> llist_for_each_entry_safe(work, tmp, llnode, llnode) {
> + int flags;
> /*
> * Clear the PENDING bit, after this point the @work
> * can be re-used.
> @@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list)
> * to claim that work don't rely on us to handle their data
> * while we are in the middle of the func.
> */
> - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
> - atomic_xchg(&work->flags, flags);
> + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
>
> work->func(work);
> /*
This breaks switching between cpufreq governors in linux-next on arm64
and various other stuff. The fix in this email doesn't compile:
https://lkml.org/lkml/2019/11/12/622
I assume you meant "&= ~" instead of "~="; this seems to work:
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 49c53f80a13a..828cc30774bc 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -156,10 +156,11 @@ static void irq_work_run_list(struct llist_head *list)
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
+ flags &= ~IRQ_WORK_PENDING;
(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
On Tue, Nov 12, 2019 at 08:27:05PM +0000, Leonard Crestez wrote:
> On 08.11.2019 18:09, Frederic Weisbecker wrote:
> > Instead of fetching the value of flags and performing an xchg() to clear
> > a bit, just use atomic_fetch_andnot(), which is better suited to doing that
> > job in one operation while keeping full ordering.
> >
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > Cc: Paul E. McKenney <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Ingo Molnar <[email protected]>
> > ---
> > kernel/irq_work.c | 7 +++----
> > 1 file changed, 3 insertions(+), 4 deletions(-)
> >
> > diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> > index 255454a48346..49c53f80a13a 100644
> > --- a/kernel/irq_work.c
> > +++ b/kernel/irq_work.c
> > @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work)
> > oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags);
> > /*
> > * If the work is already pending, no need to raise the IPI.
> > - * The pairing atomic_xchg() in irq_work_run() makes sure
> > + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
> > * everything we did before is visible.
> > */
> > if (oflags & IRQ_WORK_PENDING)
> > @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list)
> > {
> > struct irq_work *work, *tmp;
> > struct llist_node *llnode;
> > - int flags;
> >
> > BUG_ON(!irqs_disabled());
> >
> > @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list)
> >
> > llnode = llist_del_all(list);
> > llist_for_each_entry_safe(work, tmp, llnode, llnode) {
> > + int flags;
> > /*
> > * Clear the PENDING bit, after this point the @work
> > * can be re-used.
> > @@ -151,8 +151,7 @@ static void irq_work_run_list(struct llist_head *list)
> > * to claim that work don't rely on us to handle their data
> > * while we are in the middle of the func.
> > */
> > - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING;
> > - atomic_xchg(&work->flags, flags);
> > + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
> >
> > work->func(work);
> > /*
>
> This breaks switching between cpufreq governors in linux-next on arm64
> and various other stuff. The fix in this email doesn't compile:
>
> https://lkml.org/lkml/2019/11/12/622
>
> I assume you meant "&= ~" instead of "~="; this seems to work:
Indeed, duh again!
I still think that ~= would be nice to have though...
Thanks!
>
> diff --git a/kernel/irq_work.c b/kernel/irq_work.c
> index 49c53f80a13a..828cc30774bc 100644
> --- a/kernel/irq_work.c
> +++ b/kernel/irq_work.c
> @@ -156,10 +156,11 @@ static void irq_work_run_list(struct llist_head *list)
> work->func(work);
> /*
> * Clear the BUSY bit and return to the free state if
> * no-one else claimed it meanwhile.
> */
> + flags &= ~IRQ_WORK_PENDING;
> (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
> }
> }