Subject: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

Hey

I use a KVM VM for my regular development, and while running the v5.13-rt1
kernel (the latest -rt release) on the host and a regular kernel in the guest,
after a while this happens:

[ 1723.404979] ------------[ cut here ]------------
[ 1723.404981] WARNING: CPU: 12 PID: 2554 at fs/eventfd.c:74 eventfd_signal+0x7e/0x90
[ 1723.404989] Modules linked in: vhost_net vhost vhost_iotlb tap tun rfcomm snd_seq_dummy snd_hrtimer xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_nat_tftp nf_conntrack_tftp bridge stp llc ccm nft_objref nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw iptable_security ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter cmac bnep intel_rapl_msr sunrpc intel_rapl_common kvm_amd kvm ath10k_pci snd_hda_codec_realtek ath10k_core snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg snd_hda_codec ath btusb mac80211 snd_hwdep btrtl snd_hda_core btbcm snd_seq irqbypass rapl vfat snd_seq_device btintel dell_wmi_descriptor alienware_wmi wmi_bmof libarc4 fat pcspkr snd_pcm
[ 1723.405033] bluetooth joydev k10temp i2c_piix4 cfg80211 snd_timer snd soundcore ecdh_generic ecc rfkill gpio_amdpt gpio_generic acpi_cpufreq zram ip_tables nouveau hid_logitech_hidpp video drm_ttm_helper ttm i2c_algo_bit mxm_wmi drm_kms_helper crct10dif_pclmul crc32_pclmul crc32c_intel cec drm ghash_clmulni_intel r8169 nvme hid_logitech_dj ccp nvme_core sp5100_tco wmi fuse
[ 1723.405051] CPU: 12 PID: 2554 Comm: vhost-2529 Not tainted 5.13.0-rt-rt1+ #2
[ 1723.405054] Hardware name: Alienware Alienware Aurora Ryzen Edition/0TYR0X, BIOS 2.1.2 02/25/2021
[ 1723.405055] RIP: 0010:eventfd_signal+0x7e/0x90
[ 1723.405059] Code: 01 00 00 00 be 03 00 00 00 4c 89 ef e8 5b ec d9 ff 65 ff 0d e4 34 c9 5a 4c 89 ef e8 ec a8 86 00 4c 89 e0 5b 5d 41 5c 41 5d c3 <0f> 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 0f 1f 00 0f 1f 44 00
[ 1723.405060] RSP: 0018:ffffb719c2f67d70 EFLAGS: 00010202
[ 1723.405062] RAX: 0000000000000001 RBX: ffff9f4897364ae0 RCX: 0000000000000000
[ 1723.405063] RDX: 0000000000000791 RSI: 0000000000000001 RDI: ffff9f489ae647e0
[ 1723.405064] RBP: 0000000000000100 R08: 0000000000000000 R09: 0000000000000001
[ 1723.405065] R10: 000000000004715e R11: 00000000000036e0 R12: 0000000000000001
[ 1723.405066] R13: ffff9f489b7643c0 R14: ffffb719c2f67e20 R15: ffff9f4897364ae0
[ 1723.405067] FS: 0000000000000000(0000) GS:ffff9f4f9ed00000(0000) knlGS:0000000000000000
[ 1723.405068] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1723.405069] CR2: 00007ffa78355000 CR3: 0000000114b7c000 CR4: 0000000000750ee0
[ 1723.405071] PKRU: 55555554
[ 1723.405071] Call Trace:
[ 1723.405078] vhost_tx_batch.constprop.0+0x7d/0xc0 [vhost_net]
[ 1723.405083] handle_tx_copy+0x15b/0x5c0 [vhost_net]
[ 1723.405088] ? __vhost_add_used_n+0x200/0x200 [vhost]
[ 1723.405092] handle_tx+0xa5/0xe0 [vhost_net]
[ 1723.405095] vhost_worker+0x93/0xd0 [vhost]
[ 1723.405099] kthread+0x186/0x1a0
[ 1723.405103] ? __kthread_parkme+0xa0/0xa0
[ 1723.405105] ret_from_fork+0x22/0x30
[ 1723.405110] ---[ end trace 0000000000000002 ]---

and my communication with the VM dies. Rebooting the VM makes it work again.

-- Daniel


2021-07-14 08:11:52

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 14/07/21 10:01, Daniel Bristot de Oliveira wrote:
> Hey
>
> I use kvm-vm for regular development, and while using the kernel-rt v5.13-rt1
> (the latest) on the host, and a regular kernel on the guest, after a while,
> this happens:
>
> [ 1723.404979] ------------[ cut here ]------------
> [ 1723.404981] WARNING: CPU: 12 PID: 2554 at fs/eventfd.c:74 eventfd_signal+0x7e/0x90

> [ 1723.405055] RIP: 0010:eventfd_signal+0x7e/0x90
> [ 1723.405059] Code: 01 00 00 00 be 03 00 00 00 4c 89 ef e8 5b ec d9 ff 65 ff 0d e4 34 c9 5a 4c 89 ef e8 ec a8 86 00 4c 89 e0 5b 5d 41 5c 41 5d c3 <0f> 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 0f 1f 00 0f 1f 44 00
> [ 1723.405078] vhost_tx_batch.constprop.0+0x7d/0xc0 [vhost_net]
> [ 1723.405083] handle_tx_copy+0x15b/0x5c0 [vhost_net]
> [ 1723.405088] ? __vhost_add_used_n+0x200/0x200 [vhost]
> [ 1723.405092] handle_tx+0xa5/0xe0 [vhost_net]
> [ 1723.405095] vhost_worker+0x93/0xd0 [vhost]
> [ 1723.405099] kthread+0x186/0x1a0
> [ 1723.405103] ? __kthread_parkme+0xa0/0xa0
> [ 1723.405105] ret_from_fork+0x22/0x30
> [ 1723.405110] ---[ end trace 0000000000000002 ]---

The WARN has this comment above:

/*
* Deadlock or stack overflow issues can happen if we recurse here
* through waitqueue wakeup handlers. If the caller users potentially
* nested waitqueues with custom wakeup handlers, then it should
* check eventfd_signal_count() before calling this function. If
* it returns true, the eventfd_signal() call should be deferred to a
* safe context.
*/

This was added in 2020, so it's unlikely to be the direct cause of the
change. What is a known-good version for the host?

Since it is not KVM stuff, I'm CCing Michael and Jason.

Paolo

2021-07-14 09:24:59

by Jason Wang

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()


On 2021/7/14 4:10 PM, Paolo Bonzini wrote:
> On 14/07/21 10:01, Daniel Bristot de Oliveira wrote:
>> Hey
>>
>> I use kvm-vm for regular development, and while using the kernel-rt
>> v5.13-rt1
>> (the latest) on the host, and a regular kernel on the guest, after a
>> while,
>> this happens:
>>
>> [ 1723.404979] ------------[ cut here ]------------
>> [ 1723.404981] WARNING: CPU: 12 PID: 2554 at fs/eventfd.c:74
>> eventfd_signal+0x7e/0x90
>
>> [ 1723.405055] RIP: 0010:eventfd_signal+0x7e/0x90
>> [ 1723.405059] Code: 01 00 00 00 be 03 00 00 00 4c 89 ef e8 5b ec d9
>> ff 65 ff 0d e4 34 c9 5a 4c 89 ef e8 ec a8 86 00 4c 89 e0 5b 5d 41 5c
>> 41 5d c3 <0f> 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 0f 1f 00 0f
>> 1f 44 00
>> [ 1723.405078]  vhost_tx_batch.constprop.0+0x7d/0xc0 [vhost_net]
>> [ 1723.405083]  handle_tx_copy+0x15b/0x5c0 [vhost_net]
>> [ 1723.405088]  ? __vhost_add_used_n+0x200/0x200 [vhost]
>> [ 1723.405092]  handle_tx+0xa5/0xe0 [vhost_net]
>> [ 1723.405095]  vhost_worker+0x93/0xd0 [vhost]
>> [ 1723.405099]  kthread+0x186/0x1a0
>> [ 1723.405103]  ? __kthread_parkme+0xa0/0xa0
>> [ 1723.405105]  ret_from_fork+0x22/0x30
>> [ 1723.405110] ---[ end trace 0000000000000002 ]---
>
> The WARN has this comment above:
>
>         /*
>          * Deadlock or stack overflow issues can happen if we recurse
> here
>          * through waitqueue wakeup handlers. If the caller users
> potentially
>          * nested waitqueues with custom wakeup handlers, then it should
>          * check eventfd_signal_count() before calling this function. If
>          * it returns true, the eventfd_signal() call should be
> deferred to a
>          * safe context.
>          */
>
> This was added in 2020, so it's unlikely to be the direct cause of the
> change.  What is a known-good version for the host?
>
> Since it is not KVM stuff, I'm CCing Michael and Jason.
>
> Paolo
>

I think this can probably be fixed here:

https://lore.kernel.org/lkml/[email protected]/

Thanks


2021-07-14 10:39:23

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 14/07/21 11:23, Jason Wang wrote:
>> This was added in 2020, so it's unlikely to be the direct cause of the
>> change.  What is a known-good version for the host?
>>
>> Since it is not KVM stuff, I'm CCing Michael and Jason.
>
> I think this can be probably fixed here:
>
> https://lore.kernel.org/lkml/[email protected]/

That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
In fact, the bug is with the locking; the code assumes that
spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
increments and decrements the percpu variable inside the critical section.

This obviously does not fly with PREEMPT_RT; the right fix should be
using a local_lock. Something like this (untested!!):

--------------- 8< ---------------
From: Paolo Bonzini <[email protected]>
Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock

eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
non-preemptable and therefore increments and decrements the percpu
variable inside the critical section.

This obviously does not fly with PREEMPT_RT. If eventfd_signal is
preempted and an unrelated thread calls eventfd_signal, the result is
a spurious WARN. To avoid this, protect the percpu variable with a
local_lock.

Reported-by: Daniel Bristot de Oliveira <[email protected]>
Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
Cc: [email protected]
Cc: He Zhe <[email protected]>
Cc: Jens Axboe <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>

diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..7d27b6e080ea 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/sched/signal.h>
 #include <linux/kernel.h>
+#include <linux/local_lock.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
@@ -25,6 +26,7 @@
 #include <linux/idr.h>
 #include <linux/uio.h>
 
+static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
 DEFINE_PER_CPU(int, eventfd_wake_count);
 
 static DEFINE_IDA(eventfd_ida);
@@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * it returns true, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	local_lock(&eventfd_wake_lock);
+	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
+		local_unlock(&eventfd_wake_lock);
 		return 0;
+	}
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	this_cpu_inc(eventfd_wake_count);
@@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 	this_cpu_dec(eventfd_wake_count);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	local_unlock(&eventfd_wake_lock);
 
 	return n;
 }

2021-07-14 10:44:46

by Michael S. Tsirkin

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On Wed, Jul 14, 2021 at 12:35:27PM +0200, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
> > > This was added in 2020, so it's unlikely to be the direct cause of the
> > > change.  What is a known-good version for the host?
> > >
> > > Since it is not KVM stuff, I'm CCing Michael and Jason.
> >
> > I think this can be probably fixed here:
> >
> > https://lore.kernel.org/lkml/[email protected]/
>
> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
> In fact, the bug is with the locking; the code assumes that
> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
> increments and decrements the percpu variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT; the right fix should be
> using a local_lock. Something like this (untested!!):
>
> --------------- 8< ---------------
> From: Paolo Bonzini <[email protected]>
> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>
> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
> non-preemptable and therefore increments and decrements the percpu
> variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT. If eventfd_signal is
> preempted and an unrelated thread calls eventfd_signal, the result is
> a spurious WARN. To avoid this, protect the percpu variable with a
> local_lock.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> Cc: [email protected]
> Cc: He Zhe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>


Makes sense ...

Acked-by: Michael S. Tsirkin <[email protected]>

Want to send this to the Wind River guys so they can test?
Here's the list from that thread:

To: [email protected], [email protected], [email protected],
[email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], [email protected],
[email protected], [email protected], [email protected],
[email protected], [email protected],
[email protected], [email protected],
[email protected],
[email protected], [email protected],
[email protected], [email protected],
[email protected], [email protected],
[email protected]


>
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..7d27b6e080ea 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -12,6 +12,7 @@
> #include <linux/fs.h>
> #include <linux/sched/signal.h>
> #include <linux/kernel.h>
> +#include <linux/local_lock.h>
> #include <linux/slab.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> @@ -25,6 +26,7 @@
> #include <linux/idr.h>
> #include <linux/uio.h>
> +static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
> DEFINE_PER_CPU(int, eventfd_wake_count);
> static DEFINE_IDA(eventfd_ida);
> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> * it returns true, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + local_lock(&eventfd_wake_lock);
> + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
> + local_unlock(&eventfd_wake_lock);
> return 0;
> + }
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> this_cpu_inc(eventfd_wake_count);
> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> this_cpu_dec(eventfd_wake_count);
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> + local_unlock(&eventfd_wake_lock);
> return n;
> }

2021-07-14 10:48:33

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 14/07/21 12:41, Michael S. Tsirkin wrote:
>> --------------- 8< ---------------
>> From: Paolo Bonzini <[email protected]>
>> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>>
>> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
>> non-preemptable and therefore increments and decrements the percpu
>> variable inside the critical section.
>>
>> This obviously does not fly with PREEMPT_RT. If eventfd_signal is
>> preempted and an unrelated thread calls eventfd_signal, the result is
>> a spurious WARN. To avoid this, protect the percpu variable with a
>> local_lock.
>>
>> Reported-by: Daniel Bristot de Oliveira <[email protected]>
>> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>> Cc: [email protected]
>> Cc: He Zhe <[email protected]>
>> Cc: Jens Axboe <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>
>
> Makes sense ...
>
> Acked-by: Michael S. Tsirkin <[email protected]>
>
> want to send this to the windriver guys so they can test?
> Here's the list from that thread:

I included He Zhe, but it should be enough for Daniel to test it. The
bug is quite obvious once you wear your PREEMPT_RT goggles.

Paolo

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 7/14/21 12:35 PM, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
>>> This was added in 2020, so it's unlikely to be the direct cause of the
>>> change.  What is a known-good version for the host?
>>>
>>> Since it is not KVM stuff, I'm CCing Michael and Jason.
>>
>> I think this can be probably fixed here:
>>
>> https://lore.kernel.org/lkml/[email protected]/
>
> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
> In fact, the bug is with the locking; the code assumes that
> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
> increments and decrements the percpu variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT; the right fix should be
> using a local_lock.  Something like this (untested!!):

Makes sense, testing the patch.

-- Daniel

> --------------- 8< ---------------
> From: Paolo Bonzini <[email protected]>
> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>
> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
> non-preemptable and therefore increments and decrements the percpu
> variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT.  If eventfd_signal is
> preempted and an unrelated thread calls eventfd_signal, the result is
> a spurious WARN.  To avoid this, protect the percpu variable with a
> local_lock.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> Cc: [email protected]
> Cc: He Zhe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
>
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..7d27b6e080ea 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -12,6 +12,7 @@
>  #include <linux/fs.h>
>  #include <linux/sched/signal.h>
>  #include <linux/kernel.h>
> +#include <linux/local_lock.h>
>  #include <linux/slab.h>
>  #include <linux/list.h>
>  #include <linux/spinlock.h>
> @@ -25,6 +26,7 @@
>  #include <linux/idr.h>
>  #include <linux/uio.h>
>  
> +static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
>  DEFINE_PER_CPU(int, eventfd_wake_count);
>  
>  static DEFINE_IDA(eventfd_ida);
> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>       * it returns true, the eventfd_signal() call should be deferred to a
>       * safe context.
>       */
> -    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> +    local_lock(&eventfd_wake_lock);
> +    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
> +        local_unlock(&eventfd_wake_lock);
>          return 0;
> +    }
>  
>      spin_lock_irqsave(&ctx->wqh.lock, flags);
>      this_cpu_inc(eventfd_wake_count);
> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>          wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>      this_cpu_dec(eventfd_wake_count);
>      spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> +    local_unlock(&eventfd_wake_lock);
>  
>      return n;
>  }
>

2021-07-15 04:56:15

by Jason Wang

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()


On 2021/7/14 6:35 PM, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
>>> This was added in 2020, so it's unlikely to be the direct cause of the
>>> change.  What is a known-good version for the host?
>>>
>>> Since it is not KVM stuff, I'm CCing Michael and Jason.
>>
>> I think this can be probably fixed here:
>>
>> https://lore.kernel.org/lkml/[email protected]/
>>
>
> That seems wrong; in particular it wouldn't protect against AB/BA
> deadlocks.
> In fact, the bug is with the locking; the code assumes that
> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
> increments and decrements the percpu variable inside the critical
> section.
>
> This obviously does not fly with PREEMPT_RT; the right fix should be
> using a local_lock.  Something like this (untested!!):
>
> --------------- 8< ---------------
> From: Paolo Bonzini <[email protected]>
> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>
> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
> non-preemptable and therefore increments and decrements the percpu
> variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT.  If eventfd_signal is
> preempted and an unrelated thread calls eventfd_signal, the result is
> a spurious WARN.  To avoid this, protect the percpu variable with a
> local_lock.


But local_lock only disables migration, not preemption.

Or anything I missed here?

Thanks


>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> Cc: [email protected]
> Cc: He Zhe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
>
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..7d27b6e080ea 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -12,6 +12,7 @@
>  #include <linux/fs.h>
>  #include <linux/sched/signal.h>
>  #include <linux/kernel.h>
> +#include <linux/local_lock.h>
>  #include <linux/slab.h>
>  #include <linux/list.h>
>  #include <linux/spinlock.h>
> @@ -25,6 +26,7 @@
>  #include <linux/idr.h>
>  #include <linux/uio.h>
>
> +static local_lock_t eventfd_wake_lock =
> INIT_LOCAL_LOCK(eventfd_wake_lock);
>  DEFINE_PER_CPU(int, eventfd_wake_count);
>
>  static DEFINE_IDA(eventfd_ida);
> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>       * it returns true, the eventfd_signal() call should be deferred
> to a
>       * safe context.
>       */
> -    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> +    local_lock(&eventfd_wake_lock);
> +    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
> +        local_unlock(&eventfd_wake_lock);
>          return 0;
> +    }
>
>      spin_lock_irqsave(&ctx->wqh.lock, flags);
>      this_cpu_inc(eventfd_wake_count);
> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>          wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>      this_cpu_dec(eventfd_wake_count);
>      spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> +    local_unlock(&eventfd_wake_lock);
>
>      return n;
>  }
>

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 7/14/21 12:35 PM, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
>>> This was added in 2020, so it's unlikely to be the direct cause of the
>>> change.  What is a known-good version for the host?
>>>
>>> Since it is not KVM stuff, I'm CCing Michael and Jason.
>>
>> I think this can be probably fixed here:
>>
>> https://lore.kernel.org/lkml/[email protected]/
>
> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
> In fact, the bug is with the locking; the code assumes that
> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
> increments and decrements the percpu variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT; the right fix should be
> using a local_lock.  Something like this (untested!!):

the lock needs to be per-CPU... but so far, so good. I will continue using the
system in the next days to see if it blows up in another way.

The patch looks like this now:

------------------------- 8< ---------------------
Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock

eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
non-preemptable and therefore increments and decrements the percpu
variable inside the critical section.

This obviously does not fly with PREEMPT_RT. If eventfd_signal is
preempted and an unrelated thread calls eventfd_signal, the result is
a spurious WARN. To avoid this, protect the percpu variable with a
local_lock.

Reported-by: Daniel Bristot de Oliveira <[email protected]>
Suggested-by: Paolo Bonzini <[email protected]>
Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
Cc: [email protected]
Cc: He Zhe <[email protected]>
Cc: Jens Axboe <[email protected]>
Co-developed-by: Paolo Bonzini <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
Co-developed-by: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
---
fs/eventfd.c | 27 ++++++++++++++++++++++-----
include/linux/eventfd.h | 7 +------
2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..9754fcd38690 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/sched/signal.h>
 #include <linux/kernel.h>
+#include <linux/local_lock.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
@@ -25,8 +26,6 @@
 #include <linux/idr.h>
 #include <linux/uio.h>
 
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
 static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
@@ -45,6 +44,20 @@ struct eventfd_ctx {
 	int id;
 };
 
+struct event_fd_recursion {
+	local_lock_t lock;
+	int count;
+};
+
+static DEFINE_PER_CPU(struct event_fd_recursion, event_fd_recursion) = {
+	.lock = INIT_LOCAL_LOCK(lock),
+};
+
+bool eventfd_signal_count(void)
+{
+	return this_cpu_read(event_fd_recursion.count);
+}
+
 /**
  * eventfd_signal - Adds @n to the eventfd counter.
  * @ctx: [in] Pointer to the eventfd context.
@@ -71,18 +84,22 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 	 * it returns true, the eventfd_signal() call should be deferred to a
 	 * safe context.
 	 */
-	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+	local_lock(&event_fd_recursion.lock);
+	if (WARN_ON_ONCE(this_cpu_read(event_fd_recursion.count))) {
+		local_unlock(&event_fd_recursion.lock);
 		return 0;
+	}
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	this_cpu_inc(eventfd_wake_count);
+	this_cpu_inc(event_fd_recursion.count);
 	if (ULLONG_MAX - ctx->count < n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
-	this_cpu_dec(eventfd_wake_count);
+	this_cpu_dec(event_fd_recursion.count);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	local_unlock(&event_fd_recursion.lock);
 
 	return n;
 }
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524baed0..ca89d6c409c1 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -43,12 +43,7 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
 				  __u64 *cnt);
 void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
 
-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
-{
-	return this_cpu_read(eventfd_wake_count);
-}
+bool eventfd_signal_count(void);
 
 #else /* CONFIG_EVENTFD */

--

2021-07-15 09:26:19

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 15/07/21 06:14, Jason Wang wrote:
>> This obviously does not fly with PREEMPT_RT.  If eventfd_signal is
>> preempted and an unrelated thread calls eventfd_signal, the result is
>> a spurious WARN.  To avoid this, protect the percpu variable with a
>> local_lock.
>
> But local_lock only disable migration not preemption.

On mainline PREEMPT_RT, a local_lock is an array of per-CPU spinlocks.
When two eventfd_signal calls run on the same CPU and one is preempted,
the spinlock prevents the second from seeing eventfd_wake_count > 0.
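
Roughly, the two flavours look like this (a simplified sketch of the
idea, not the exact code in include/linux/local_lock_internal.h):

#ifdef CONFIG_PREEMPT_RT
/* RT: local_lock_t is a per-CPU sleeping spinlock (also disables migration) */
#define local_lock(lock)	spin_lock(this_cpu_ptr(lock))
#else
/* !RT: disabling preemption already serializes eventfd_signal per CPU */
#define local_lock(lock)	preempt_disable()
#endif

So on PREEMPT_RT the second caller blocks on the per-CPU lock until the
preempted one finishes, instead of tripping over its counter.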

Thanks,

Paolo

> Or anything I missed here?
>
> Thanks
>
>
>>
>> Reported-by: Daniel Bristot de Oliveira <[email protected]>
>> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>> Cc: [email protected]
>> Cc: He Zhe <[email protected]>
>> Cc: Jens Axboe <[email protected]>
>> Signed-off-by: Paolo Bonzini <[email protected]>
>>
>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>> index e265b6dd4f34..7d27b6e080ea 100644
>> --- a/fs/eventfd.c
>> +++ b/fs/eventfd.c
>> @@ -12,6 +12,7 @@
>>  #include <linux/fs.h>
>>  #include <linux/sched/signal.h>
>>  #include <linux/kernel.h>
>> +#include <linux/local_lock.h>
>>  #include <linux/slab.h>
>>  #include <linux/list.h>
>>  #include <linux/spinlock.h>
>> @@ -25,6 +26,7 @@
>>  #include <linux/idr.h>
>>  #include <linux/uio.h>
>>
>> +static local_lock_t eventfd_wake_lock =
>> INIT_LOCAL_LOCK(eventfd_wake_lock);
>>  DEFINE_PER_CPU(int, eventfd_wake_count);
>>
>>  static DEFINE_IDA(eventfd_ida);
>> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>>       * it returns true, the eventfd_signal() call should be deferred
>> to a
>>       * safe context.
>>       */
>> -    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
>> +    local_lock(&eventfd_wake_lock);
>> +    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
>> +        local_unlock(&eventfd_wake_lock);
>>          return 0;
>> +    }
>>
>>      spin_lock_irqsave(&ctx->wqh.lock, flags);
>>      this_cpu_inc(eventfd_wake_count);
>> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>>          wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>>      this_cpu_dec(eventfd_wake_count);
>>      spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>> +    local_unlock(&eventfd_wake_lock);
>>
>>      return n;
>>  }
>>
>

2021-07-15 09:26:32

by Jason Wang

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()


On 2021/7/15 1:58 PM, Paolo Bonzini wrote:
> On 15/07/21 06:14, Jason Wang wrote:
>>> This obviously does not fly with PREEMPT_RT.  If eventfd_signal is
>>> preempted and an unrelated thread calls eventfd_signal, the result is
>>> a spurious WARN.  To avoid this, protect the percpu variable with a
>>> local_lock.
>>
>> But local_lock only disable migration not preemption.
>
> On mainline PREEMPT_RT, local_lock is an array of per-CPU spinlocks.
> When two eventfd_signals run on the same CPU and one is preempted, the
> spinlocks avoid that the second sees eventfd_wake_count > 0.
>
> Thanks,
>
> Paolo


Right, I see.

Thanks


>
>> Or anything I missed here?
>>
>> Thanks
>>
>>
>>>
>>> Reported-by: Daniel Bristot de Oliveira <[email protected]>
>>> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>>> Cc: [email protected]
>>> Cc: He Zhe <[email protected]>
>>> Cc: Jens Axboe <[email protected]>
>>> Signed-off-by: Paolo Bonzini <[email protected]>
>>>
>>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>>> index e265b6dd4f34..7d27b6e080ea 100644
>>> --- a/fs/eventfd.c
>>> +++ b/fs/eventfd.c
>>> @@ -12,6 +12,7 @@
>>>  #include <linux/fs.h>
>>>  #include <linux/sched/signal.h>
>>>  #include <linux/kernel.h>
>>> +#include <linux/local_lock.h>
>>>  #include <linux/slab.h>
>>>  #include <linux/list.h>
>>>  #include <linux/spinlock.h>
>>> @@ -25,6 +26,7 @@
>>>  #include <linux/idr.h>
>>>  #include <linux/uio.h>
>>>
>>> +static local_lock_t eventfd_wake_lock =
>>> INIT_LOCAL_LOCK(eventfd_wake_lock);
>>>  DEFINE_PER_CPU(int, eventfd_wake_count);
>>>
>>>  static DEFINE_IDA(eventfd_ida);
>>> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx,
>>> __u64 n)
>>>       * it returns true, the eventfd_signal() call should be
>>> deferred to a
>>>       * safe context.
>>>       */
>>> -    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
>>> +    local_lock(&eventfd_wake_lock);
>>> +    if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
>>> +        local_unlock(&eventfd_wake_lock);
>>>          return 0;
>>> +    }
>>>
>>>      spin_lock_irqsave(&ctx->wqh.lock, flags);
>>>      this_cpu_inc(eventfd_wake_count);
>>> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx,
>>> __u64 n)
>>>          wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>>>      this_cpu_dec(eventfd_wake_count);
>>>      spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>>> +    local_unlock(&eventfd_wake_lock);
>>>
>>>      return n;
>>>  }
>>>
>>
>

2021-07-15 10:48:27

by He Zhe

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()



On 7/15/21 4:22 PM, Daniel Bristot de Oliveira wrote:
> On 7/14/21 12:35 PM, Paolo Bonzini wrote:
>> On 14/07/21 11:23, Jason Wang wrote:
>>>> This was added in 2020, so it's unlikely to be the direct cause of the
>>>> change.  What is a known-good version for the host?
>>>>
>>>> Since it is not KVM stuff, I'm CCing Michael and Jason.
>>> I think this can be probably fixed here:
>>>
>>> https://lore.kernel.org/lkml/[email protected]/
>> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
>> In fact, the bug is with the locking; the code assumes that
>> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
>> increments and decrements the percpu variable inside the critical section.
>>
>> This obviously does not fly with PREEMPT_RT; the right fix should be
>> using a local_lock.  Something like this (untested!!):
> the lock needs to be per-pcu... but so far, so good. I will continue using the
> system in the next days to see if it blows on another way.

The original patch was created before PREEMPT_RT was fully introduced into
mainline. It was to increase the recursion depth to 2 so that vhost_worker and
kvm_vcpu_ioctl syscall could work in parallel, as shown in the original commit
log.

So the event_fd_recursion.count limit should still be 2 to fix the original
issue, no matter how the locks are tweaked.
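
In other words, on top of the patch quoted below, roughly (only to
illustrate the idea, not the exact patch from the old link):

-	if (WARN_ON_ONCE(this_cpu_read(event_fd_recursion.count))) {
+	/* allow one level of nesting, e.g. a relay signalling another eventfd */
+	if (WARN_ON_ONCE(this_cpu_read(event_fd_recursion.count) > 1)) {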

Zhe

>
> The patch looks like this now:
>
> ------------------------- 8< ---------------------
> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>
> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
> non-preemptable and therefore increments and decrements the percpu
> variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT. If eventfd_signal is
> preempted and an unrelated thread calls eventfd_signal, the result is
> a spurious WARN. To avoid this, protect the percpu variable with a
> local_lock.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Suggested-by: Paolo Bonzini <[email protected]>
> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> Cc: [email protected]
> Cc: He Zhe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Co-developed-by: Paolo Bonzini <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> Co-developed-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
> ---
> fs/eventfd.c | 27 ++++++++++++++++++++++-----
> include/linux/eventfd.h | 7 +------
> 2 files changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..9754fcd38690 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -12,6 +12,7 @@
> #include <linux/fs.h>
> #include <linux/sched/signal.h>
> #include <linux/kernel.h>
> +#include <linux/local_lock.h>
> #include <linux/slab.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> @@ -25,8 +26,6 @@
> #include <linux/idr.h>
> #include <linux/uio.h>
>
> -DEFINE_PER_CPU(int, eventfd_wake_count);
> -
> static DEFINE_IDA(eventfd_ida);
>
> struct eventfd_ctx {
> @@ -45,6 +44,20 @@ struct eventfd_ctx {
> int id;
> };
>
> +struct event_fd_recursion {
> + local_lock_t lock;
> + int count;
> +};
> +
> +static DEFINE_PER_CPU(struct event_fd_recursion, event_fd_recursion) = {
> + .lock = INIT_LOCAL_LOCK(lock),
> +};
> +
> +bool eventfd_signal_count(void)
> +{
> + return this_cpu_read(event_fd_recursion.count);
> +}
> +
> /**
> * eventfd_signal - Adds @n to the eventfd counter.
> * @ctx: [in] Pointer to the eventfd context.
> @@ -71,18 +84,22 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> * it returns true, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + local_lock(&event_fd_recursion.lock);
> + if (WARN_ON_ONCE(this_cpu_read(event_fd_recursion.count))) {
> + local_unlock(&event_fd_recursion.lock);
> return 0;
> + }
>
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> - this_cpu_inc(eventfd_wake_count);
> + this_cpu_inc(event_fd_recursion.count);
> if (ULLONG_MAX - ctx->count < n)
> n = ULLONG_MAX - ctx->count;
> ctx->count += n;
> if (waitqueue_active(&ctx->wqh))
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> - this_cpu_dec(eventfd_wake_count);
> + this_cpu_dec(event_fd_recursion.count);
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> + local_unlock(&event_fd_recursion.lock);
>
> return n;
> }
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index fa0a524baed0..ca89d6c409c1 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -43,12 +43,7 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
> wait_queue_entry_t *w
> __u64 *cnt);
> void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>
> -DECLARE_PER_CPU(int, eventfd_wake_count);
> -
> -static inline bool eventfd_signal_count(void)
> -{
> - return this_cpu_read(eventfd_wake_count);
> -}
> +bool eventfd_signal_count(void);
>
> #else /* CONFIG_EVENTFD */
>

2021-07-15 11:48:25

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 15/07/21 10:22, Daniel Bristot de Oliveira wrote:
> On 7/14/21 12:35 PM, Paolo Bonzini wrote:
>> On 14/07/21 11:23, Jason Wang wrote:
>>> I think this can be probably fixed here:
>>>
>>> https://lore.kernel.org/lkml/[email protected]/
>>
>> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
>> In fact, the bug is with the locking; the code assumes that
>> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
>> increments and decrements the percpu variable inside the critical section.
>>
>> This obviously does not fly with PREEMPT_RT; the right fix should be
>> using a local_lock.  Something like this (untested!!):
>
> the lock needs to be per-pcu...

Great, thanks for testing and fixing the patch! Are you going to post
it again once you've confirmed it works?

Paolo

> The patch looks like this now:
>
> ------------------------- 8< ---------------------
> Subject: [PATCH] eventfd: protect eventfd_wake_count with a local_lock
>
> eventfd_signal assumes that spin_lock_irqsave/spin_unlock_irqrestore is
> non-preemptable and therefore increments and decrements the percpu
> variable inside the critical section.
>
> This obviously does not fly with PREEMPT_RT. If eventfd_signal is
> preempted and an unrelated thread calls eventfd_signal, the result is
> a spurious WARN. To avoid this, protect the percpu variable with a
> local_lock.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Suggested-by: Paolo Bonzini <[email protected]>
> Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> Cc: [email protected]
> Cc: He Zhe <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Co-developed-by: Paolo Bonzini <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> Co-developed-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
> ---
> fs/eventfd.c | 27 ++++++++++++++++++++++-----
> include/linux/eventfd.h | 7 +------
> 2 files changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index e265b6dd4f34..9754fcd38690 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -12,6 +12,7 @@
> #include <linux/fs.h>
> #include <linux/sched/signal.h>
> #include <linux/kernel.h>
> +#include <linux/local_lock.h>
> #include <linux/slab.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> @@ -25,8 +26,6 @@
> #include <linux/idr.h>
> #include <linux/uio.h>
>
> -DEFINE_PER_CPU(int, eventfd_wake_count);
> -
> static DEFINE_IDA(eventfd_ida);
>
> struct eventfd_ctx {
> @@ -45,6 +44,20 @@ struct eventfd_ctx {
> int id;
> };
>
> +struct event_fd_recursion {
> + local_lock_t lock;
> + int count;
> +};
> +
> +static DEFINE_PER_CPU(struct event_fd_recursion, event_fd_recursion) = {
> + .lock = INIT_LOCAL_LOCK(lock),
> +};
> +
> +bool eventfd_signal_count(void)
> +{
> + return this_cpu_read(event_fd_recursion.count);
> +}
> +
> /**
> * eventfd_signal - Adds @n to the eventfd counter.
> * @ctx: [in] Pointer to the eventfd context.
> @@ -71,18 +84,22 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> * it returns true, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + local_lock(&event_fd_recursion.lock);
> + if (WARN_ON_ONCE(this_cpu_read(event_fd_recursion.count))) {
> + local_unlock(&event_fd_recursion.lock);
> return 0;
> + }
>
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> - this_cpu_inc(eventfd_wake_count);
> + this_cpu_inc(event_fd_recursion.count);
> if (ULLONG_MAX - ctx->count < n)
> n = ULLONG_MAX - ctx->count;
> ctx->count += n;
> if (waitqueue_active(&ctx->wqh))
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> - this_cpu_dec(eventfd_wake_count);
> + this_cpu_dec(event_fd_recursion.count);
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> + local_unlock(&event_fd_recursion.lock);
>
> return n;
> }
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index fa0a524baed0..ca89d6c409c1 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -43,12 +43,7 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
> wait_queue_entry_t *w
> __u64 *cnt);
> void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>
> -DECLARE_PER_CPU(int, eventfd_wake_count);
> -
> -static inline bool eventfd_signal_count(void)
> -{
> - return this_cpu_read(eventfd_wake_count);
> -}
> +bool eventfd_signal_count(void);
>
> #else /* CONFIG_EVENTFD */
>

2021-07-15 11:52:55

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 15/07/21 10:44, He Zhe wrote:
> It was to increase the recursion depth to 2 so that vhost_worker and
> kvm_vcpu_ioctl syscall could work in parallel

The count is per-CPU, so parallel operations cannot cause it to become
2. Your patch might fix calls from ioeventfd to vhost_worker to another
eventfd, but not *parallel* operation of KVM and vhost (except on
PREEMPT_RT).
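
A genuinely nested call looks roughly like this (a hypothetical chain,
just to illustrate what the per-CPU counter is meant to catch):

  eventfd_signal(A)                /* eventfd_wake_count: 0 -> 1 */
    wake_up_locked_poll(&A->wqh)
      custom wakeup handler        /* e.g. an ioeventfd/irqfd-style relay */
        eventfd_signal(B)          /* sees count == 1 -> WARN, returns 0 */

Parallel callers, by contrast, each increment their own CPU's counter,
and on !PREEMPT_RT they cannot be preempted inside the critical section,
so one caller never observes the other's increment.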

You should identify the exact callstack that caused the warning for
vDUSE, and document that one in the commit message, so that reviewers
can understand the issue.

Paolo

2021-07-15 13:43:57

by He Zhe

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()



On 7/15/21 5:51 PM, Paolo Bonzini wrote:
> On 15/07/21 10:44, He Zhe wrote:
>> It was to increase the recursion depth to 2 so that vhost_worker and
>> kvm_vcpu_ioctl syscall could work in parallel
>
> The count is per-CPU, so parallel operations cannot cause it to become 2.  Your patch might fix calls from ioeventfd to vhost_worker to another eventfd, but not *parallel* operation of KVM and vhost (except on PREEMPT_RT).
>
> You should identify the exact callstack that caused the warning for vDUSE, and document that one in the commit message, so that reviewers can understand the issue.

The following was provided in this thread. The commit log contains the call traces that I hit and fixed back in Apr. 2020.

https://lore.kernel.org/lkml/[email protected]/

And the problem has kept being reported, as recently as last month. So when this patch was pointed at in this thread, I thought it was still the same case.
https://lore.kernel.org/lkml/[email protected]/
https://lore.kernel.org/lkml/[email protected]/

Zhe


>
> Paolo
>

2021-07-15 13:46:52

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 15/07/21 12:10, He Zhe wrote:
> The following was provided in this thread. The commit log contains the call traces that I met and fixed back to Apr. 2020.
>
> https://lore.kernel.org/lkml/[email protected]/

> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
> ---- snip ----
> 001: Call Trace:
> 001: vhost_signal+0x15e/0x1b0 [vhost]
> 001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
> 001: handle_rx+0xb9/0x900 [vhost_net]
> 001: handle_rx_net+0x15/0x20 [vhost_net]
> 001: vhost_worker+0xbe/0x120 [vhost]
> 001: kthread+0x106/0x140
> 001: ? log_used.part.0+0x20/0x20 [vhost]
> 001: ? kthread_park+0x90/0x90
> 001: ret_from_fork+0x35/0x40

This call trace is not of a reentrant call; there is only one call to
eventfd_signal. It does fit the symptoms that Daniel reported for
PREEMPT_RT though.

> https://lore.kernel.org/lkml/[email protected]/

This one is about PREEMPT_RT, so it would be fixed by local_lock.

There _may_ be two bugs, so let's start by fixing this one. Once this
one is fixed, we will examine the call stacks of any further reports,
and diagnose whether the second bug (if it exists) is related to vDUSE,
PREEMPT_RT or neither.

Paolo

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 7/15/21 12:22 PM, Hillf Danton wrote:
>> the lock needs to be per-pcu... but so far, so good. I will continue using the
>> system in the next days to see if it blows on another way.
> Curious, is it so good as to see the warning no longer printed? Or does your
> box still work even with the warning in the log?
>

Without the patch:

- warn on dmesg
- I lose my connection with VM (as a side effect)

With the patch:
- no warn
- continue using the VM normally...

-- Daniel

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 7/15/21 11:46 AM, Paolo Bonzini wrote:
> On 15/07/21 10:22, Daniel Bristot de Oliveira wrote:
>> On 7/14/21 12:35 PM, Paolo Bonzini wrote:
>>> On 14/07/21 11:23, Jason Wang wrote:
>>>> I think this can be probably fixed here:
>>>>
>>>> https://lore.kernel.org/lkml/[email protected]/
>>>
>>> That seems wrong; in particular it wouldn't protect against AB/BA deadlocks.
>>> In fact, the bug is with the locking; the code assumes that
>>> spin_lock_irqsave/spin_unlock_irqrestore is non-preemptable and therefore
>>> increments and decrements the percpu variable inside the critical section.
>>>
>>> This obviously does not fly with PREEMPT_RT; the right fix should be
>>> using a local_lock.  Something like this (untested!!):
>>
>> the lock needs to be per-pcu...
>
> Great, thanks for testing and fixing the patch!  Are you going to post
> it again once you've confirmed it works?

I can do that... unless you want to send it yourself...

-- Daniel

> Paolo

2021-07-16 02:27:34

by Jason Wang

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()


On 2021/7/15 7:05 PM, Paolo Bonzini wrote:
> On 15/07/21 12:10, He Zhe wrote:
>> The following was provided in this thread. The commit log contains
>> the call traces that I met and fixed back to Apr. 2020.
>>
>> https://lore.kernel.org/lkml/[email protected]/
>>
>
>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73
>> eventfd_signal+0x85/0xa0
>> ---- snip ----
>> 001: Call Trace:
>> 001:  vhost_signal+0x15e/0x1b0 [vhost]
>> 001:  vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>> 001:  handle_rx+0xb9/0x900 [vhost_net]
>> 001:  handle_rx_net+0x15/0x20 [vhost_net]
>> 001:  vhost_worker+0xbe/0x120 [vhost]
>> 001:  kthread+0x106/0x140
>> 001:  ? log_used.part.0+0x20/0x20 [vhost]
>> 001:  ? kthread_park+0x90/0x90
>> 001:  ret_from_fork+0x35/0x40
>
> This call trace is not of a reentrant call; there is only one call to
> eventfd_signal.  It does fit the symptoms that Daniel reported for
> PREEMPT_RT though.
>
>> https://lore.kernel.org/lkml/[email protected]/
>>
>
> This one is about PREEMPT_RT, so it would be fixed by local_lock.
>
> There _may_ be two bugs, so let's start by fixing this one.  Once this
> one is fixed, we will examine the call stacks of any further reports,
> and diagnose whether the second bug (if it exists) is related to
> vDUSE, PREEMPT_RT or neeither.


For VDUSE we may still need the patch, since it relays notifications
(eventfds), which means the eventfd signal can genuinely recurse.

But the comment in eventfd_signal() says we should check with
eventfd_signal_count() and defer the signal to a safe context (e.g. a
workqueue).
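
Something like the following, roughly (a hypothetical relay, just to
sketch the deferral that comment asks for; the names are made up):

#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_relay {
	struct eventfd_ctx *target;
	struct work_struct work;	/* INIT_WORK(&work, my_relay_work) at setup */
};

static void my_relay_work(struct work_struct *work)
{
	struct my_relay *r = container_of(work, struct my_relay, work);

	/* process context, outside any eventfd wakeup: safe to signal */
	eventfd_signal(r->target, 1);
}

static void my_relay_notify(struct my_relay *r)
{
	if (eventfd_signal_count())
		/* called from inside an eventfd wakeup handler: defer */
		schedule_work(&r->work);
	else
		eventfd_signal(r->target, 1);
}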

Thanks


>
> Paolo
>

2021-07-16 02:45:39

by He Zhe

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()



On 7/16/21 10:26 AM, Jason Wang wrote:
>
> On 2021/7/15 7:05 PM, Paolo Bonzini wrote:
>> On 15/07/21 12:10, He Zhe wrote:
>>> The following was provided in this thread. The commit log contains the call traces that I met and fixed back to Apr. 2020.
>>>
>>> https://lore.kernel.org/lkml/[email protected]/
>>
>>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
>>> ---- snip ----
>>> 001: Call Trace:
>>> 001:  vhost_signal+0x15e/0x1b0 [vhost]
>>> 001:  vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>>> 001:  handle_rx+0xb9/0x900 [vhost_net]
>>> 001:  handle_rx_net+0x15/0x20 [vhost_net]
>>> 001:  vhost_worker+0xbe/0x120 [vhost]
>>> 001:  kthread+0x106/0x140
>>> 001:  ? log_used.part.0+0x20/0x20 [vhost]
>>> 001:  ? kthread_park+0x90/0x90
>>> 001:  ret_from_fork+0x35/0x40
>>
>> This call trace is not of a reentrant call; there is only one call to eventfd_signal.  It does fit the symptoms that Daniel reported for PREEMPT_RT though.
>>
>>> https://lore.kernel.org/lkml/[email protected]/
>>
>> This one is about PREEMPT_RT, so it would be fixed by local_lock.
>>
>> There _may_ be two bugs, so let's start by fixing this one.  Once this one is fixed, we will examine the call stacks of any further reports, and diagnose whether the second bug (if it exists) is related to vDUSE, PREEMPT_RT or neeither.
>
>
> For VDUSE we may still need the patch since it tries to relay notifications (eventfds) which means the recursion of the eventfd signal.
>
> But looking at the comment in the eventfd_signal() which say we should check with eventfd_signal_count() and delay the signal into a safe context (e.g workqueue etc).

The main concern when the eventfd count was added in the first place was "Deadlock or stack overflow", as the inline comment says. If we can avoid the deadlock and one level of nesting is acceptable, I think it's safe to set the max count to 2.

The author of the eventfd count kind of also agrees with this.
https://lore.kernel.org/lkml/[email protected]/

Thanks,
Zhe

>
> Thanks
>
>
>>
>> Paolo
>>
>

2021-07-16 02:48:02

by Jason Wang

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()


On 2021/7/16 10:43 AM, He Zhe wrote:
>
> On 7/16/21 10:26 AM, Jason Wang wrote:
>> On 2021/7/15 7:05 PM, Paolo Bonzini wrote:
>>> On 15/07/21 12:10, He Zhe wrote:
>>>> The following was provided in this thread. The commit log contains the call traces that I met and fixed back to Apr. 2020.
>>>>
>>>> https://lore.kernel.org/lkml/[email protected]/
>>>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
>>>> ---- snip ----
>>>> 001: Call Trace:
>>>> 001:  vhost_signal+0x15e/0x1b0 [vhost]
>>>> 001:  vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>>>> 001:  handle_rx+0xb9/0x900 [vhost_net]
>>>> 001:  handle_rx_net+0x15/0x20 [vhost_net]
>>>> 001:  vhost_worker+0xbe/0x120 [vhost]
>>>> 001:  kthread+0x106/0x140
>>>> 001:  ? log_used.part.0+0x20/0x20 [vhost]
>>>> 001:  ? kthread_park+0x90/0x90
>>>> 001:  ret_from_fork+0x35/0x40
>>> This call trace is not of a reentrant call; there is only one call to eventfd_signal.  It does fit the symptoms that Daniel reported for PREEMPT_RT though.
>>>
>>>> https://lore.kernel.org/lkml/[email protected]/
>>> This one is about PREEMPT_RT, so it would be fixed by local_lock.
>>>
>>> There _may_ be two bugs, so let's start by fixing this one.  Once this one is fixed, we will examine the call stacks of any further reports, and diagnose whether the second bug (if it exists) is related to vDUSE, PREEMPT_RT or neeither.
>>
>> For VDUSE we may still need the patch since it tries to relay notifications (eventfds) which means the recursion of the eventfd signal.
>>
>> But looking at the comment in the eventfd_signal() which say we should check with eventfd_signal_count() and delay the signal into a safe context (e.g workqueue etc).
> The main concern when adding eventfd count at the very beginning is "Deadlock or stack overflow" in the inline comment. If we can avoid deadlock and one depth of nest is acceptable, I think it's safe to set the max count to 2.


That's my understanding as well.


>
> The author of the eventfd count kind of also agrees with this.
> https://lore.kernel.org/lkml/[email protected]/


Ok.

Thanks


>
> Thanks,
> Zhe
>
>> Thanks
>>
>>
>>> Paolo
>>>

2021-07-16 06:57:16

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 16/07/21 04:06, Hillf Danton wrote:
>> With the patch:
>> - no warn
>> - continue using the VM normally...
> Well with the patch applied, the VM works fine without the stuff protected
> by the spin_lock_irqsave(), then without the patch why simply printing a
> warning makes the VM dumb, given the warning is there actually also preventing
> you from touching the lock.
>

If the warning is triggered, eventfd_signal will not do the wakeup.
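
That is, the function bails out before it ever gets to the wakeup
(current mainline, abbreviated):

	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
		return 0;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	...
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN);

so the kick that triggered the warning is simply dropped.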

Paolo

2021-07-16 08:03:35

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 16/07/21 09:55, Hillf Danton wrote:
> On Fri, 16 Jul 2021 08:54:58 +0200 Paolo Bonzini wrote:
>> On 16/07/21 04:06, Hillf Danton wrote:
>>>> With the patch:
>>>> - no warn
>>>> - continue using the VM normally...
>>> Well with the patch applied, the VM works fine without the stuff protected
>>> by the spin_lock_irqsave(), then without the patch why simply printing a
>>> warning makes the VM dumb, given the warning is there actually also
>>> preventing you from touching the lock.
>>
>> If the warning is triggered, eventfd_signal will not do the wakeup.
>
> [I am assuming we are not talking about the deadlock in the comment.]
>
> No preemption occurred without the warning printed.
> Why would the wakeup behavior change without preemption?

Sorry, I don't follow. What I'm saying is that without the patch:

* the warning only occurs if preemption occurs during the
spin_lock_irqsave critical section (and therefore it can only occur in
PREEMPT_RT kernels)

* the warning causes an early return 0 that messes up the VM's networking
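
Roughly, on an RT host (same CPU, hypothetical interleaving):

   vhost thread A                        unrelated thread B
   --------------                        ------------------
   eventfd_signal(ctx_a)
     spin_lock_irqsave()                 /* a sleeping lock on RT */
     this_cpu_inc(eventfd_wake_count)    /* count = 1 */
         <preempted>
                                         eventfd_signal(ctx_b)
                                           WARN_ON_ONCE(count)  /* spurious */
                                           return 0             /* wakeup lost */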

Paolo

2021-07-16 11:57:08

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 16/07/21 11:37, Hillf Danton wrote:
> On Fri, 16 Jul 2021 09:59:15 +0200 Paolo Bonzini wrote:
>> * the warning only occurs if preemption occurs during the
>> spin_lock_irqsave critical section (and therefore it can only occur in
>> PREEMPT_RT kernels)
>
> With that lock held, no waitqueue entry can be added on to the WQ - IOW no
> wakeup will go stray.
>
>> * the warning causes an early return 0 that messes up the VM's networking
>
> Is the messup due to the zero or wakeup?

It's caused by the missing wakeup, i.e. eventfd_signal not really
signaling anything.

Paolo

2021-07-20 01:58:10

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 18/07/21 14:42, Hillf Danton wrote:
>> It's caused by the missing wakeup, i.e. eventfd_signal not really
>> signaling anything.
>
> Can you please point me to the waiters in the mainline?

It's irqfd_wakeup.

> There are two cases of write_seqcount_begin in x/virt/kvm/eventfd.c, and
> in kvm_irqfd_deassign() it is surrounded by spin_lock_irq(&kvm->irqfds.lock)
> that also protects irqfd_update().
>
> What isn't clear is if the risk is zero that either case can be preempted by
> seqcount reader. That risk may end up with the livelock described in
> x/Documentation/locking/seqlock.rst.

Since the introduction of seqcount_spinlock_t, the writers automatically
disable preemption. This is definitely the right thing in this case
where the seqcount writers are small enough, and the readers are hot
enough, that using a local lock would be too heavyweight.

Without that, the livelock would be possible, though very unlikely. In
practice seqcount updates should only happen while the producer is
quiescent; and also the seqcount readers and writers will often be
pinned to separate CPUs.

Paolo

> +A sequence counter write side critical section must never be preempted
> +or interrupted by read side sections. Otherwise the reader will spin for
> +the entire scheduler tick due to the odd sequence count value and the
> +interrupted writer. If that reader belongs to a real-time scheduling
> +class, it can spin forever and the kernel will livelock.
>

2021-07-21 07:32:48

by Thomas Gleixner

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On Wed, Jul 21 2021 at 15:04, Hillf Danton wrote:
>
> But the preempting waker can not make sense without the waiter who is bloody
> special. Why is it so in the first place? Or it is not at all but the race
> existing from Monday to Friday.

See the large comment in eventfd_poll().

2021-07-21 11:11:57

by Paolo Bonzini

Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 21/07/21 12:11, Hillf Danton wrote:
> On Wed, 21 Jul 2021 09:25:32 +0200 Thomas Gleixner wrote:
>> On Wed, Jul 21 2021 at 15:04, Hillf Danton wrote:
>>>
>>> But the preempting waker cannot make sense without the waiter, which is the
>>> bloody special part. Why is it so in the first place? Or is it not special at
>>> all, and the race has simply existed all along?
>>
>> See the large comment in eventfd_poll().
>
> Is it likely for a reader to make eventfd_poll() return 0?
>
> read * poll write
> ---- * ----------------- ------------
> * count = ctx->count (INVALID!)
> * lock ctx->qwh.lock
> * ctx->count += n
> * **waitqueue_active is false**
> * **no wake_up_locked_poll!**
> * unlock ctx->qwh.lock
>
> lock ctx->qwh.lock
> *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
> ctx->count -= *cnt;
> **waitqueue_active is false**
> unlock ctx->qwh.lock
>
> * lock ctx->wqh.lock (in poll_wait)
> * __add_wait_queue
> * unlock ctx->wqh.lock
> * eventfd_poll returns 0
> */
> count = READ_ONCE(ctx->count);
>

No, it's simply impossible. The same comment explains why: "count =
ctx->count" cannot move above poll_wait's locking of ctx->wqh.lock.

Paolo

2021-07-23 08:01:08

by Paolo Bonzini

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 23/07/21 04:23, Hillf Danton wrote:
> Detect concurrent reader and writer by reading event counter before and
> after poll_wait(), and determine feedback with the case of unstable
> counter taken into account.
>
> Cut the big comment as the added barriers speak for themselves.

First and foremost, I'm not sure what you are trying to fix.

Second, the patch is wrong even without taking into account the lockless
accesses, because the condition for returning EPOLLOUT is certainly wrong.

Third, barriers very rarely speak for themselves. In particular what
do they pair with? It seems to me that you are basically reintroducing
the same mistake that commit a484c3dd9426 ("eventfd: document lockless
access in eventfd_poll", 2016-03-22) fixed, at the time where the big
comment was introduced:

Things aren't as simple as the read barrier in eventfd_poll
would suggest. In fact, the read barrier, besides lacking a
comment, is not paired in any obvious manner with another read
barrier, and it is pointless because it is sitting between a write
(deep in poll_wait) and the read of ctx->count.

Paolo


> +++ x/fs/eventfd.c
> @@ -131,49 +131,20 @@ static __poll_t eventfd_poll(struct file
> {
> struct eventfd_ctx *ctx = file->private_data;
> __poll_t events = 0;
> - u64 count;
> + u64 c0, count;
> +
> + c0 = ctx->count;
> + smp_rmb();
>
> poll_wait(file, &ctx->wqh, wait);
>
> - /*
> - * All writes to ctx->count occur within ctx->wqh.lock. This read
> - * can be done outside ctx->wqh.lock because we know that poll_wait
> - * takes that lock (through add_wait_queue) if our caller will sleep.
> - *
> - * The read _can_ therefore seep into add_wait_queue's critical
> - * section, but cannot move above it! add_wait_queue's spin_lock acts
> - * as an acquire barrier and ensures that the read be ordered properly
> - * against the writes. The following CAN happen and is safe:
> - *
> - * poll write
> - * ----------------- ------------
> - * lock ctx->wqh.lock (in poll_wait)
> - * count = ctx->count
> - * __add_wait_queue
> - * unlock ctx->wqh.lock
> - * lock ctx->qwh.lock
> - * ctx->count += n
> - * if (waitqueue_active)
> - * wake_up_locked_poll
> - * unlock ctx->qwh.lock
> - * eventfd_poll returns 0
> - *
> - * but the following, which would miss a wakeup, cannot happen:
> - *
> - * poll write
> - * ----------------- ------------
> - * count = ctx->count (INVALID!)
> - * lock ctx->qwh.lock
> - * ctx->count += n
> - * **waitqueue_active is false**
> - * **no wake_up_locked_poll!**
> - * unlock ctx->qwh.lock
> - * lock ctx->wqh.lock (in poll_wait)
> - * __add_wait_queue
> - * unlock ctx->wqh.lock
> - * eventfd_poll returns 0
> - */
> - count = READ_ONCE(ctx->count);
> + smp_rmb();
> + count = ctx->count;
> +
> + if (c0 < count)
> + return EPOLLIN;
> + if (c0 > count)
> + return EPOLLOUT;
>
> if (count > 0)
> events |= EPOLLIN;
>

2021-07-23 11:00:47

by Paolo Bonzini

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 23/07/21 11:48, Hillf Danton wrote:
> On Fri, 23 Jul 2021 09:59:55 +0200 Paolo Bonzini wrote:
>> First and foremost, I'm not sure what you are trying to fix.
>
> The change proposed builds the return value without assuming that the
> event count is stable across poll_wait(). If it is unstable, then we know
> there is a concurrent reader and/or writer, both of which are currently ignored.

Concurrent reads or writes have their own wake_up_locked_poll calls.
Why are they not enough?
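
For instance, the read side in fs/eventfd.c does, roughly (trimmed sketch,
not a verbatim excerpt):

  spin_lock_irq(&ctx->wqh.lock);
  eventfd_ctx_do_read(ctx, &ucnt);        /* consumes ctx->count */
  if (waitqueue_active(&ctx->wqh))
          wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
  spin_unlock_irq(&ctx->wqh.lock);

so a task polling for EPOLLOUT is woken by the read itself, under the same
ctx->wqh.lock that eventfd_poll()'s poll_wait() takes.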

>> Second, the patch is wrong even without taking into account the lockless
>> accesses, because the condition for returning EPOLLOUT is certainly wrong.
>
> Given it is detected that the event count was consumed, there is room in the
> event count, racy as that is, for the writer to make some progress.

For one, you do not return EPOLLIN|EPOLLOUT correctly.

>> Third, barriers very rarely speak for themselves. In particular what
>> do they pair with?
>
> There is no need to consider pair frankly. Barriers are just readded for
> removing the seep in the comment. Then the comment goes with the seep.

Then you would need an smp_mb() to order a spin_unlock() against a
READ_ONCE(). But adding memory barriers randomly is the worst way to
write a lockless algorithm that you can explain to others, and "there is
no need to consider pair frankly" hardly convinces me that you've put
enough thought into the change you're proposing.

(Shameless plug: https://lwn.net/Articles/844224/).
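
To illustrate what "pairing" means, a generic sketch unrelated to eventfd
itself: each barrier's comment names its counterpart, and the reader's
barrier is only useful because the writer has the matching one.

  static int data;
  static int flag;

  void producer(void)
  {
          WRITE_ONCE(data, 42);
          smp_wmb();              /* pairs with smp_rmb() in consumer() */
          WRITE_ONCE(flag, 1);
  }

  int consumer(void)
  {
          if (READ_ONCE(flag)) {
                  smp_rmb();      /* pairs with smp_wmb() in producer() */
                  return READ_ONCE(data);
          }
          return -1;              /* flag not yet set */
  }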

> What the comment does not cover is the cases of more-than-two-party race.

But you still haven't explained what the bug is there.

Paolo

2021-07-26 11:04:16

by Paolo Bonzini

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 24/07/21 06:33, Hillf Danton wrote:
> lock WQ
> count += n
> no waiter
> unlock WQ

Ok, this is a write.

>
> lock WQ
> add waiter for EPOLLIN
> unlock WQ

This is eventfd_poll(). It hasn't yet returned EPOLLIN.

> lock WQ
> count = 0
> wakeup EPOLLOUT
> unlock WQ

This is a read().

> lock WQ
> count += n
> no waiter
> unlock WQ

This is wrong; after "unlock WQ" on CPU3 there *is* a waiter, and no one has
woken it up yet.

Paolo

> ------------------------------- c1 = count

2021-07-28 08:07:51

by Thomas Gleixner

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On Wed, Jul 14 2021 at 12:35, Paolo Bonzini wrote:
> On 14/07/21 11:23, Jason Wang wrote:
>
> +static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
> DEFINE_PER_CPU(int, eventfd_wake_count);
>
> static DEFINE_IDA(eventfd_ida);
> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> * it returns true, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + local_lock(&eventfd_wake_lock);
> + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
> + local_unlock(&eventfd_wake_lock);
> return 0;
> + }
>
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> this_cpu_inc(eventfd_wake_count);
> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> this_cpu_dec(eventfd_wake_count);
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> + local_unlock(&eventfd_wake_lock);

Yes, that cures it, but if you think about what this wants to prevent,
then having the recursion counter per CPU is at least suboptimal.

Something like the untested below perhaps?

Thanks,

tglx
---
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_que
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
#include <linux/idr.h>
#include <linux/uio.h>

-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
@@ -67,21 +65,21 @@ struct eventfd_ctx {
* Deadlock or stack overflow issues can happen if we recurse here
* through waitqueue wakeup handlers. If the caller users potentially
* nested waitqueues with custom wakeup handlers, then it should
- * check eventfd_signal_count() before calling this function. If
- * it returns true, the eventfd_signal() call should be deferred to a
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(current->in_eventfd_signal))
return 0;

spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ current->in_eventfd_signal = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);

return n;
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -43,11 +43,9 @@ int eventfd_ctx_remove_wait_queue(struct
__u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return this_cpu_read(eventfd_wake_count);
+ return !current->in_eventfd_signal;
}

#else /* CONFIG_EVENTFD */
@@ -78,9 +76,9 @@ static inline int eventfd_ctx_remove_wai
return -ENOSYS;
}

-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return false;
+ return true;
}

static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -863,6 +863,8 @@ struct task_struct {
/* Used by page_owner=on to detect recursion in page tracking. */
unsigned in_page_owner:1;
#endif
+ /* Recursion prevention for eventfd_signal() */
+ unsigned in_eventfd_signal:1;

unsigned long atomic_flags; /* Flags requiring atomic access. */



2021-07-28 10:23:20

by Paolo Bonzini

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On 28/07/21 10:06, Thomas Gleixner wrote:
> On Wed, Jul 14 2021 at 12:35, Paolo Bonzini wrote:
>> On 14/07/21 11:23, Jason Wang wrote:
>>
>> +static local_lock_t eventfd_wake_lock = INIT_LOCAL_LOCK(eventfd_wake_lock);
>> DEFINE_PER_CPU(int, eventfd_wake_count);
>>
>> static DEFINE_IDA(eventfd_ida);
>> @@ -71,8 +73,11 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>> * it returns true, the eventfd_signal() call should be deferred to a
>> * safe context.
>> */
>> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
>> + local_lock(&eventfd_wake_lock);
>> + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) {
>> + local_unlock(&eventfd_wake_lock);
>> return 0;
>> + }
>>
>> spin_lock_irqsave(&ctx->wqh.lock, flags);
>> this_cpu_inc(eventfd_wake_count);
>> @@ -83,6 +88,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
>> this_cpu_dec(eventfd_wake_count);
>> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>> + local_unlock(&eventfd_wake_lock);
>
> Yes, that cures it, but if you think about what this wants to prevent,
> then having the recursion counter per CPU is at least suboptimal.
>
> Something like the untested below perhaps?

Yes, that works (it should just be #ifdef CONFIG_EVENTFD).

On !PREEMPT_RT the percpu variable consumes memory while your patch uses
none (there are plenty of spare bits in current), but it is otherwise
basically the same. On PREEMPT_RT the local_lock is certainly more
expensive.

Thanks,

Paolo

> Thanks,
>
> tglx
> ---
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_que
> list_del(&iocb->ki_list);
> iocb->ki_res.res = mangle_poll(mask);
> req->done = true;
> - if (iocb->ki_eventfd && eventfd_signal_count()) {
> + if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
> iocb = NULL;
> INIT_WORK(&req->work, aio_poll_put_work);
> schedule_work(&req->work);
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -25,8 +25,6 @@
> #include <linux/idr.h>
> #include <linux/uio.h>
>
> -DEFINE_PER_CPU(int, eventfd_wake_count);
> -
> static DEFINE_IDA(eventfd_ida);
>
> struct eventfd_ctx {
> @@ -67,21 +65,21 @@ struct eventfd_ctx {
> * Deadlock or stack overflow issues can happen if we recurse here
> * through waitqueue wakeup handlers. If the caller users potentially
> * nested waitqueues with custom wakeup handlers, then it should
> - * check eventfd_signal_count() before calling this function. If
> - * it returns true, the eventfd_signal() call should be deferred to a
> + * check eventfd_signal_allowed() before calling this function. If
> + * it returns false, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + if (WARN_ON_ONCE(current->in_eventfd_signal))
> return 0;
>
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> - this_cpu_inc(eventfd_wake_count);
> + current->in_eventfd_signal = 1;
> if (ULLONG_MAX - ctx->count < n)
> n = ULLONG_MAX - ctx->count;
> ctx->count += n;
> if (waitqueue_active(&ctx->wqh))
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> - this_cpu_dec(eventfd_wake_count);
> + current->in_eventfd_signal = 0;
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>
> return n;
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -43,11 +43,9 @@ int eventfd_ctx_remove_wait_queue(struct
> __u64 *cnt);
> void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>
> -DECLARE_PER_CPU(int, eventfd_wake_count);
> -
> -static inline bool eventfd_signal_count(void)
> +static inline bool eventfd_signal_allowed(void)
> {
> - return this_cpu_read(eventfd_wake_count);
> + return !current->in_eventfd_signal;
> }
>
> #else /* CONFIG_EVENTFD */
> @@ -78,9 +76,9 @@ static inline int eventfd_ctx_remove_wai
> return -ENOSYS;
> }
>
> -static inline bool eventfd_signal_count(void)
> +static inline bool eventfd_signal_allowed(void)
> {
> - return false;
> + return true;
> }
>
> static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -863,6 +863,8 @@ struct task_struct {
> /* Used by page_owner=on to detect recursion in page tracking. */
> unsigned in_page_owner:1;
> #endif
> + /* Recursion prevention for eventfd_signal() */
> + unsigned in_eventfd_signal:1;
>
> unsigned long atomic_flags; /* Flags requiring atomic access. */
>
>


2021-07-28 19:09:03

by Thomas Gleixner

[permalink] [raw]
Subject: Re: 5.13-rt1 + KVM = WARNING: at fs/eventfd.c:74 eventfd_signal()

On Wed, Jul 28 2021 at 12:21, Paolo Bonzini wrote:
> On 28/07/21 10:06, Thomas Gleixner wrote:
>> On Wed, Jul 14 2021 at 12:35, Paolo Bonzini wrote:
>> Yes, that cures it, but if you think about what this wants to prevent,
>> then having the recursion counter per CPU is at least suboptimal.
>>
>> Something like the untested below perhaps?
>
> Yes, that works (it should just be #ifdef CONFIG_EVENTFD).

Yup and it lacks an include.

> On !PREEMPT_RT the percpu variable consumes memory while your patch uses
> none (there are plenty of spare bits in current), but it is otherwise
> basically the same. On PREEMPT_RT the local_lock is certainly more
> expensive.

Let me send a proper patch for that.

Thanks,

tglx

2021-07-29 11:04:23

by Thomas Gleixner

[permalink] [raw]
Subject: [PATCH] eventfd: Make signal recursion protection a task bit

The recursion protection for eventfd_signal() is based on a per CPU
variable and relies on the !RT semantics of spin_lock_irqsave() for
protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
disables preemption nor interrupts which allows the spin lock held section
to be preempted. If the preempting task invokes eventfd_signal() as well,
then the recursion warning triggers.

Paolo suggested to protect the per CPU variable with a local lock, but
that's heavyweight and actually not necessary. The goal of this protection
is to prevent the task stack from overflowing, which can be achieved with a
per task recursion protection as well.

Replace the per CPU variable with a per task bit similar to other recursion
protection bits like task_struct::in_page_owner. This works on both !RT and
RT kernels and removes as a side effect the extra per CPU storage.

No functional change for !RT kernels.

Reported-by: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
fs/aio.c | 2 +-
fs/eventfd.c | 12 +++++-------
include/linux/eventfd.h | 11 +++++------
include/linux/sched.h | 4 ++++
4 files changed, 15 insertions(+), 14 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_que
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
#include <linux/idr.h>
#include <linux/uio.h>

-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
@@ -67,21 +65,21 @@ struct eventfd_ctx {
* Deadlock or stack overflow issues can happen if we recurse here
* through waitqueue wakeup handlers. If the caller users potentially
* nested waitqueues with custom wakeup handlers, then it should
- * check eventfd_signal_count() before calling this function. If
- * it returns true, the eventfd_signal() call should be deferred to a
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(current->in_eventfd_signal))
return 0;

spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ current->in_eventfd_signal = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);

return n;
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/percpu-defs.h>
#include <linux/percpu.h>
+#include <linux/sched.h>

/*
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -43,11 +44,9 @@ int eventfd_ctx_remove_wait_queue(struct
__u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return this_cpu_read(eventfd_wake_count);
+ return !current->in_eventfd_signal;
}

#else /* CONFIG_EVENTFD */
@@ -78,9 +77,9 @@ static inline int eventfd_ctx_remove_wai
return -ENOSYS;
}

-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return false;
+ return true;
}

static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -863,6 +863,10 @@ struct task_struct {
/* Used by page_owner=on to detect recursion in page tracking. */
unsigned in_page_owner:1;
#endif
+#ifdef CONFIG_EVENTFD
+ /* Recursion prevention for eventfd_signal() */
+ unsigned in_eventfd_signal:1;
+#endif

unsigned long atomic_flags; /* Flags requiring atomic access. */


Subject: Re: [PATCH] eventfd: Make signal recursion protection a task bit

On 7/29/21 1:01 PM, Thomas Gleixner wrote:
> The recursion protection for eventfd_signal() is based on a per CPU
> variable and relies on the !RT semantics of spin_lock_irqsave() for
> protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
> disables preemption nor interrupts which allows the spin lock held section
> to be preempted. If the preempting task invokes eventfd_signal() as well,
> then the recursion warning triggers.
>
> Paolo suggested to protect the per CPU variable with a local lock, but
> that's heavyweight and actually not necessary. The goal of this protection
> is to prevent the task stack from overflowing, which can be achieved with a
> per task recursion protection as well.
>
> Replace the per CPU variable with a per task bit similar to other recursion
> protection bits like task_struct::in_page_owner. This works on both !RT and
> RT kernels and removes as a side effect the extra per CPU storage.
>
> No functional change for !RT kernels.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>

Testing....

Thanks!
-- Daniel


Subject: Re: [PATCH] eventfd: Make signal recursion protection a task bit

On 7/29/21 1:01 PM, Thomas Gleixner wrote:
> The recursion protection for eventfd_signal() is based on a per CPU
> variable and relies on the !RT semantics of spin_lock_irqsave() for
> protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
> disables preemption nor interrupts which allows the spin lock held section
> to be preempted. If the preempting task invokes eventfd_signal() as well,
> then the recursion warning triggers.
>
> Paolo suggested to protect the per CPU variable with a local lock, but
> that's heavyweight and actually not necessary. The goal of this protection
> is to prevent the task stack from overflowing, which can be achieved with a
> per task recursion protection as well.
>
> Replace the per CPU variable with a per task bit similar to other recursion
> protection bits like task_struct::in_page_owner. This works on both !RT and
> RT kernels and removes as a side effect the extra per CPU storage.
>
> No functional change for !RT kernels.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>

Tested-by: Daniel Bristot de Oliveira <[email protected]>

Thanks!


2021-08-26 07:05:59

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] eventfd: Make signal recursion protection a task bit


On 2021/7/29 7:01 PM, Thomas Gleixner wrote:
> The recursion protection for eventfd_signal() is based on a per CPU
> variable and relies on the !RT semantics of spin_lock_irqsave() for
> protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
> disables preemption nor interrupts which allows the spin lock held section
> to be preempted. If the preempting task invokes eventfd_signal() as well,
> then the recursion warning triggers.
>
> Paolo suggested to protect the per CPU variable with a local lock, but
> that's heavyweight and actually not necessary. The goal of this protection
> is to prevent the task stack from overflowing, which can be achieved with a
> per task recursion protection as well.
>
> Replace the per CPU variable with a per task bit similar to other recursion
> protection bits like task_struct::in_page_owner. This works on both !RT and
> RT kernels and removes as a side effect the extra per CPU storage.
>
> No functional change for !RT kernels.
>
> Reported-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>


Acked-by: Jason Wang <[email protected]>

Does anyone want to pick up this patch?


> ---
> fs/aio.c | 2 +-
> fs/eventfd.c | 12 +++++-------
> include/linux/eventfd.h | 11 +++++------
> include/linux/sched.h | 4 ++++
> 4 files changed, 15 insertions(+), 14 deletions(-)
>
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_que
> list_del(&iocb->ki_list);
> iocb->ki_res.res = mangle_poll(mask);
> req->done = true;
> - if (iocb->ki_eventfd && eventfd_signal_count()) {
> + if (iocb->ki_eventfd && eventfd_signal_allowed()) {
> iocb = NULL;
> INIT_WORK(&req->work, aio_poll_put_work);
> schedule_work(&req->work);
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -25,8 +25,6 @@
> #include <linux/idr.h>
> #include <linux/uio.h>
>
> -DEFINE_PER_CPU(int, eventfd_wake_count);
> -
> static DEFINE_IDA(eventfd_ida);
>
> struct eventfd_ctx {
> @@ -67,21 +65,21 @@ struct eventfd_ctx {
> * Deadlock or stack overflow issues can happen if we recurse here
> * through waitqueue wakeup handlers. If the caller users potentially
> * nested waitqueues with custom wakeup handlers, then it should
> - * check eventfd_signal_count() before calling this function. If
> - * it returns true, the eventfd_signal() call should be deferred to a
> + * check eventfd_signal_allowed() before calling this function. If
> + * it returns false, the eventfd_signal() call should be deferred to a
> * safe context.
> */
> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
> + if (WARN_ON_ONCE(current->in_eventfd_signal))
> return 0;
>
> spin_lock_irqsave(&ctx->wqh.lock, flags);
> - this_cpu_inc(eventfd_wake_count);
> + current->in_eventfd_signal = 1;
> if (ULLONG_MAX - ctx->count < n)
> n = ULLONG_MAX - ctx->count;
> ctx->count += n;
> if (waitqueue_active(&ctx->wqh))
> wake_up_locked_poll(&ctx->wqh, EPOLLIN);
> - this_cpu_dec(eventfd_wake_count);
> + current->in_eventfd_signal = 0;
> spin_unlock_irqrestore(&ctx->wqh.lock, flags);
>
> return n;
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -14,6 +14,7 @@
> #include <linux/err.h>
> #include <linux/percpu-defs.h>
> #include <linux/percpu.h>
> +#include <linux/sched.h>
>
> /*
> * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
> @@ -43,11 +44,9 @@ int eventfd_ctx_remove_wait_queue(struct
> __u64 *cnt);
> void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
>
> -DECLARE_PER_CPU(int, eventfd_wake_count);
> -
> -static inline bool eventfd_signal_count(void)
> +static inline bool eventfd_signal_allowed(void)
> {
> - return this_cpu_read(eventfd_wake_count);
> + return !current->in_eventfd_signal;
> }
>
> #else /* CONFIG_EVENTFD */
> @@ -78,9 +77,9 @@ static inline int eventfd_ctx_remove_wai
> return -ENOSYS;
> }
>
> -static inline bool eventfd_signal_count(void)
> +static inline bool eventfd_signal_allowed(void)
> {
> - return false;
> + return true;
> }
>
> static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -863,6 +863,10 @@ struct task_struct {
> /* Used by page_owner=on to detect recursion in page tracking. */
> unsigned in_page_owner:1;
> #endif
> +#ifdef CONFIG_EVENTFD
> + /* Recursion prevention for eventfd_signal() */
> + unsigned in_eventfd_signal:1;
> +#endif
>
> unsigned long atomic_flags; /* Flags requiring atomic access. */
>
>

2021-08-27 23:45:47

by tip-bot2 for Thomas Gleixner

[permalink] [raw]
Subject: [tip: sched/core] eventfd: Make signal recursion protection a task bit

The following commit has been merged into the sched/core branch of tip:

Commit-ID: b542e383d8c005f06a131e2b40d5889b812f19c6
Gitweb: https://git.kernel.org/tip/b542e383d8c005f06a131e2b40d5889b812f19c6
Author: Thomas Gleixner <[email protected]>
AuthorDate: Thu, 29 Jul 2021 13:01:59 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Sat, 28 Aug 2021 01:33:02 +02:00

eventfd: Make signal recursion protection a task bit

The recursion protection for eventfd_signal() is based on a per CPU
variable and relies on the !RT semantics of spin_lock_irqsave() for
protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither
disables preemption nor interrupts which allows the spin lock held section
to be preempted. If the preempting task invokes eventfd_signal() as well,
then the recursion warning triggers.

Paolo suggested to protect the per CPU variable with a local lock, but
that's heavyweight and actually not necessary. The goal of this protection
is to prevent the task stack from overflowing, which can be achieved with a
per task recursion protection as well.

Replace the per CPU variable with a per task bit similar to other recursion
protection bits like task_struct::in_page_owner. This works on both !RT and
RT kernels and removes as a side effect the extra per CPU storage.

No functional change for !RT kernels.

Reported-by: Daniel Bristot de Oliveira <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Daniel Bristot de Oliveira <[email protected]>
Acked-by: Jason Wang <[email protected]>
Cc: Al Viro <[email protected]>
Link: https://lore.kernel.org/r/87wnp9idso.ffs@tglx

---
fs/aio.c | 2 +-
fs/eventfd.c | 12 +++++-------
include/linux/eventfd.h | 11 +++++------
include/linux/sched.h | 4 ++++
4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 76ce0cc..51b08ab 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6d..3627dd7 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
#include <linux/idr.h>
#include <linux/uio.h>

-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
@@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* Deadlock or stack overflow issues can happen if we recurse here
* through waitqueue wakeup handlers. If the caller users potentially
* nested waitqueues with custom wakeup handlers, then it should
- * check eventfd_signal_count() before calling this function. If
- * it returns true, the eventfd_signal() call should be deferred to a
+ * check eventfd_signal_allowed() before calling this function. If
+ * it returns false, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(current->in_eventfd_signal))
return 0;

spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ current->in_eventfd_signal = 1;
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ current->in_eventfd_signal = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);

return n;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524..305d5f1 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/percpu-defs.h>
#include <linux/percpu.h>
+#include <linux/sched.h>

/*
* CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -43,11 +44,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
__u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return this_cpu_read(eventfd_wake_count);
+ return !current->in_eventfd_signal;
}

#else /* CONFIG_EVENTFD */
@@ -78,9 +77,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
return -ENOSYS;
}

-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
{
- return false;
+ return true;
}

static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3bb9fec..6421a9a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -864,6 +864,10 @@ struct task_struct {
/* Used by page_owner=on to detect recursion in page tracking. */
unsigned in_page_owner:1;
#endif
+#ifdef CONFIG_EVENTFD
+ /* Recursion prevention for eventfd_signal() */
+ unsigned in_eventfd_signal:1;
+#endif

unsigned long atomic_flags; /* Flags requiring atomic access. */