LinuxLists.cc - [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

2020-04-07 11:01:20

Subject: [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

From: He Zhe <[email protected]>

commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
introduces a percpu counter that tracks the percpu recursion depth and
warns if it is greater than one, to avoid potential deadlock and stack
overflow.

However sometimes different eventfds may be used in parallel.
Specifically, when high network load goes through kvm and vhost, working
as below, it would trigger the following call trace.

- 100.00%
- 66.51%
ret_from_fork
kthread
- vhost_worker
- 33.47% handle_tx_kick
handle_tx
handle_tx_copy
vhost_tx_batch.isra.0
vhost_add_used_and_signal_n
eventfd_signal
- 33.05% handle_rx_net
handle_rx
vhost_add_used_and_signal_n
eventfd_signal
- 33.49%
ioctl
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
ksys_ioctl
do_vfs_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
vmx_handle_exit
handle_ept_misconfig
kvm_io_bus_write
__kvm_io_bus_write
eventfd_signal

001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
---- snip ----
001: Call Trace:
001: vhost_signal+0x15e/0x1b0 [vhost]
001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
001: handle_rx+0xb9/0x900 [vhost_net]
001: handle_rx_net+0x15/0x20 [vhost_net]
001: vhost_worker+0xbe/0x120 [vhost]
001: kthread+0x106/0x140
001: ? log_used.part.0+0x20/0x20 [vhost]
001: ? kthread_park+0x90/0x90
001: ret_from_fork+0x35/0x40
001: ---[ end trace 0000000000000003 ]---

This patch moves the percpu counter into eventfd control structure and
does the clean-ups, so that eventfd can still be protected from deadlock
while allowing different ones to work in parallel.

As to potential stack overflow, we might want to figure out a better
solution in the future to warn when the stack is about to overflow so it
can be better utilized, rather than break the working flow when just the
second one comes.

Fixes: b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
Signed-off-by: He Zhe <[email protected]>
---
Having searched the lkml, I didn't find the discussion of the aio use case
commit b5e683d5cab8 was added for, so I didn't validate this patch against
it.

fs/eventfd.c | 31 +++++++++----------------------
include/linux/eventfd.h | 24 +++++++++++++++++++-----
2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 78e41c7..bb4385a 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -24,26 +24,8 @@
#include <linux/seq_file.h>
#include <linux/idr.h>

-DEFINE_PER_CPU(int, eventfd_wake_count);
-
static DEFINE_IDA(eventfd_ida);

-struct eventfd_ctx {
- struct kref kref;
- wait_queue_head_t wqh;
- /*
- * Every time that a write(2) is performed on an eventfd, the
- * value of the __u64 being written is added to "count" and a
- * wakeup is performed on "wqh". A read(2) will return the "count"
- * value to userspace, and will reset "count" to zero. The kernel
- * side eventfd_signal() also, adds to the "count" counter and
- * issue a wakeup.
- */
- __u64 count;
- unsigned int flags;
- int id;
-};
-
/**
* eventfd_signal - Adds @n to the eventfd counter.
* @ctx: [in] Pointer to the eventfd context.
@@ -70,17 +52,17 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
* it returns true, the eventfd_signal() call should be deferred to a
* safe context.
*/
- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ if (WARN_ON_ONCE(this_cpu_read(*ctx->wake_count)))
return 0;

spin_lock_irqsave(&ctx->wqh.lock, flags);
- this_cpu_inc(eventfd_wake_count);
+ this_cpu_inc(*ctx->wake_count);
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
- this_cpu_dec(eventfd_wake_count);
+ this_cpu_dec(*ctx->wake_count);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);

return n;
@@ -406,7 +388,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
static int do_eventfd(unsigned int count, int flags)
{
struct eventfd_ctx *ctx;
- int fd;
+ int fd, cpu;

/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
@@ -424,6 +406,11 @@ static int do_eventfd(unsigned int count, int flags)
ctx->count = count;
ctx->flags = flags;
ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+ ctx->wake_count = alloc_percpu(typeof(*ctx->wake_count));
+ if (!ctx->wake_count)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(ctx->wake_count, cpu) = 0;

fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index dc4fd8a..b0d0f44 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -29,7 +29,23 @@
#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)

-struct eventfd_ctx;
+struct eventfd_ctx {
+ struct kref kref;
+ wait_queue_head_t wqh;
+ /*
+ * Every time that a write(2) is performed on an eventfd, the
+ * value of the __u64 being written is added to "count" and a
+ * wakeup is performed on "wqh". A read(2) will return the "count"
+ * value to userspace, and will reset "count" to zero. The kernel
+ * side eventfd_signal() also, adds to the "count" counter and
+ * issue a wakeup.
+ */
+ __u64 count;
+ unsigned int flags;
+ int id;
+ int __percpu *wake_count;
+};
+
struct file;

#ifdef CONFIG_EVENTFD
@@ -42,11 +58,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
__u64 *cnt);

-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_count(struct eventfd_ctx *ctx)
{
- return this_cpu_read(eventfd_wake_count);
+ return this_cpu_read(*ctx->wake_count);
}

#else /* CONFIG_EVENTFD */
--
2.7.4

2020-04-07 11:02:00

by He Zhe

[permalink] [raw]

Subject: [PATCH 2/2] aio: Update calling to eventfd_signal_count with right parameter

From: He Zhe <[email protected]>

eventfd_signal_count has been changed to take the eventfd context as a
parameter. Update the call site accordingly.

Signed-off-by: He Zhe <[email protected]>
---
fs/aio.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index 5f3d3d8..c03c487 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1693,7 +1693,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd &&
+ eventfd_signal_count(iocb->ki_eventfd)) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
--
2.7.4

2020-04-07 20:08:18

by Jens Axboe

[permalink] [raw]

Subject: Re: [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

On 4/7/20 3:59 AM, [email protected] wrote:
> From: He Zhe <[email protected]>
>
> commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
> introduces a percpu counter that tracks the percpu recursion depth and
> warn if it greater than one, to avoid potential deadlock and stack
> overflow.
>
> However sometimes different eventfds may be used in parallel.
> Specifically, when high network load goes through kvm and vhost, working
> as below, it would trigger the following call trace.
>
> - 100.00%
> - 66.51%
> ret_from_fork
> kthread
> - vhost_worker
> - 33.47% handle_tx_kick
> handle_tx
> handle_tx_copy
> vhost_tx_batch.isra.0
> vhost_add_used_and_signal_n
> eventfd_signal
> - 33.05% handle_rx_net
> handle_rx
> vhost_add_used_and_signal_n
> eventfd_signal
> - 33.49%
> ioctl
> entry_SYSCALL_64_after_hwframe
> do_syscall_64
> __x64_sys_ioctl
> ksys_ioctl
> do_vfs_ioctl
> kvm_vcpu_ioctl
> kvm_arch_vcpu_ioctl_run
> vmx_handle_exit
> handle_ept_misconfig
> kvm_io_bus_write
> __kvm_io_bus_write
> eventfd_signal
>
> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
> ---- snip ----
> 001: Call Trace:
> 001: vhost_signal+0x15e/0x1b0 [vhost]
> 001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
> 001: handle_rx+0xb9/0x900 [vhost_net]
> 001: handle_rx_net+0x15/0x20 [vhost_net]
> 001: vhost_worker+0xbe/0x120 [vhost]
> 001: kthread+0x106/0x140
> 001: ? log_used.part.0+0x20/0x20 [vhost]
> 001: ? kthread_park+0x90/0x90
> 001: ret_from_fork+0x35/0x40
> 001: ---[ end trace 0000000000000003 ]---
>
> This patch moves the percpu counter into eventfd control structure and
> does the clean-ups, so that eventfd can still be protected from deadlock
> while allowing different ones to work in parallel.
>
> As to potential stack overflow, we might want to figure out a better
> solution in the future to warn when the stack is about to overflow so it
> can be better utilized, rather than break the working flow when just the
> second one comes.

This doesn't work for the infinite recursion case, the state has to be
global, or per thread.

--
Jens Axboe

2020-04-09 10:38:55

by He Zhe

[permalink] [raw]

Subject: Re: [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

On 4/8/20 4:06 AM, Jens Axboe wrote:
> On 4/7/20 3:59 AM, [email protected] wrote:
>> From: He Zhe <[email protected]>
>>
>> commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>> introduces a percpu counter that tracks the percpu recursion depth and
>> warn if it greater than one, to avoid potential deadlock and stack
>> overflow.
>>
>> However sometimes different eventfds may be used in parallel.
>> Specifically, when high network load goes through kvm and vhost, working
>> as below, it would trigger the following call trace.
>>
>> - 100.00%
>> - 66.51%
>> ret_from_fork
>> kthread
>> - vhost_worker
>> - 33.47% handle_tx_kick
>> handle_tx
>> handle_tx_copy
>> vhost_tx_batch.isra.0
>> vhost_add_used_and_signal_n
>> eventfd_signal
>> - 33.05% handle_rx_net
>> handle_rx
>> vhost_add_used_and_signal_n
>> eventfd_signal
>> - 33.49%
>> ioctl
>> entry_SYSCALL_64_after_hwframe
>> do_syscall_64
>> __x64_sys_ioctl
>> ksys_ioctl
>> do_vfs_ioctl
>> kvm_vcpu_ioctl
>> kvm_arch_vcpu_ioctl_run
>> vmx_handle_exit
>> handle_ept_misconfig
>> kvm_io_bus_write
>> __kvm_io_bus_write
>> eventfd_signal
>>
>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
>> ---- snip ----
>> 001: Call Trace:
>> 001: vhost_signal+0x15e/0x1b0 [vhost]
>> 001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>> 001: handle_rx+0xb9/0x900 [vhost_net]
>> 001: handle_rx_net+0x15/0x20 [vhost_net]
>> 001: vhost_worker+0xbe/0x120 [vhost]
>> 001: kthread+0x106/0x140
>> 001: ? log_used.part.0+0x20/0x20 [vhost]
>> 001: ? kthread_park+0x90/0x90
>> 001: ret_from_fork+0x35/0x40
>> 001: ---[ end trace 0000000000000003 ]---
>>
>> This patch moves the percpu counter into eventfd control structure and
>> does the clean-ups, so that eventfd can still be protected from deadlock
>> while allowing different ones to work in parallel.
>>
>> As to potential stack overflow, we might want to figure out a better
>> solution in the future to warn when the stack is about to overflow so it
>> can be better utilized, rather than break the working flow when just the
>> second one comes.
> This doesn't work for the infinite recursion case, the state has to be
> global, or per thread.

Thanks, but I'm not very clear about why the counter has to be global or per
thread.

If the recursion happens on the same eventfd, the attempt to re-grab the same
ctx->wqh.lock would be blocked by the fd-specific counter in this patch.

If the recursion happens with a chain of different eventfds, that might lead to
a stack overflow issue. The issue should be handled, but it seems unnecessary to
stop at just the second ring (when the counter is going to be 2) of the chain.

Specifically in the vhost case, it runs very likely with heavy network load
which generates loads of eventfd_signal. Delaying the eventfd_signal to worker
threads will still end up violating the global counter later and failing as
above.

So we might want to take care of the potential overflow later, hopefully with a
measurement that can tell us if it's about to overflow.

Zhe

>

2020-04-09 15:46:48

by Jens Axboe

[permalink] [raw]

Subject: Re: [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

On 4/9/20 3:37 AM, He Zhe wrote:
>
>
> On 4/8/20 4:06 AM, Jens Axboe wrote:
>> On 4/7/20 3:59 AM, [email protected] wrote:
>>> From: He Zhe <[email protected]>
>>>
>>> commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>>> introduces a percpu counter that tracks the percpu recursion depth and
>>> warn if it greater than one, to avoid potential deadlock and stack
>>> overflow.
>>>
>>> However sometimes different eventfds may be used in parallel.
>>> Specifically, when high network load goes through kvm and vhost, working
>>> as below, it would trigger the following call trace.
>>>
>>> - 100.00%
>>> - 66.51%
>>> ret_from_fork
>>> kthread
>>> - vhost_worker
>>> - 33.47% handle_tx_kick
>>> handle_tx
>>> handle_tx_copy
>>> vhost_tx_batch.isra.0
>>> vhost_add_used_and_signal_n
>>> eventfd_signal
>>> - 33.05% handle_rx_net
>>> handle_rx
>>> vhost_add_used_and_signal_n
>>> eventfd_signal
>>> - 33.49%
>>> ioctl
>>> entry_SYSCALL_64_after_hwframe
>>> do_syscall_64
>>> __x64_sys_ioctl
>>> ksys_ioctl
>>> do_vfs_ioctl
>>> kvm_vcpu_ioctl
>>> kvm_arch_vcpu_ioctl_run
>>> vmx_handle_exit
>>> handle_ept_misconfig
>>> kvm_io_bus_write
>>> __kvm_io_bus_write
>>> eventfd_signal
>>>
>>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
>>> ---- snip ----
>>> 001: Call Trace:
>>> 001: vhost_signal+0x15e/0x1b0 [vhost]
>>> 001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>>> 001: handle_rx+0xb9/0x900 [vhost_net]
>>> 001: handle_rx_net+0x15/0x20 [vhost_net]
>>> 001: vhost_worker+0xbe/0x120 [vhost]
>>> 001: kthread+0x106/0x140
>>> 001: ? log_used.part.0+0x20/0x20 [vhost]
>>> 001: ? kthread_park+0x90/0x90
>>> 001: ret_from_fork+0x35/0x40
>>> 001: ---[ end trace 0000000000000003 ]---
>>>
>>> This patch moves the percpu counter into eventfd control structure and
>>> does the clean-ups, so that eventfd can still be protected from deadlock
>>> while allowing different ones to work in parallel.
>>>
>>> As to potential stack overflow, we might want to figure out a better
>>> solution in the future to warn when the stack is about to overflow so it
>>> can be better utilized, rather than break the working flow when just the
>>> second one comes.
>> This doesn't work for the infinite recursion case, the state has to be
>> global, or per thread.
>
> Thanks, but I'm not very clear about why the counter has to be global
> or per thread.
>
> If the recursion happens on the same eventfd, the attempt to re-grab
> the same ctx->wqh.lock would be blocked by the fd-specific counter in
> this patch.
>
> If the recursion happens with a chain of different eventfds, that
> might lead to a stack overflow issue. The issue should be handled but
> it seems unnecessary to stop the just the second ring(when the counter
> is going to be 2) of the chain.
>
> Specifically in the vhost case, it runs very likely with heavy network
> load which generates loads of eventfd_signal. Delaying the
> eventfd_signal to worker threads will still end up violating the
> global counter later and failing as above.
>
> So we might want to take care of the potential overflow later,
> hopefully with a measurement that can tell us if it's about to
> overflow.

The worry is different eventfds; recursion on a single one could be
detected by keeping state in the ctx itself. And yeah, I agree that one
level isn't very deep, but wakeup chains can be deep and we can't allow
a whole lot more. I'm sure folks would be open to increasing it, if some
worst case kind of data was collected to prove it's fine to go deeper.

--
Jens Axboe

2020-04-10 11:47:34

by He Zhe

[permalink] [raw]

Subject: Re: [PATCH 1/2] eventfd: Make wake counter work for single fd instead of all

On 4/9/20 11:44 PM, Jens Axboe wrote:
> On 4/9/20 3:37 AM, He Zhe wrote:
>>
>> On 4/8/20 4:06 AM, Jens Axboe wrote:
>>> On 4/7/20 3:59 AM, [email protected] wrote:
>>>> From: He Zhe <[email protected]>
>>>>
>>>> commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
>>>> introduces a percpu counter that tracks the percpu recursion depth and
>>>> warn if it greater than one, to avoid potential deadlock and stack
>>>> overflow.
>>>>
>>>> However sometimes different eventfds may be used in parallel.
>>>> Specifically, when high network load goes through kvm and vhost, working
>>>> as below, it would trigger the following call trace.
>>>>
>>>> - 100.00%
>>>> - 66.51%
>>>> ret_from_fork
>>>> kthread
>>>> - vhost_worker
>>>> - 33.47% handle_tx_kick
>>>> handle_tx
>>>> handle_tx_copy
>>>> vhost_tx_batch.isra.0
>>>> vhost_add_used_and_signal_n
>>>> eventfd_signal
>>>> - 33.05% handle_rx_net
>>>> handle_rx
>>>> vhost_add_used_and_signal_n
>>>> eventfd_signal
>>>> - 33.49%
>>>> ioctl
>>>> entry_SYSCALL_64_after_hwframe
>>>> do_syscall_64
>>>> __x64_sys_ioctl
>>>> ksys_ioctl
>>>> do_vfs_ioctl
>>>> kvm_vcpu_ioctl
>>>> kvm_arch_vcpu_ioctl_run
>>>> vmx_handle_exit
>>>> handle_ept_misconfig
>>>> kvm_io_bus_write
>>>> __kvm_io_bus_write
>>>> eventfd_signal
>>>>
>>>> 001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
>>>> ---- snip ----
>>>> 001: Call Trace:
>>>> 001: vhost_signal+0x15e/0x1b0 [vhost]
>>>> 001: vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
>>>> 001: handle_rx+0xb9/0x900 [vhost_net]
>>>> 001: handle_rx_net+0x15/0x20 [vhost_net]
>>>> 001: vhost_worker+0xbe/0x120 [vhost]
>>>> 001: kthread+0x106/0x140
>>>> 001: ? log_used.part.0+0x20/0x20 [vhost]
>>>> 001: ? kthread_park+0x90/0x90
>>>> 001: ret_from_fork+0x35/0x40
>>>> 001: ---[ end trace 0000000000000003 ]---
>>>>
>>>> This patch moves the percpu counter into eventfd control structure and
>>>> does the clean-ups, so that eventfd can still be protected from deadlock
>>>> while allowing different ones to work in parallel.
>>>>
>>>> As to potential stack overflow, we might want to figure out a better
>>>> solution in the future to warn when the stack is about to overflow so it
>>>> can be better utilized, rather than break the working flow when just the
>>>> second one comes.
>>> This doesn't work for the infinite recursion case, the state has to be
>>> global, or per thread.
>> Thanks, but I'm not very clear about why the counter has to be global
>> or per thread.
>>
>> If the recursion happens on the same eventfd, the attempt to re-grab
>> the same ctx->wqh.lock would be blocked by the fd-specific counter in
>> this patch.
>>
>> If the recursion happens with a chain of different eventfds, that
>> might lead to a stack overflow issue. The issue should be handled but
>> it seems unnecessary to stop the just the second ring(when the counter
>> is going to be 2) of the chain.
>>
>> Specifically in the vhost case, it runs very likely with heavy network
>> load which generates loads of eventfd_signal. Delaying the
>> eventfd_signal to worker threads will still end up violating the
>> global counter later and failing as above.
>>
>> So we might want to take care of the potential overflow later,
>> hopefully with a measurement that can tell us if it's about to
>> overflow.
> The worry is different eventfds, recursion on a single one could be
> detected by keeping state in the ctx itself. And yeah, I agree that one
> level isn't very deep, but wakeup chains can be deep and we can't allow
> a whole lot more. I'm sure folks would be open to increasing it, if some
> worst case kind of data was collected to prove it's fine to go deeper.

OK, thanks. v2 will be sent.

Zhe

>