Currently, ep_poll_safewake() in the CONFIG_DEBUG_LOCK_ALLOC case uses
ep_call_nested() in order to pass the correct subclass argument to
spin_lock_irqsave_nested(). However, ep_call_nested() adds unnecessary
checks for epoll depth and loops that are already verified when doing
EPOLL_CTL_ADD. This mirrors a conversion that was done for
!CONFIG_DEBUG_LOCK_ALLOC in commit 37b5e5212a44 ("epoll: remove
ep_call_nested() from ep_eventpoll_poll()").
Signed-off-by: Jason Baron <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Roman Penyaev <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Eric Wong <[email protected]>
Cc: Andrew Morton <[email protected]>
---
fs/eventpoll.c | 36 +++++++++++++-----------------------
1 file changed, 13 insertions(+), 23 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d7f1f50..a9b2737 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -551,28 +551,23 @@ static int ep_call_nested(struct nested_calls *ncalls,
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct nested_calls poll_safewake_ncalls;
-
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
- unsigned long flags;
- wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
-
- spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
- wake_up_locked_poll(wqueue, EPOLLIN);
- spin_unlock_irqrestore(&wqueue->lock, flags);
-
- return 0;
-}
+static DEFINE_PER_CPU(int, wakeup_nest);
static void ep_poll_safewake(wait_queue_head_t *wq)
{
- int this_cpu = get_cpu();
-
- ep_call_nested(&poll_safewake_ncalls,
- ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+ unsigned long flags;
+ int subclass;
- put_cpu();
+ local_irq_save(flags);
+ preempt_disable();
+ subclass = __this_cpu_read(wakeup_nest);
+ spin_lock_nested(&wq->lock, subclass + 1);
+ __this_cpu_inc(wakeup_nest);
+ wake_up_locked_poll(wq, POLLIN);
+ __this_cpu_dec(wakeup_nest);
+ spin_unlock(&wq->lock);
+ local_irq_restore(flags);
+ preempt_enable();
}
#else
@@ -2370,11 +2365,6 @@ static int __init eventpoll_init(void)
*/
ep_nested_calls_init(&poll_loop_ncalls);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /* Initialize the structure used to perform safe poll wait head wake ups */
- ep_nested_calls_init(&poll_safewake_ncalls);
-#endif
-
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
--
2.7.4
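For context, the nesting that ep_poll_safewake() has to annotate only arises
when one epoll file is registered inside another, which is also where the
EPOLL_CTL_ADD loop and depth checks mentioned above run. A minimal userspace
sketch of that setup (illustrative only; fd names are hypothetical and error
handling is omitted):

    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
            int inner = epoll_create1(0);   /* depth 0: the wakeup source */
            int outer = epoll_create1(0);   /* woken via ep_poll_safewake() */
            struct epoll_event ev = { .events = EPOLLIN };

            ev.data.fd = inner;
            /* EPOLL_CTL_ADD is where the loop/depth verification already happens */
            epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

            /* readiness on 'inner' now propagates to 'outer', taking outer's
             * wait-queue lock nested inside inner's wakeup path */

            close(inner);
            close(outer);
            return 0;
    }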
On 9/4/19 4:22 PM, Jason Baron wrote:
> [...]
Hi,
Any thoughts on this one?
Thanks,
-Jason
On 2019-09-23 17:43, Jason Baron wrote:
> On 9/4/19 4:22 PM, Jason Baron wrote:
>> [...]
>> +static DEFINE_PER_CPU(int, wakeup_nest);
>>
>> static void ep_poll_safewake(wait_queue_head_t *wq)
>> {
>> - int this_cpu = get_cpu();
>> -
>> - ep_call_nested(&poll_safewake_ncalls,
>> - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
>> + unsigned long flags;
>> + int subclass;
>>
>> - put_cpu();
>> + local_irq_save(flags);
>> + preempt_disable();
>> + subclass = __this_cpu_read(wakeup_nest);
>> + spin_lock_nested(&wq->lock, subclass + 1);
>> + __this_cpu_inc(wakeup_nest);
>> + wake_up_locked_poll(wq, POLLIN);
>> + __this_cpu_dec(wakeup_nest);
>> + spin_unlock(&wq->lock);
>> + local_irq_restore(flags);
>> + preempt_enable();
>> }
What if we reduce the number of lines with something like the following:
int this_cpu = get_cpu();
subclass = __this_cpu_inc_return(wakeup_nest);
spin_lock_irqsave_nested(&wq->lock, flags, subclass);
wake_up_locked_poll(wq, POLLIN);
spin_unlock_irqrestore(&wq->lock, flags);
__this_cpu_dec(wakeup_nest);
put_cpu();
Other than that looks good to me.
Reviewed-by: Roman Penyaev <[email protected]>
--
Roman
On 9/23/19 3:23 PM, Roman Penyaev wrote:
> On 2019-09-23 17:43, Jason Baron wrote:
>> On 9/4/19 4:22 PM, Jason Baron wrote:
>>> [...]
>>> +static DEFINE_PER_CPU(int, wakeup_nest);
>>>
>>> static void ep_poll_safewake(wait_queue_head_t *wq)
>>> {
>>> - int this_cpu = get_cpu();
>>> -
>>> - ep_call_nested(&poll_safewake_ncalls,
>>> - ep_poll_wakeup_proc, NULL, wq, (void *) (long)
>>> this_cpu);
>>> + unsigned long flags;
>>> + int subclass;
>>>
>>> - put_cpu();
>>> + local_irq_save(flags);
>>> + preempt_disable();
>>> + subclass = __this_cpu_read(wakeup_nest);
>>> + spin_lock_nested(&wq->lock, subclass + 1);
>>> + __this_cpu_inc(wakeup_nest);
>>> + wake_up_locked_poll(wq, POLLIN);
>>> + __this_cpu_dec(wakeup_nest);
>>> + spin_unlock(&wq->lock);
>>> + local_irq_restore(flags);
>>> + preempt_enable();
>>> }
>
> What if we reduce the number of lines with something like the following:
>
> int this_cpu = get_cpu();
> subclass = __this_cpu_inc_return(wakeup_nest);
> spin_lock_irqsave_nested(&wq->lock, flags, subclass);
> wake_up_locked_poll(wq, POLLIN);
> spin_unlock_irqrestore(&wq->lock, flags);
> __this_cpu_dec(wakeup_nest);
> put_cpu();
>
> Other than that looks good to me.
>
> Reviewed-by: Roman Penyaev <[email protected]>
>
> --
> Roman
Hi,
I put the local_irq_save(flags) call first so that there wouldn't
be any nesting. For example, in your sequence, an irq could arrive
after the __this_cpu_inc_return() and end up back here.
I can still use __this_cpu_inc_return() to simplify things a bit.
Thanks,
-Jason
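To illustrate the interleaving Jason is describing, here is Roman's proposed
ordering annotated with where the irq could land (a rough sketch of the
sequence in question, not kernel source):

    subclass = __this_cpu_inc_return(wakeup_nest);  /* nest count is now 1 */
    /*
     * <-- an irq here can trigger another wakeup that re-enters
     *     ep_poll_safewake() on this CPU: it reads the already-bumped
     *     count and picks a higher subclass even though the outer call
     *     has not taken its lock yet, so subclass numbers can skip.
     */
    spin_lock_irqsave_nested(&wq->lock, flags, subclass);
    wake_up_locked_poll(wq, POLLIN);
    spin_unlock_irqrestore(&wq->lock, flags);
    __this_cpu_dec(wakeup_nest);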
On 2019-09-24 19:34, Jason Baron wrote:
> On 9/23/19 3:23 PM, Roman Penyaev wrote:
>> On 2019-09-23 17:43, Jason Baron wrote:
>>> On 9/4/19 4:22 PM, Jason Baron wrote:
>>>> [...]
>>
>> What if we reduce the number of lines with something like the following:
>>
>> int this_cpu = get_cpu();
>> subclass = __this_cpu_inc_return(wakeup_nest);
>> spin_lock_irqsave_nested(&wq->lock, flags, subclass);
>> wake_up_locked_poll(wq, POLLIN);
>> spin_unlock_irqrestore(&wq->lock, flags);
>> __this_cpu_dec(wakeup_nest);
>> put_cpu();
>>
>> Other than that looks good to me.
>>
>> Reviewed-by: Roman Penyaev <[email protected]>
>>
>> --
>> Roman
>
>
> Hi,
>
> I put the local_irq_save(flags) call first so that there wouldn't
> be any nesting. For example, in your sequence, an irq could arrive
> after the __this_cpu_inc_return() and end up back here.
That is correct, but it seems this is the original behavior of
ep_call_nested(), where an irq can happen just after
spin_unlock_irqrestore():
spin_unlock_irqrestore(&ncalls->lock, flags);
>>>> irq here <<<<<
/* Call the nested function */
error = (*nproc)(priv, cookie, call_nests);
so eventually you end up with a spin_lock_irqsave_nested() call where
call_nests is not monotonically increasing (not sequential) but has a
gap (depending on the nesting).
In short, I thought that your "local_irq_save + increment" sequence was
excessive.
--
Roman
Some comments on:
f6520c52084 (epoll: simplify ep_poll_safewake() for CONFIG_DEBUG_LOCK_ALLOC)
For one, this does not play nicely with PREEMPT_RT, as disabling irqs and
then taking a spinlock_t is a no-no; the critical region becomes
preemptible. This is particularly important as -rt is being mainlined.
Secondly, it is really ugly compared to what we had before - granted, it no
longer has to deal with all the ep_call_nested() checks, but I doubt that
overhead matters at all with CONFIG_DEBUG_LOCK_ALLOC.
While the current logic avoids nesting by disabling irqs for the whole
path, this seems like overkill under debug. This patch proposes using
this_cpu_inc_return() and then taking the irq-safe lock - albeit with the
possibility of an irq coming in between these operations and messing up the
subclass. If that is unacceptable, we could always revert the patch,
as this was never a problem in the first place.
Signed-off-by: Davidlohr Bueso <[email protected]>
---
This is pretty much untested - I wanted to see what is the best way to
address the concerns first.
XXX: I don't think we need get/put_cpu() around the call, this was intended
only for ep_call_nested() tasks_call_list checking afaict.
fs/eventpoll.c | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a395039268..97d036deff3e 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -558,16 +558,17 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
unsigned long flags;
int subclass;
- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
+ subclass = this_cpu_inc_return(wakeup_nest);
+ /*
+ * Careful: while unlikely, an irq can occur here and mess up
+ * the subclass. The same goes for right before the dec
+ * operation, but that does not matter.
+ */
+ spin_lock_irqsave_nested(&wq->lock, flags, subclass + 1);
wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ spin_unlock_irqrestore(&wq->lock, flags);
+
+ this_cpu_dec(wakeup_nest);
}
#else
--
2.16.4
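As a rough sketch of the -rt constraint referenced in the changelog above
(based on PREEMPT_RT locking semantics as generally documented; not a
literal kernel excerpt):

    local_irq_save(flags);      /* hard irqs off on this CPU */
    spin_lock(&wq->lock);       /* on PREEMPT_RT a spinlock_t is a sleeping
                                 * rt_mutex-based lock, so this may
                                 * schedule() with irqs disabled: invalid */
    ...
    /*
     * Whereas spin_lock_irqsave(&wq->lock, flags) leaves it to the lock
     * implementation to decide what "irqsave" means for a sleeping lock,
     * so that pattern stays valid on both !PREEMPT_RT and PREEMPT_RT.
     */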
Hi,
Thanks for looking at this.
On 1/6/20 2:38 PM, Davidlohr Bueso wrote:
> Some comments on:
>
> f6520c52084 (epoll: simplify ep_poll_safewake() for CONFIG_DEBUG_LOCK_ALLOC)
>
> For one this does not play nice with preempt_rt as disabling irq and
> then taking a spinlock_t is a no no; the critical region becomes
> preemptible. This is particularly important as -rt is being mainlined.
>
hmmm, but before the spinlock is taken there is a preempt_disable() call.
> Secondly, it is really ugly compared to what we had before - albeit not
> having to deal with all the ep_call_nested() checks, but I doubt this
> overhead matters at all with CONFIG_DEBUG_LOCK_ALLOC.
>
Yes, the main point of the patch is to continue to remove dependencies
on ep_call_nested(), and then eventually to remove it completely.
> While the current logic avoids nesting by disabling irq during the whole
> path, this seems like an overkill under debug. This patch proposes using
> this_cpu_inc_return() then taking the irq-safe lock - albeit a possible
> irq coming in the middle between these operations and messing up the
> subclass. If this is unacceptable, we could always revert the patch,
> as this was never a problem in the first place.
I personally don't want to introduce false positives. But I'm not quite
sure on that point - I think the subclass will still always increase on
nested calls, it just may skip some numbers. I'm not sure offhand whether
that messes up lockdep; perhaps not?
Thanks,
-Jason
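For reference, the lockdep subclass being discussed is only an annotation
that distinguishes nesting levels within one lock class; a minimal sketch of
the kind of nested acquisition it exists to permit (hypothetical helper, not
a function from fs/eventpoll.c):

    #include <linux/spinlock.h>
    #include <linux/wait.h>

    static void chained_wakeup(wait_queue_head_t *inner, wait_queue_head_t *outer)
    {
            unsigned long flags;

            spin_lock_irqsave(&inner->lock, flags);
            /*
             * Same lock class, different object: without the _nested()
             * annotation lockdep would report possible recursive locking.
             */
            spin_lock_nested(&outer->lock, SINGLE_DEPTH_NESTING);
            /* ... propagate the event to the outer waiters ... */
            spin_unlock(&outer->lock);
            spin_unlock_irqrestore(&inner->lock, flags);
    }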
On Mon, 06 Jan 2020, Jason Baron wrote:
>> For one this does not play nice with preempt_rt as disabling irq and
>> then taking a spinlock_t is a no no; the critical region becomes
>> preemptible. This is particularly important as -rt is being mainlined.
>>
>
>hmmm, but before the spinlock is taken there is a preempt_disable() call.
Yes, this is illegal in -rt as well.
>
>> Secondly, it is really ugly compared to what we had before - albeit not
>> having to deal with all the ep_call_nested() checks, but I doubt this
>> overhead matters at all with CONFIG_DEBUG_LOCK_ALLOC.
>>
>
>Yes, the main point of the patch is to continue to remove dependencies
>on ep_call_nested(), and then eventually to remove it completely.
I've also thought about this.
>> While the current logic avoids nesting by disabling irq during the whole
>> path, this seems like an overkill under debug. This patch proposes using
>> this_cpu_inc_return() then taking the irq-safe lock - albeit a possible
>> irq coming in the middle between these operations and messing up the
>> subclass. If this is unacceptable, we could always revert the patch,
>> as this was never a problem in the first place.
>
>I personally don't want to introduce false positives. But I'm not quite
>sure on that point - I think the subclass will still always increase on
>nested calls, it just may skip some numbers. I'm not sure offhand whether
>that messes up lockdep; perhaps not?
Yes, I agree that this will only cause numbers to be skipped, but as mentioned
it's not very well tested. I'll go see what comes out with more testing; of
course it means very little unless I can actually reproduce spurious irqs.
Maybe I can also hack something up that bumps the subclass intentionally and
see what happens.
Thanks,
Davidlohr
Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set,
ep_poll_safewake() can take several non-raw spinlocks after disabling
preemption, which is a no-no for the -rt kernel. So let's re-work how we
determine the nesting level such that it plays nicely with the -rt kernel.
Let's introduce a 'nests' field in struct eventpoll that records the
current nesting level during ep_poll_callback(). Then, if we nest again we
can find the previous struct eventpoll that we were called from and
increase our count by 1. The 'nests' field is protected by
ep->poll_wait.lock.
I've also moved the napi_id field into a hole in struct eventpoll as part of
introducing the nests field. This change reduces struct eventpoll from
184 bytes to 176 bytes on x86_64 for the !CONFIG_DEBUG_LOCK_ALLOC
production config.
Reported-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Jason Baron <[email protected]>
---
fs/eventpoll.c | 61 ++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 21 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a39503..8ee356c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -219,12 +219,18 @@ struct eventpoll {
/* used to optimize loop detection check */
int visited;
- struct list_head visited_list_link;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
+
+ struct list_head visited_list_link;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* used to track wakeup nests for lockdep validation */
+ u8 nests;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -551,30 +557,43 @@ static int ep_call_nested(struct nested_calls *ncalls,
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
+ struct eventpoll *ep_src;
unsigned long flags;
- int subclass;
+ u8 nests = 0;
- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
- wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ /*
+ * If we are not being call from ep_poll_callback(), epi is
+ * NULL and we are at the first level of nesting, 0. Otherwise,
+ * we are being called from ep_poll_callback() and if a previous
+ * wakeup source is not an epoll file itself, we are at depth
+ * 1 since the wakeup source is depth 0. If the wakeup source
+ * is a previous epoll file in the wakeup chain then we use its
+ * nests value and record ours as nests + 1. The previous epoll
+ * file nests value is stable since its already holding its
+ * own poll_wait.lock.
+ */
+ if (epi) {
+ if ((is_file_epoll(epi->ffd.file))) {
+ ep_src = epi->ffd.file->private_data;
+ nests = ep_src->nests;
+ } else {
+ nests = 1;
+ }
+ }
+ spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+ ep->nests = nests + 1;
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ ep->nests = 0;
+ spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
- wake_up_poll(wq, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN);
}
#endif
@@ -795,7 +814,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
/*
* We need to lock this because we could be hit by
@@ -1264,7 +1283,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, epi);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1568,7 +1587,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
@@ -1672,7 +1691,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
}
--
2.7.4
On Fri, 17 Jan 2020 14:16:47 -0500 Jason Baron <[email protected]> wrote:
> Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set
> ep_poll_safewake() can take several non-raw spinlocks after disabling
> preemption, which is a no-no for the -rt kernel. So let's re-work how we
> determine the nesting level such that it plays nicely with the -rt kernel.
"no no" isn't terribly informative, and knowledge of -rt's requirements
isn't widespread. Can we please spell this requirement out in full
detail, if only to spread the -rt education a bit?
> Let's introduce a 'nests' field in struct eventpoll that records the
> current nesting level during ep_poll_callback(). Then, if we nest again we
> can find the previous struct eventpoll that we were called from and
> increase our count by 1. The 'nests' field is protected by
> ep->poll_wait.lock.
>
> I've also moved napi_id field into a hole in struct eventpoll as part of
> introduing the nests field. This change reduces the struct eventpoll from
> 184 bytes to 176 bytes on x86_64 for the !CONFIG_DEBUG_LOCK_ALLOC
> production config.
>
> ...
>
> @@ -551,30 +557,43 @@ static int ep_call_nested(struct nested_calls *ncalls,
> */
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
>
> -static DEFINE_PER_CPU(int, wakeup_nest);
> -
> -static void ep_poll_safewake(wait_queue_head_t *wq)
> +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
> {
> + struct eventpoll *ep_src;
> unsigned long flags;
> - int subclass;
> + u8 nests = 0;
>
> - local_irq_save(flags);
> - preempt_disable();
> - subclass = __this_cpu_read(wakeup_nest);
> - spin_lock_nested(&wq->lock, subclass + 1);
> - __this_cpu_inc(wakeup_nest);
> - wake_up_locked_poll(wq, POLLIN);
> - __this_cpu_dec(wakeup_nest);
> - spin_unlock(&wq->lock);
> - local_irq_restore(flags);
> - preempt_enable();
> + /*
> + * If we are not being call from ep_poll_callback(), epi is
> + * NULL and we are at the first level of nesting, 0. Otherwise,
> + * we are being called from ep_poll_callback() and if a previous
> + * wakeup source is not an epoll file itself, we are at depth
> + * 1 since the wakeup source is depth 0. If the wakeup source
> + * is a previous epoll file in the wakeup chain then we use its
> + * nests value and record ours as nests + 1. The previous epoll
> + * file nests value is stable since its already holding its
> + * own poll_wait.lock.
> + */
Similarly, it would be helpful if this comment were to explain that
this code exists for -rt's requirements, and to briefly describe what
that requirement is.
> + if (epi) {
> + if ((is_file_epoll(epi->ffd.file))) {
> + ep_src = epi->ffd.file->private_data;
> + nests = ep_src->nests;
> + } else {
> + nests = 1;
> + }
> + }
> + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
> + ep->nests = nests + 1;
> + wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
> + ep->nests = 0;
> + spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
> }
Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set
ep_poll_safewake() can take several non-raw spinlocks after disabling
interrupts. Since a spinlock can block in the -rt kernel, we can't take a
spinlock after disabling interrupts. So let's re-work how we determine
the nesting level such that it plays nicely with the -rt kernel.
Let's introduce a 'nests' field in struct eventpoll that records the
current nesting level during ep_poll_callback(). Then, if we nest again we
can find the previous struct eventpoll that we were called from and
increase our count by 1. The 'nests' field is protected by
ep->poll_wait.lock.
I've also moved the visited field to reduce the size of struct eventpoll
from 184 bytes to 176 bytes on x86_64 for !CONFIG_DEBUG_LOCK_ALLOC,
which is typical for a production config.
Reported-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Jason Baron <[email protected]>
---
v2:
-improve (hopefully:)) comments and explanations around -rt requirements
(Andrew Morton)
fs/eventpoll.c | 64 +++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 43 insertions(+), 21 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 67a39503..81ef47c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- int visited;
struct list_head visited_list_link;
+ int visited;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /* tracks wakeup nests for lockdep validation */
+ u8 nests;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -551,30 +556,47 @@ static int ep_call_nested(struct nested_calls *ncalls,
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
+ struct eventpoll *ep_src;
unsigned long flags;
- int subclass;
+ u8 nests = 0;
- local_irq_save(flags);
- preempt_disable();
- subclass = __this_cpu_read(wakeup_nest);
- spin_lock_nested(&wq->lock, subclass + 1);
- __this_cpu_inc(wakeup_nest);
- wake_up_locked_poll(wq, POLLIN);
- __this_cpu_dec(wakeup_nest);
- spin_unlock(&wq->lock);
- local_irq_restore(flags);
- preempt_enable();
+ /*
+ * To set the subclass or nesting level for spin_lock_irqsave_nested()
+ * it might be natural to create a per-cpu nest count. However, since
+ * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+ * schedule() in the -rt kernel, the per-cpu variable is no longer
+ * protected. Thus, we are introducing a per eventpoll nest field.
+ * If we are not being called from ep_poll_callback(), epi is NULL and
+ * we are at the first level of nesting, 0. Otherwise, we are being
+ * called from ep_poll_callback() and if a previous wakeup source is
+ * not an epoll file itself, we are at depth 1 since the wakeup source
+ * is depth 0. If the wakeup source is a previous epoll file in the
+ * wakeup chain then we use its nests value and record ours as
+ * nests + 1. The previous epoll file nests value is stable since it's
+ * already holding its own poll_wait.lock.
+ */
+ if (epi) {
+ if ((is_file_epoll(epi->ffd.file))) {
+ ep_src = epi->ffd.file->private_data;
+ nests = ep_src->nests;
+ } else {
+ nests = 1;
+ }
+ }
+ spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+ ep->nests = nests + 1;
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ ep->nests = 0;
+ spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
- wake_up_poll(wq, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN);
}
#endif
@@ -795,7 +817,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
/*
* We need to lock this because we could be hit by
@@ -1264,7 +1286,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, epi);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1568,7 +1590,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
@@ -1672,7 +1694,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(&ep->poll_wait);
+ ep_poll_safewake(ep, NULL);
return 0;
}
--
2.7.4
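To make the nests bookkeeping above concrete, here is a rough walkthrough of
the subclass values for a three-deep chain of epoll files (hypothetical names
ep0 -> ep1 -> ep2, where ep2 watches a plain fd; this just traces the code in
the patch, it is not additional kernel source):

    /*
     * The plain fd becomes ready and its wakeup runs ep_poll_callback()
     * for its epitem in ep2:
     *   ep_poll_safewake(ep2, epi):  epi->ffd.file is not an epoll file
     *     -> nests = 1, lock ep2->poll_wait.lock with subclass 1,
     *        ep2->nests = 2
     * Waking ep2->poll_wait runs ep_poll_callback() for ep2's epitem in ep1:
     *   ep_poll_safewake(ep1, epi):  epi->ffd.file is ep2 (an epoll file)
     *     -> nests = ep2->nests = 2, lock ep1->poll_wait.lock with
     *        subclass 2, ep1->nests = 3
     * and likewise ep0 is then woken with subclass 3. Each nests value is
     * read while the previous level still holds its own poll_wait.lock,
     * which is what keeps it stable.
     */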
On Wed, 26 Feb 2020, Jason Baron wrote:
>Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set
>ep_poll_safewake() can take several non-raw spinlocks after disabling
>interrupts. Since a spinlock can block in the -rt kernel, we can't take a
>spinlock after disabling interrupts. So let's re-work how we determine
>the nesting level such that it plays nicely with the -rt kernel.
>
>Let's introduce a 'nests' field in struct eventpoll that records the
>current nesting level during ep_poll_callback(). Then, if we nest again we
>can find the previous struct eventpoll that we were called from and
>increase our count by 1. The 'nests' field is protected by
>ep->poll_wait.lock.
>
>I've also moved the visited field to reduce the size of struct eventpoll
>from 184 bytes to 176 bytes on x86_64 for !CONFIG_DEBUG_LOCK_ALLOC,
>which is typical for a production config.
>
>Reported-by: Davidlohr Bueso <[email protected]>
>Signed-off-by: Jason Baron <[email protected]>
Sorry for the tardy reply. This looks good to me:
Reviewed-by: Davidlohr Bueso <[email protected]>