The following BUG has been reported (slightly edited):
BUG: using smp_processor_id() in preemptible [00000000] code: handler106/3082
caller is flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
CPU: 46 PID: 3082 Comm: handler106 Not tainted ... #1
Hardware name: Dell Inc. PowerEdge R640/06DKY5, BIOS 2.5.4 01/13/2020
Call Trace:
dump_stack+0x5c/0x80
check_preemption_disabled+0xc4/0xd0
flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
ovs_flow_tbl_lookup+0x3b/0x60 [openvswitch]
ovs_flow_cmd_new+0x2d8/0x430 [openvswitch]
? __switch_to_asm+0x35/0x70
? __switch_to_asm+0x41/0x70
? __switch_to_asm+0x35/0x70
genl_family_rcv_msg+0x1d7/0x410
? migrate_enable+0x123/0x3a0
genl_rcv_msg+0x47/0x8c
? __kmalloc_node_track_caller+0xff/0x2e0
? genl_family_rcv_msg+0x410/0x410
netlink_rcv_skb+0x4c/0x120
genl_rcv+0x24/0x40
netlink_unicast+0x197/0x230
netlink_sendmsg+0x204/0x3d0
sock_sendmsg+0x4c/0x50
___sys_sendmsg+0x29f/0x300
? migrate_enable+0x123/0x3a0
? ep_send_events_proc+0x8a/0x1f0
? ep_scan_ready_list.constprop.23+0x237/0x260
? rt_spin_unlock+0x23/0x40
? ep_poll+0x1b3/0x390
? __fget+0x72/0xa0
__sys_sendmsg+0x57/0xa0
do_syscall_64+0x87/0x1a0
entry_SYSCALL_64_after_hwframe+0x65/0xca
RIP: 0033:0x7f1ed72ccb07
Code: ...
RSP: 002b:00007f1ecbd9ba80 EFLAGS: 00003293 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 000000000000007b RCX: 00007f1ed72ccb07
RDX: 0000000000000000 RSI: 00007f1ecbd9bb10 RDI: 000000000000007b
RBP: 00007f1ecbd9bb10 R08: 0000000000000000 R09: 00007f1ecbd9d390
R10: 0000000019616156 R11: 0000000000003293 R12: 0000000000000000
R13: 00007f1ecbd9d338 R14: 00007f1ecbd9bfb0 R15: 00007f1ecbd9bb10
This happens because openvswitch/flow_table::flow_lookup() accesses
per-cpu data while being preemptible (and migratable).
Fix it by adding get/put_cpu_light(), so that, even if preempted, the
task executing this code is not migrated (operation is also guarded by
ovs_mutex mutex).
Signed-off-by: Juri Lelli <[email protected]>
---
net/openvswitch/flow_table.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..7df27ef7da09 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -732,11 +732,14 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
u32 *n_cache_hit,
u32 *index)
{
- u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+ u64 *usage_counters;
struct sw_flow *flow;
struct sw_flow_mask *mask;
int i;
+ get_cpu_light();
+ usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+
if (likely(*index < ma->max)) {
mask = rcu_dereference_ovsl(ma->masks[*index]);
if (mask) {
@@ -746,6 +749,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
usage_counters[*index]++;
u64_stats_update_end(&ma->syncp);
(*n_cache_hit)++;
+ put_cpu_light();
return flow;
}
}
@@ -766,10 +770,12 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
u64_stats_update_begin(&ma->syncp);
usage_counters[*index]++;
u64_stats_update_end(&ma->syncp);
+ put_cpu_light();
return flow;
}
}
+ put_cpu_light();
return NULL;
}
--
2.26.2
On 09/10/20 14:47, Juri Lelli wrote:
> The following BUG has been reported (slightly edited):
>
> BUG: using smp_processor_id() in preemptible [00000000] code: handler106/3082
> caller is flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
> CPU: 46 PID: 3082 Comm: handler106 Not tainted ... #1
> Hardware name: Dell Inc. PowerEdge R640/06DKY5, BIOS 2.5.4 01/13/2020
> Call Trace:
> dump_stack+0x5c/0x80
> check_preemption_disabled+0xc4/0xd0
> flow_lookup.isra.15+0x2c/0xf0 [openvswitch]
> ovs_flow_tbl_lookup+0x3b/0x60 [openvswitch]
> ovs_flow_cmd_new+0x2d8/0x430 [openvswitch]
> ? __switch_to_asm+0x35/0x70
> ? __switch_to_asm+0x41/0x70
> ? __switch_to_asm+0x35/0x70
> genl_family_rcv_msg+0x1d7/0x410
> ? migrate_enable+0x123/0x3a0
> genl_rcv_msg+0x47/0x8c
> ? __kmalloc_node_track_caller+0xff/0x2e0
> ? genl_family_rcv_msg+0x410/0x410
> netlink_rcv_skb+0x4c/0x120
> genl_rcv+0x24/0x40
> netlink_unicast+0x197/0x230
> netlink_sendmsg+0x204/0x3d0
> sock_sendmsg+0x4c/0x50
> ___sys_sendmsg+0x29f/0x300
> ? migrate_enable+0x123/0x3a0
> ? ep_send_events_proc+0x8a/0x1f0
> ? ep_scan_ready_list.constprop.23+0x237/0x260
> ? rt_spin_unlock+0x23/0x40
> ? ep_poll+0x1b3/0x390
> ? __fget+0x72/0xa0
> __sys_sendmsg+0x57/0xa0
> do_syscall_64+0x87/0x1a0
> entry_SYSCALL_64_after_hwframe+0x65/0xca
> RIP: 0033:0x7f1ed72ccb07
> Code: ...
> RSP: 002b:00007f1ecbd9ba80 EFLAGS: 00003293 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 000000000000007b RCX: 00007f1ed72ccb07
> RDX: 0000000000000000 RSI: 00007f1ecbd9bb10 RDI: 000000000000007b
> RBP: 00007f1ecbd9bb10 R08: 0000000000000000 R09: 00007f1ecbd9d390
> R10: 0000000019616156 R11: 0000000000003293 R12: 0000000000000000
> R13: 00007f1ecbd9d338 R14: 00007f1ecbd9bfb0 R15: 00007f1ecbd9bb10
>
> This happens because openvswitch/flow_table::flow_lookup() accesses
> per-cpu data while being preemptible (and migratable).
>
> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> task executing this code is not migrated (operation is also guarded by
> ovs_mutex mutex).
>
This actually wants also a (sorry for missing it in the first place)
Suggested-by: Daniel Bristot de Oliveira <[email protected]>
> Signed-off-by: Juri Lelli <[email protected]>
> ---
> net/openvswitch/flow_table.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
> index e2235849a57e..7df27ef7da09 100644
> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -732,11 +732,14 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
> u32 *n_cache_hit,
> u32 *index)
> {
> - u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> + u64 *usage_counters;
> struct sw_flow *flow;
> struct sw_flow_mask *mask;
> int i;
>
> + get_cpu_light();
> + usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> +
> if (likely(*index < ma->max)) {
> mask = rcu_dereference_ovsl(ma->masks[*index]);
> if (mask) {
> @@ -746,6 +749,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
> usage_counters[*index]++;
> u64_stats_update_end(&ma->syncp);
> (*n_cache_hit)++;
> + put_cpu_light();
> return flow;
> }
> }
> @@ -766,10 +770,12 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
> u64_stats_update_begin(&ma->syncp);
> usage_counters[*index]++;
> u64_stats_update_end(&ma->syncp);
> + put_cpu_light();
> return flow;
> }
> }
>
> + put_cpu_light();
> return NULL;
> }
>
> --
> 2.26.2
>
On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
> This happens because openvswitch/flow_table::flow_lookup() accesses
> per-cpu data while being preemptible (and migratable).
>
> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> task executing this code is not migrated (operation is also guarded by
> ovs_mutex mutex).
This warning is not limited to PREEMPT_RT it also present upstream since
commit
eac87c413bf97 ("net: openvswitch: reorder masks array based on usage")
You should be able to reproduce it there, too.
The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock()
I can't say that this true for
ovs_vport_receive() -> ovs_dp_process_packet() ->
ovs_flow_tbl_lookup_stats() -> flow_lookup()
(means I don't know but it looks like coming from NAPI).
Which means u64_stats_update_begin() could have two writers. This must
not happen.
There are two reader which do u64_stats_fetch_begin_irq(). Disabling
interrupts makes no sense since they perform cross-CPU access.
-> You need to ensure that there is only one writer at a time.
If mask_array gains a spinlock_t for writer protection then you can
acquire the lock prio grabbing ->masks_usage_cntr. But as of now there
is one `ma->syncp'.
Sebastian
On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:
> On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
>> This happens because openvswitch/flow_table::flow_lookup() accesses
>> per-cpu data while being preemptible (and migratable).
>>
>> Fix it by adding get/put_cpu_light(), so that, even if preempted, the
>> task executing this code is not migrated (operation is also guarded
>> by
>> ovs_mutex mutex).
>
> This warning is not limited to PREEMPT_RT it also present upstream
> since
> commit
> eac87c413bf97 ("net: openvswitch: reorder masks array based on
> usage")
>
> You should be able to reproduce it there, too.
> The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by
> ovs_lock()
> I can't say that this true for
> ovs_vport_receive() -> ovs_dp_process_packet() ->
> ovs_flow_tbl_lookup_stats() -> flow_lookup()
>
> (means I don't know but it looks like coming from NAPI).
>
> Which means u64_stats_update_begin() could have two writers. This must
> not happen.
> There are two reader which do u64_stats_fetch_begin_irq(). Disabling
> interrupts makes no sense since they perform cross-CPU access.
>
> -> You need to ensure that there is only one writer at a time.
>
> If mask_array gains a spinlock_t for writer protection then you can
> acquire the lock prio grabbing ->masks_usage_cntr. But as of now there
> is one `ma->syncp'.
I’m not too familiar with the RT kernel, but in the none RT kernel,
this function is called in run to completion parts only, hence does not
need a lock. Actually, this was designed in such a way that it does not
need a lock at all.
So maybe this needs a get_cpu() instead of the light variant in the RT
case?
//Eelco
On 2020-10-12 10:14:42 [+0200], Eelco Chaudron wrote:
>
>
> On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:
>
> > On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
> > > This happens because openvswitch/flow_table::flow_lookup() accesses
> > > per-cpu data while being preemptible (and migratable).
> > >
> > > Fix it by adding get/put_cpu_light(), so that, even if preempted, the
> > > task executing this code is not migrated (operation is also guarded
> > > by
> > > ovs_mutex mutex).
> >
> > This warning is not limited to PREEMPT_RT it also present upstream since
> > commit
> > eac87c413bf97 ("net: openvswitch: reorder masks array based on
> > usage")
> >
> > You should be able to reproduce it there, too.
> > The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by ovs_lock()
> > I can't say that this true for
> > ovs_vport_receive() -> ovs_dp_process_packet() ->
> > ovs_flow_tbl_lookup_stats() -> flow_lookup()
> >
> > (means I don't know but it looks like coming from NAPI).
> >
> > Which means u64_stats_update_begin() could have two writers. This must
> > not happen.
> > There are two reader which do u64_stats_fetch_begin_irq(). Disabling
> > interrupts makes no sense since they perform cross-CPU access.
> >
> > -> You need to ensure that there is only one writer at a time.
> >
> > If mask_array gains a spinlock_t for writer protection then you can
> > acquire the lock prio grabbing ->masks_usage_cntr. But as of now there
> > is one `ma->syncp'.
>
> I’m not too familiar with the RT kernel, but in the none RT kernel, this
> function is called in run to completion parts only, hence does not need a
> lock. Actually, this was designed in such a way that it does not need a lock
> at all.
_no_ As explained above, this is not RT specific.
What guaranties that you don't have two flow_lookup() invocations on the
same CPU? You are using u64_stats_update_begin() which must not be
preempted. This means even if preemption is disabled you must not have
another invocation in BH context. This is due to the
write_seqcount_begin() in u64_stats_update_begin().
If preemption / CPU migration is not a problem in the above part, you
can use annotation to disable the warning that led to the warning. But
the u64_stats invocation looks still problematic.
> So maybe this needs a get_cpu() instead of the light variant in the RT case?
> //Eelco
Sebastian
On 12 Oct 2020, at 10:21, Sebastian Andrzej Siewior wrote:
> On 2020-10-12 10:14:42 [+0200], Eelco Chaudron wrote:
>>
>>
>> On 9 Oct 2020, at 17:41, Sebastian Andrzej Siewior wrote:
>>
>>> On 2020-10-09 14:47:59 [+0200], Juri Lelli wrote:
>>>> This happens because openvswitch/flow_table::flow_lookup() accesses
>>>> per-cpu data while being preemptible (and migratable).
>>>>
>>>> Fix it by adding get/put_cpu_light(), so that, even if preempted,
>>>> the
>>>> task executing this code is not migrated (operation is also guarded
>>>> by
>>>> ovs_mutex mutex).
>>>
>>> This warning is not limited to PREEMPT_RT it also present upstream
>>> since
>>> commit
>>> eac87c413bf97 ("net: openvswitch: reorder masks array based on
>>> usage")
>>>
>>> You should be able to reproduce it there, too.
>>> The path ovs_flow_tbl_lookup() -> flow_lookup() is guarded by
>>> ovs_lock()
>>> I can't say that this true for
>>> ovs_vport_receive() -> ovs_dp_process_packet() ->
>>> ovs_flow_tbl_lookup_stats() -> flow_lookup()
>>>
>>> (means I don't know but it looks like coming from NAPI).
>>>
>>> Which means u64_stats_update_begin() could have two writers. This
>>> must
>>> not happen.
>>> There are two reader which do u64_stats_fetch_begin_irq(). Disabling
>>> interrupts makes no sense since they perform cross-CPU access.
>>>
>>> -> You need to ensure that there is only one writer at a time.
>>>
>>> If mask_array gains a spinlock_t for writer protection then you can
>>> acquire the lock prio grabbing ->masks_usage_cntr. But as of now
>>> there
>>> is one `ma->syncp'.
>>
>> I’m not too familiar with the RT kernel, but in the none RT kernel,
>> this
>> function is called in run to completion parts only, hence does not
>> need a
>> lock. Actually, this was designed in such a way that it does not need
>> a lock
>> at all.
>
> _no_ As explained above, this is not RT specific.
> What guaranties that you don't have two flow_lookup() invocations on
> the
> same CPU? You are using u64_stats_update_begin() which must not be
> preempted. This means even if preemption is disabled you must not have
> another invocation in BH context. This is due to the
> write_seqcount_begin() in u64_stats_update_begin().
> If preemption / CPU migration is not a problem in the above part, you
> can use annotation to disable the warning that led to the warning. But
> the u64_stats invocation looks still problematic.
>
>> So maybe this needs a get_cpu() instead of the light variant in the
>> RT case?
Hi Sebastian,
I was not reading the splat correctly and thought it was from the NAPI
path but it looks like it's from the netlink part. I think this could be
fixed with the following patch, so both paths, NAPI, and netlink become
non-preemptive:
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -850,9 +850,14 @@ struct sw_flow *ovs_flow_tbl_lookup(struct
flow_table *tbl,
struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
u32 __always_unused n_mask_hit;
u32 __always_unused n_cache_hit;
+ struct sw_flow *flow;
u32 index = 0;
- return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit,
&index);
+ preempt_disable();
+ flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit,
&index);
+ preempt_enable();
+
+ return flow;
}
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
Note that choosing to do this in ovs_flow_tbl_lookup() and not
flow_lookup() allows the fast-path, through NAPI, not having to call
preempt_disable().
Any comments? If not I can sent a proper patch trough netdev.
//Eelco