2021-09-22 11:17:50

by Peter Zijlstra

Subject: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

Use rcu_user_{enter,exit}() calls to provide SMP ordering on context
tracking state stores:

  __context_tracking_exit()
    __this_cpu_write(context_tracking.state, CONTEXT_KERNEL)
    rcu_user_exit()
      rcu_eqs_exit()
        rcu_dynticks_eqs_exit()
          rcu_dynticks_inc()
            atomic_add_return() /* smp_mb */

  __context_tracking_enter()
    rcu_user_enter()
      rcu_eqs_enter()
        rcu_dynticks_eqs_enter()
          rcu_dynticks_inc()
            atomic_add_return() /* smp_mb */
    __this_cpu_write(context_tracking.state, state)

This separates the USER/KERNEL state stores with an smp_mb() on each
side; therefore a user of context_tracking_state_cpu() knows the CPU
must pass through an smp_mb() before the state can change.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
 include/linux/context_tracking_state.h |   12 ++++++++++++
 kernel/context_tracking.c              |    7 ++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -45,11 +45,23 @@ static __always_inline bool context_trac
 {
         return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
 }
+
+static __always_inline bool context_tracking_state_cpu(int cpu)
+{
+        struct context_tracking *ct = per_cpu_ptr(&context_tracking);
+
+        if (!context_tracking_enabled() || !ct->active)
+                return CONTEXT_DISABLED;
+
+        return ct->state;
+}
+
 #else
 static inline bool context_tracking_in_user(void) { return false; }
 static inline bool context_tracking_enabled(void) { return false; }
 static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
 static inline bool context_tracking_enabled_this_cpu(void) { return false; }
+static inline bool context_tracking_state_cpu(int cpu) { return CONTEXT_DISABLED; }
 #endif /* CONFIG_CONTEXT_TRACKING */

 #endif
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -82,7 +82,7 @@ void noinstr __context_tracking_enter(en
                                 vtime_user_enter(current);
                                 instrumentation_end();
                         }
-                        rcu_user_enter();
+                        rcu_user_enter(); /* smp_mb */
                 }
                 /*
                  * Even if context tracking is disabled on this CPU, because it's outside
@@ -149,12 +149,14 @@ void noinstr __context_tracking_exit(enu
                 return;

         if (__this_cpu_read(context_tracking.state) == state) {
+                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
+
                 if (__this_cpu_read(context_tracking.active)) {
                         /*
                          * We are going to run code that may use RCU. Inform
                          * RCU core about that (ie: we may need the tick again).
                          */
-                        rcu_user_exit();
+                        rcu_user_exit(); /* smp_mb */
                         if (state == CONTEXT_USER) {
                                 instrumentation_begin();
                                 vtime_user_exit(current);
@@ -162,7 +164,6 @@ void noinstr __context_tracking_exit(enu
                                 instrumentation_end();
                         }
                 }
-                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
         }
         context_tracking_recursion_exit();
 }



2021-09-22 15:21:48

by Paul E. McKenney

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 01:05:12PM +0200, Peter Zijlstra wrote:
> Use rcu_user_{enter,exit}() calls to provide SMP ordering on context
> tracking state stores:
>
>   __context_tracking_exit()
>     __this_cpu_write(context_tracking.state, CONTEXT_KERNEL)
>     rcu_user_exit()
>       rcu_eqs_exit()
>         rcu_dynticks_eqs_exit()
>           rcu_dynticks_inc()
>             atomic_add_return() /* smp_mb */
>
>   __context_tracking_enter()
>     rcu_user_enter()
>       rcu_eqs_enter()
>         rcu_dynticks_eqs_enter()
>           rcu_dynticks_inc()
>             atomic_add_return() /* smp_mb */
>     __this_cpu_write(context_tracking.state, state)
>
> This separates the USER/KERNEL state stores with an smp_mb() on each
> side; therefore a user of context_tracking_state_cpu() knows the CPU
> must pass through an smp_mb() before the state can change.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

For the transformation to negative errno return value and name change
from an RCU perspective:

Acked-by: Paul E. McKenney <[email protected]>

For the sampling of nohz_full userspace state:

Another approach is for the rcu_data structure's ->dynticks variable to
use the lower two bits to differentiate between idle, nohz_full userspace
and kernel. In theory, inlining should make this zero cost for idle
transition, and should allow you to safely sample nohz_full userspace
state with a load and a couple of memory barriers instead of an IPI.

To make this work nicely, the low-order bits have to be 00 for kernel,
and (say) 01 for idle and 10 for nohz_full userspace. 11 would be an
error.

The trick would be for rcu_user_enter() and rcu_user_exit() to atomically
increment ->dynticks by 2, for rcu_nmi_exit() to increment by 1 and
rcu_nmi_enter() to increment by 3. The state sampling would need to
change accordingly.
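
In sketch form, the increments would look something like this (using the
existing rdp->dynticks field; this is not actual code):

        /* ->dynticks low bits: 00 kernel, 01 idle, 10 user, 11 error */
        rcu_user_enter():  atomic_add_return(2, &rdp->dynticks);  /* ..00 -> ..10 */
        rcu_user_exit():   atomic_add_return(2, &rdp->dynticks);  /* ..10 -> ..00 */
        rcu_nmi_exit():    atomic_add_return(1, &rdp->dynticks);  /* ..00 -> ..01 */
        rcu_nmi_enter():   atomic_add_return(3, &rdp->dynticks);  /* ..01 -> ..00 */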

Does this make sense, or am I missing something?

Thanx, Paul

> ---
>  include/linux/context_tracking_state.h |   12 ++++++++++++
>  kernel/context_tracking.c              |    7 ++++---
>  2 files changed, 16 insertions(+), 3 deletions(-)
>
> --- a/include/linux/context_tracking_state.h
> +++ b/include/linux/context_tracking_state.h
> @@ -45,11 +45,23 @@ static __always_inline bool context_trac
>  {
>          return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
>  }
> +
> +static __always_inline bool context_tracking_state_cpu(int cpu)
> +{
> +        struct context_tracking *ct = per_cpu_ptr(&context_tracking);
> +
> +        if (!context_tracking_enabled() || !ct->active)
> +                return CONTEXT_DISABLED;
> +
> +        return ct->state;
> +}
> +
>  #else
>  static inline bool context_tracking_in_user(void) { return false; }
>  static inline bool context_tracking_enabled(void) { return false; }
>  static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
>  static inline bool context_tracking_enabled_this_cpu(void) { return false; }
> +static inline bool context_tracking_state_cpu(int cpu) { return CONTEXT_DISABLED; }
>  #endif /* CONFIG_CONTEXT_TRACKING */
>
>  #endif
> --- a/kernel/context_tracking.c
> +++ b/kernel/context_tracking.c
> @@ -82,7 +82,7 @@ void noinstr __context_tracking_enter(en
>                                  vtime_user_enter(current);
>                                  instrumentation_end();
>                          }
> -                        rcu_user_enter();
> +                        rcu_user_enter(); /* smp_mb */
>                  }
>                  /*
>                   * Even if context tracking is disabled on this CPU, because it's outside
> @@ -149,12 +149,14 @@ void noinstr __context_tracking_exit(enu
>                  return;
>
>          if (__this_cpu_read(context_tracking.state) == state) {
> +                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
> +
>                  if (__this_cpu_read(context_tracking.active)) {
>                          /*
>                           * We are going to run code that may use RCU. Inform
>                           * RCU core about that (ie: we may need the tick again).
>                           */
> -                        rcu_user_exit();
> +                        rcu_user_exit(); /* smp_mb */
>                          if (state == CONTEXT_USER) {
>                                  instrumentation_begin();
>                                  vtime_user_exit(current);
> @@ -162,7 +164,6 @@ void noinstr __context_tracking_exit(enu
>                                  instrumentation_end();
>                          }
>                  }
> -                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
>          }
>          context_tracking_recursion_exit();
>  }
>
>

2021-09-22 19:39:01

by Peter Zijlstra

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 08:17:21AM -0700, Paul E. McKenney wrote:
> On Wed, Sep 22, 2021 at 01:05:12PM +0200, Peter Zijlstra wrote:
> > Use rcu_user_{enter,exit}() calls to provide SMP ordering on context
> > tracking state stores:
> >
> >   __context_tracking_exit()
> >     __this_cpu_write(context_tracking.state, CONTEXT_KERNEL)
> >     rcu_user_exit()
> >       rcu_eqs_exit()
> >         rcu_dynticks_eqs_exit()
> >           rcu_dynticks_inc()
> >             atomic_add_return() /* smp_mb */
> >
> >   __context_tracking_enter()
> >     rcu_user_enter()
> >       rcu_eqs_enter()
> >         rcu_dynticks_eqs_enter()
> >           rcu_dynticks_inc()
> >             atomic_add_return() /* smp_mb */
> >     __this_cpu_write(context_tracking.state, state)
> >
> > This separates the USER/KERNEL state stores with an smp_mb() on each
> > side; therefore a user of context_tracking_state_cpu() knows the CPU
> > must pass through an smp_mb() before the state can change.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> For the transformation to negative errno return value and name change
> from an RCU perspective:
>
> Acked-by: Paul E. McKenney <[email protected]>

Thanks!

> For the sampling of nohz_full userspace state:
>
> Another approach is for the rcu_data structure's ->dynticks variable to
> use the lower two bits to differentiate between idle, nohz_full userspace
> and kernel. In theory, inlining should make this zero cost for idle
> transition, and should allow you to safely sample nohz_full userspace
> state with a load and a couple of memory barriers instead of an IPI.

That's what I do now, it's like:

        <user code>

        state = KERNEL
        smp_mb()

        <kernel code>

        smp_mb()
        state = USER

        <user code>

vs

        <patch kernel code>
        smp_mb()
        if (state == USER)
                // then we're guaranteed any subsequent kernel code execution
                // will see the modified kernel code

more-or-less

> To make this work nicely, the low-order bits have to be 00 for kernel,
> and (say) 01 for idle and 10 for nohz_full userspace. 11 would be an
> error.
>
> The trick would be for rcu_user_enter() and rcu_user_exit() to atomically
> increment ->dynticks by 2, for rcu_nmi_exit() to increment by 1 and
> rcu_nmi_enter() to increment by 3. The state sampling would need to
> change accordingly.
>
> Does this make sense, or am I missing something?

Why doesn't the proposed patch work? Also, ISTR sampling of remote
context state coming up before. And as is, it's a weird mix between
context_tracking and rcu.

AFAICT there is very little useful in context_tracking as is, but it's
also very weird to have to ask RCU about this. Is there any way to slice
this code differently? Perhaps move some of the state RCU now keeps
into context_tracking ?

Anyway, lemme see if I get your proposal; let's say the counter starts at
0 and is in kernel space.

0x00(0) - kernel
0x02(2) - user
0x04(0) - kernel

So far so simple, then NMI on top of that goes:

0x00(0) - kernel
0x03(3) - kernel + nmi
0x04(0) - kernel
0x06(2) - user
0x09(1) - user + nmi
0x0a(2) - user

Which then gives us:

(0) := kernel
(1) := nmi-from-user
(2) := user
(3) := nmi-from-kernel

Which should work I suppose. But like I said above, I'd be happier if
this counter would live in context_tracking rather than RCU.

2021-09-22 19:49:27

by Peter Zijlstra

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 09:33:43PM +0200, Peter Zijlstra wrote:

> Anyway, lemme see if I get your proposal; let's say the counter starts at
> 0 and is in kernel space.
>
> 0x00(0) - kernel
> 0x02(2) - user
> 0x04(0) - kernel
>
> So far so simple, then NMI on top of that goes:
>
> 0x00(0) - kernel
> 0x03(3) - kernel + nmi
> 0x04(0) - kernel
> 0x06(2) - user
> 0x09(1) - user + nmi
> 0x0a(2) - user
>
> Which then gives us:
>
> (0) := kernel
> (1) := nmi-from-user
> (2) := user
> (3) := nmi-from-kernel
>
> Which should work I suppose. But like I said above, I'd be happier if
> this counter would live in context_tracking rather than RCU.

Furthermore, if we have this counter, then we can also do things like:

        seq = context_tracking_seq_cpu(that_cpu);
        if ((seq & 3) != USER)
                // nohz_fail, do something
        set_tsk_thread_flag(curr_task(that_cpu), TIF_DO_SOME_WORK);
        if (seq == context_tracking_seq_cpu(that_cpu))
                // success!!

To remotely set pending state. Allowing yet more NOHZ_FULL fixes, like,
for example, eliding the text_poke IPIs.
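
Fleshed out a little, such a helper might look like this (sketch only;
every name below is made up):

        static int context_tracking_set_cpu_work(int cpu, int work_flag)
        {
                unsigned int seq = context_tracking_seq_cpu(cpu);

                if ((seq & 3) != USER)
                        return -EBUSY;  /* in the kernel; need the IPI anyway */

                set_tsk_thread_flag(curr_task(cpu), work_flag);

                /*
                 * If the sequence is unchanged the CPU never left userspace,
                 * and the smp_mb() it must pass through on kernel entry
                 * orders the flag store before any kernel code.
                 */
                if (seq != context_tracking_seq_cpu(cpu))
                        return -EAGAIN; /* raced with an entry/exit */

                return 0;
        }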

2021-09-22 19:55:49

by Paul E. McKenney

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 09:33:43PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 22, 2021 at 08:17:21AM -0700, Paul E. McKenney wrote:
> > On Wed, Sep 22, 2021 at 01:05:12PM +0200, Peter Zijlstra wrote:
> > > Use rcu_user_{enter,exit}() calls to provide SMP ordering on context
> > > tracking state stores:
> > >
> > >   __context_tracking_exit()
> > >     __this_cpu_write(context_tracking.state, CONTEXT_KERNEL)
> > >     rcu_user_exit()
> > >       rcu_eqs_exit()
> > >         rcu_dynticks_eqs_exit()
> > >           rcu_dynticks_inc()
> > >             atomic_add_return() /* smp_mb */
> > >
> > >   __context_tracking_enter()
> > >     rcu_user_enter()
> > >       rcu_eqs_enter()
> > >         rcu_dynticks_eqs_enter()
> > >           rcu_dynticks_inc()
> > >             atomic_add_return() /* smp_mb */
> > >     __this_cpu_write(context_tracking.state, state)
> > >
> > > This separates the USER/KERNEL state stores with an smp_mb() on each
> > > side; therefore a user of context_tracking_state_cpu() knows the CPU
> > > must pass through an smp_mb() before the state can change.
> > >
> > > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> >
> > For the transformation to negative errno return value and name change
> > from an RCU perspective:
> >
> > Acked-by: Paul E. McKenney <[email protected]>
>
> Thanks!
>
> > For the sampling of nohz_full userspace state:
> >
> > Another approach is for the rcu_data structure's ->dynticks variable to
> > use the lower two bits to differentiate between idle, nohz_full userspace
> > and kernel. In theory, inlining should make this zero cost for idle
> > transition, and should allow you to safely sample nohz_full userspace
> > state with a load and a couple of memory barriers instead of an IPI.
>
> That's what I do now, it's like:
>
> 	<user code>
>
> 	state = KERNEL
> 	smp_mb()
>
> 	<kernel code>
>
> 	smp_mb()
> 	state = USER
>
> 	<user code>
>
> vs
>
> 	<patch kernel code>
> 	smp_mb()
> 	if (state == USER)
> 		// then we're guaranteed any subsequent kernel code execution
> 		// will see the modified kernel code
>
> more-or-less

OK.

> > To make this work nicely, the low-order bits have to be 00 for kernel,
> > and (say) 01 for idle and 10 for nohz_full userspace. 11 would be an
> > error.
> >
> > The trick would be for rcu_user_enter() and rcu_user_exit() to atomically
> > increment ->dynticks by 2, for rcu_nmi_exit() to increment by 1 and
> > rcu_nmi_enter() to increment by 3. The state sampling would need to
> > change accordingly.
> >
> > Does this make sense, or am I missing something?
>
> Why doesn't the proposed patch work? Also, ISTR sampling of remote
> context state coming up before. And as is, it's a weird mix between
> context_tracking and rcu.

I wasn't saying that the patch doesn't work. But doesn't it add an IPI?
Or was I looking at it too early this morning?

As to RCU's ->dynticks and context-tracking state, something about RCU
being there first by many years. ;-) Plus, does context-tracking
track idleness within the kernel? RCU needs that as well.

> AFAICT there is very little useful in context_tracking as is, but it's
> also very weird to have to ask RCU about this. Is there any way to slice
> this code differently? Perhaps move some of the state RCU now keeps
> into context_tracking ?
>
> Anyway, lemme see if I get your proposal; let's say the counter starts at
> 0 and is in kernel space.
>
> 0x00(0) - kernel
> 0x02(2) - user
> 0x04(0) - kernel
>
> So far so simple, then NMI on top of that goes:
>
> 0x00(0) - kernel
> 0x03(3) - kernel + nmi

This would stay 0x00 because the NMI is interrupting kernel code. The
check of rcu_dynticks_curr_cpu_in_eqs() avoids this additional increment.

> 0x04(0) - kernel

And same here, still zero.

> 0x06(2) - user

And now 0x02.

> 0x09(1) - user + nmi

This would be 0x04, back in the kernel. Which is the area of concern,
because the amount to increment depends on the counter value, requiring
an additional arithmetic operation on the from-idle fastpath. Probably
not visible even in microbenchmarks, but still a potential issue.
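
Something like this, roughly (sketch):

        /*
         * The eqs check already filters out NMIs that interrupt kernel
         * code; the remaining cases need a value-dependent increment to
         * get the low bits back to 00: +3 from idle (..01), +2 from
         * user (..10).
         */
        incby = 4 - (atomic_read(&rdp->dynticks) & 0x3);
        atomic_add_return(incby, &rdp->dynticks);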

> 0x0a(2) - user

Now 0x06, back in nohz_full userspace.

> Which then gives us:
>
> (0) := kernel
> (1) := nmi-from-user
> (2) := user
> (3) := nmi-from-kernel

You need to know NMI as a separate state?

> Which should work I suppose. But like I said above, I'd be happier if
> this counter would live in context_tracking rather than RCU.

This would be the first non-RCU user of the counter. The various
rcu_*_{enter,exit}() functions are still required, though, in order
to handle things like deferred wakeups. Plus RCU makes heavy use
of that counter. So it is not clear that moving the counter to the
context-tracking subsystem really buys you anything.

But it would be good to avoid maintaining duplicate information,
that is assuming that the information really is duplicate...

Thanx, Paul

2021-09-22 20:01:44

by Paul E. McKenney

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 09:47:59PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 22, 2021 at 09:33:43PM +0200, Peter Zijlstra wrote:
>
> > Anyway, lemme see if I get your proposal; let's say the counter starts at
> > 0 and is in kernel space.
> >
> > 0x00(0) - kernel
> > 0x02(2) - user
> > 0x04(0) - kernel
> >
> > So far so simple, then NMI on top of that goes:
> >
> > 0x00(0) - kernel
> > 0x03(3) - kernel + nmi
> > 0x04(0) - kernel
> > 0x06(2) - user
> > 0x09(1) - user + nmi
> > 0x0a(2) - user
> >
> > Which then gives us:
> >
> > (0) := kernel
> > (1) := nmi-from-user
> > (2) := user
> > (3) := nmi-from-kernel
> >
> > Which should work I suppose. But like I said above, I'd be happier if
> > this counter would live in context_tracking rather than RCU.
>
> Furthermore, if we have this counter, then we can also do things like:
>
> 	seq = context_tracking_seq_cpu(that_cpu);
> 	if ((seq & 3) != USER)
> 		// nohz_fail, do something
> 	set_tsk_thread_flag(curr_task(that_cpu), TIF_DO_SOME_WORK);
> 	if (seq == context_tracking_seq_cpu(that_cpu))
> 		// success!!
>
> To remotely set pending state. Allowing yet more NOHZ_FULL fixes, like,
> for example, eliding the text_poke IPIs.

Nice!

There have been several instances where I thought that the extra state
would help RCU, but each time there turned out to be a simpler way to
get things done. Or that it eventually turned out that RCU didn't need
to care about the difference between idle and nohz_full userspace.

Thanx, Paul

2021-09-22 20:05:04

by Peter Zijlstra

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 12:53:50PM -0700, Paul E. McKenney wrote:

> I wasn't saying that the patch doesn't work. But doesn't it add an IPI?
> Or was I looking at it too early this morning?

Ah, no. The patch allows a user-bound NOHZ_FULL task to be transitioned
remotely. Unlike today, where they'll eventually poke it with a signal
to force a kernel entry, which is bad m'kay :-)

The code in question skips transitioning running tasks, seeing as you
can't tell what they're doing etc. However, with context tracking on
you're supposedly able to tell they're in userspace without disturbing
them -- except you really can't today.

So if you can tell that a currently running task is in userspace (hence
my patch) you can allow the task to transition without any further ado;
userspace is a safe state vs kernel text patching.
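
Roughly (sketch; the real klp_check_task() change is in patch 7):

        if (task_curr(task)) {
                if (context_tracking_state_cpu(task_cpu(task)) != CONTEXT_USER)
                        return -EBUSY;  /* in the kernel; try again later */
                return 0;               /* userspace: safe vs text patching */
        }
        /* not running anywhere: fall back to the stack check */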

2021-09-23 12:11:57

by Petr Mladek

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed 2021-09-22 13:05:12, Peter Zijlstra wrote:
> Use rcu_user_{enter,exit}() calls to provide SMP ordering on context
> tracking state stores:
>
>   __context_tracking_exit()
>     __this_cpu_write(context_tracking.state, CONTEXT_KERNEL)
>     rcu_user_exit()
>       rcu_eqs_exit()
>         rcu_dynticks_eqs_exit()
>           rcu_dynticks_inc()
>             atomic_add_return() /* smp_mb */
>
>   __context_tracking_enter()
>     rcu_user_enter()
>       rcu_eqs_enter()
>         rcu_dynticks_eqs_enter()
>           rcu_dynticks_inc()
>             atomic_add_return() /* smp_mb */
>     __this_cpu_write(context_tracking.state, state)
>
> This separates the USER/KERNEL state stores with an smp_mb() on each
> side; therefore a user of context_tracking_state_cpu() knows the CPU
> must pass through an smp_mb() before the state can change.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
>  include/linux/context_tracking_state.h |   12 ++++++++++++
>  kernel/context_tracking.c              |    7 ++++---
>  2 files changed, 16 insertions(+), 3 deletions(-)
>
> --- a/include/linux/context_tracking_state.h
> +++ b/include/linux/context_tracking_state.h
> @@ -45,11 +45,23 @@ static __always_inline bool context_trac
>  {
>          return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
>  }
> +
> +static __always_inline bool context_tracking_state_cpu(int cpu)
> +{
> +        struct context_tracking *ct = per_cpu_ptr(&context_tracking);

Missing cpu parameter:

        struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);

> +
> +        if (!context_tracking_enabled() || !ct->active)
> +                return CONTEXT_DISABLED;
> +
> +        return ct->state;
> +}

Best Regards,
Petr

2021-09-25 06:20:30

by Joel Savitz

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Wed, Sep 22, 2021 at 01:05:12PM +0200, Peter Zijlstra wrote:
> ---
>  include/linux/context_tracking_state.h |   12 ++++++++++++
>  kernel/context_tracking.c              |    7 ++++---
>  2 files changed, 16 insertions(+), 3 deletions(-)
>
> --- a/include/linux/context_tracking_state.h
> +++ b/include/linux/context_tracking_state.h
> @@ -45,11 +45,23 @@ static __always_inline bool context_trac
>  {
>          return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
>  }
> +
> +static __always_inline bool context_tracking_state_cpu(int cpu)
> +{
> +        struct context_tracking *ct = per_cpu_ptr(&context_tracking);
> +
> +        if (!context_tracking_enabled() || !ct->active)
> +                return CONTEXT_DISABLED;
> +
> +        return ct->state;
> +}
> +
>  #else
>  static inline bool context_tracking_in_user(void) { return false; }
>  static inline bool context_tracking_enabled(void) { return false; }
>  static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
>  static inline bool context_tracking_enabled_this_cpu(void) { return false; }
> +static inline bool context_tracking_state_cpu(int cpu) { return CONTEXT_DISABLED; }
>  #endif /* CONFIG_CONTEXT_TRACKING */
>
>  #endif

Should context_tracking_state_cpu return an enum ctx_state rather than a
bool? It appears to be doing an implicit cast.
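
For reference, with the enum ctx_state values from
context_tracking_state.h the truncation looks like this:

        enum ctx_state {
                CONTEXT_DISABLED = -1,  /* (bool)-1 == true  */
                CONTEXT_KERNEL   =  0,  /* (bool) 0 == false */
                CONTEXT_USER,           /* (bool) 1 == true  */
                CONTEXT_GUEST,          /* (bool) 2 == true  */
        };

so once the return value has been truncated to bool,
"context_tracking_state_cpu(cpu) == CONTEXT_USER" is also true for
CONTEXT_DISABLED and CONTEXT_GUEST.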

I don't know if it is possible to run livepatch with
CONFIG_CONTEXT_TRACKING disabled, but if so, then klp_check_task() as
modified by patch 7 will always consider the transition complete even if
the current task is in kernel mode. Also, in the general case, the CPU
will consider the task complete if it has ctx_state CONTEXT_GUEST, though
the condition does not make that explicit.

I'm not sure what the correct behavior should be here, as I am not very
experienced with this subsystem, but the patch looks a bit odd to me.

> --- a/kernel/context_tracking.c
> +++ b/kernel/context_tracking.c
> @@ -82,7 +82,7 @@ void noinstr __context_tracking_enter(en
>                                  vtime_user_enter(current);
>                                  instrumentation_end();
>                          }
> -                        rcu_user_enter();
> +                        rcu_user_enter(); /* smp_mb */
>                  }
>                  /*
>                   * Even if context tracking is disabled on this CPU, because it's outside
> @@ -149,12 +149,14 @@ void noinstr __context_tracking_exit(enu
>                  return;
>
>          if (__this_cpu_read(context_tracking.state) == state) {
> +                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
> +
>                  if (__this_cpu_read(context_tracking.active)) {
>                          /*
>                           * We are going to run code that may use RCU. Inform
>                           * RCU core about that (ie: we may need the tick again).
>                           */
> -                        rcu_user_exit();
> +                        rcu_user_exit(); /* smp_mb */
>                          if (state == CONTEXT_USER) {
>                                  instrumentation_begin();
>                                  vtime_user_exit(current);
> @@ -162,7 +164,6 @@ void noinstr __context_tracking_exit(enu
>                                  instrumentation_end();
>                          }
>                  }
> -                __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
>          }
>          context_tracking_recursion_exit();
>  }
>
>

2021-09-27 14:35:13

by Petr Mladek

Subject: Re: [RFC][PATCH 6/7] context_tracking: Provide SMP ordering using RCU

On Fri 2021-09-24 14:57:05, Joel Savitz wrote:
> On Wed, Sep 22, 2021 at 01:05:12PM +0200, Peter Zijlstra wrote:
> > ---
> >  include/linux/context_tracking_state.h |   12 ++++++++++++
> >  kernel/context_tracking.c              |    7 ++++---
> >  2 files changed, 16 insertions(+), 3 deletions(-)
> >
> > --- a/include/linux/context_tracking_state.h
> > +++ b/include/linux/context_tracking_state.h
> > @@ -45,11 +45,23 @@ static __always_inline bool context_trac
> >  {
> >          return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
> >  }
> > +
> > +static __always_inline bool context_tracking_state_cpu(int cpu)
> > +{
> > +        struct context_tracking *ct = per_cpu_ptr(&context_tracking);
> > +
> > +        if (!context_tracking_enabled() || !ct->active)
> > +                return CONTEXT_DISABLED;
> > +
> > +        return ct->state;
> > +}
> > +
> >  #else
> >  static inline bool context_tracking_in_user(void) { return false; }
> >  static inline bool context_tracking_enabled(void) { return false; }
> >  static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
> >  static inline bool context_tracking_enabled_this_cpu(void) { return false; }
> > +static inline bool context_tracking_state_cpu(int cpu) { return CONTEXT_DISABLED; }
> >  #endif /* CONFIG_CONTEXT_TRACKING */
> >
> >  #endif
>
> Should context_tracking_state_cpu return an enum ctx_state rather than a
> bool? It appears to be doing an implicit cast.

Great catch!

> I don't know if it is possible to run livepatch with
> CONFIG_CONTEXT_TRACKING disabled,

It should work even with CONFIG_CONTEXT_TRACKING disabled. The original
code migrates the task only when it is not running on any CPU and no
patched functions are on the stack. The stack check also covers the
user context.

> modified by patch 7 will always consider the transition complete even if
> the current task is in kernel mode. Also, in the general case, the CPU
> will consider the task complete if it has ctx_state CONTEXT_GUEST, though
> the condition does not make that explicit.

Yup, we should avoid the enum -> bool cast, definitely.

Best Regards,
Petr