2022-05-20 00:33:32

by Frederic Weisbecker

[permalink] [raw]
Subject: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

Updating the context tracking state and the RCU dynticks counter
atomically in a single operation is a first step towards improving CPU
isolation. This makes the context tracking state updates fully ordered
and therefore allows for later enhancements such as postponing some work
while a task is running isolated in userspace until it eventually comes
back to the kernel.

The state field is now divided into two parts:

1) Two lower bits for the context tracking state:

CONTEXT_KERNEL = 0,
CONTEXT_IDLE = 1,
CONTEXT_USER = 2,
CONTEXT_GUEST = 3,

2) Higher bits for RCU eqs dynticks counting:

RCU_DYNTICKS_IDX = 4

The dynticks counter is always incremented by this value.
(state & RCU_DYNTICKS_IDX) means we are NOT in an extended quiescent
state. This makes a collision between two RCU dynticks snapshots more
likely, but wrapping around 28 bits of eqs dynticks increments still
takes some bad luck (also, rdp.dynticks_snap could be converted from
int to long?)

Some RCU eqs functions have been renamed to better reflect their broader
scope, which now includes the context tracking state.

Signed-off-by: Frederic Weisbecker <[email protected]>
Cc: Paul E. McKenney <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Neeraj Upadhyay <[email protected]>
Cc: Uladzislau Rezki <[email protected]>
Cc: Joel Fernandes <[email protected]>
Cc: Boqun Feng <[email protected]>
Cc: Nicolas Saenz Julienne <[email protected]>
Cc: Marcelo Tosatti <[email protected]>
Cc: Xiongfeng Wang <[email protected]>
Cc: Yu Liao <[email protected]>
Cc: Phil Auld <[email protected]>
Cc: Paul Gortmaker <[email protected]>
Cc: Alex Belits <[email protected]>
---
include/linux/context_tracking.h | 8 +-
include/linux/context_tracking_state.h | 35 ++++---
kernel/context_tracking.c | 128 ++++++++++++++++---------
kernel/rcu/tree.c | 13 ++-
kernel/rcu/tree_stall.h | 4 +-
5 files changed, 119 insertions(+), 69 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index a8c1db0a3f65..fd354eaea510 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -118,16 +118,16 @@ extern void ct_idle_exit(void);
*/
static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
- return !(arch_atomic_read(this_cpu_ptr(&context_tracking.dynticks)) & 0x1);
+ return !(arch_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX);
}

/*
- * Increment the current CPU's context_tracking structure's ->dynticks field
+ * Increment the current CPU's context_tracking structure's ->state field
* with ordering. Return the new value.
*/
-static __always_inline unsigned long rcu_dynticks_inc(int incby)
+static __always_inline unsigned long ct_state_inc(int incby)
{
- return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.dynticks));
+ return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
}

#else
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 1501df6d4cfa..580a525bfba7 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -10,12 +10,20 @@
#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)

enum ctx_state {
- CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */
- CONTEXT_KERNEL = 0,
- CONTEXT_USER,
- CONTEXT_GUEST,
+ CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */
+ CONTEXT_KERNEL = 0,
+ CONTEXT_IDLE = 1,
+ CONTEXT_USER = 2,
+ CONTEXT_GUEST = 3,
+ CONTEXT_MAX = 4,
};

+/* Even value for idle, else odd. */
+#define RCU_DYNTICKS_IDX CONTEXT_MAX
+
+#define CT_STATE_MASK (CONTEXT_MAX - 1)
+#define CT_DYNTICKS_MASK (~CT_STATE_MASK)
+
struct context_tracking {
#ifdef CONFIG_CONTEXT_TRACKING_USER
/*
@@ -26,10 +34,11 @@ struct context_tracking {
*/
bool active;
int recursion;
+#endif
+#ifdef CONFIG_CONTEXT_TRACKING
atomic_t state;
#endif
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
- atomic_t dynticks; /* Even value for idle, else odd. */
long dynticks_nesting; /* Track process nesting level. */
long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
#endif
@@ -37,24 +46,29 @@ struct context_tracking {

#ifdef CONFIG_CONTEXT_TRACKING
DECLARE_PER_CPU(struct context_tracking, context_tracking);
+
+static __always_inline int __ct_state(void)
+{
+ return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK;
+}
#endif

#ifdef CONFIG_CONTEXT_TRACKING_IDLE
static __always_inline int ct_dynticks(void)
{
- return atomic_read(this_cpu_ptr(&context_tracking.dynticks));
+ return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_DYNTICKS_MASK;
}

static __always_inline int ct_dynticks_cpu(int cpu)
{
struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
- return atomic_read(&ct->dynticks);
+ return atomic_read(&ct->state) & CT_DYNTICKS_MASK;
}

static __always_inline int ct_dynticks_cpu_acquire(int cpu)
{
struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
- return atomic_read_acquire(&ct->dynticks);
+ return atomic_read_acquire(&ct->state) & CT_DYNTICKS_MASK;
}

static __always_inline long ct_dynticks_nesting(void)
@@ -98,11 +112,6 @@ static inline bool context_tracking_enabled_this_cpu(void)
return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
}

-static __always_inline int __ct_state(void)
-{
- return atomic_read(this_cpu_ptr(&context_tracking.state));
-}
-
/**
* ct_state() - return the current context tracking state if known
*
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 30a3d4c8c045..05723ba8bbf0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -28,8 +28,8 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(1),
#endif
+ .state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
};
EXPORT_SYMBOL_GPL(context_tracking);

@@ -76,7 +76,7 @@ static __always_inline void rcu_dynticks_task_trace_exit(void)
* RCU is watching prior to the call to this function and is no longer
* watching upon return.
*/
-static noinstr void rcu_dynticks_eqs_enter(void)
+static noinstr void ct_kernel_exit_state(int offset)
{
int seq;

@@ -86,9 +86,9 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* next idle sojourn.
*/
rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = rcu_dynticks_inc(1);
+ seq = ct_state_inc(offset);
// RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
}

/*
@@ -96,7 +96,7 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* called from an extended quiescent state, that is, RCU is not watching
* prior to the call to this function and is watching upon return.
*/
-static noinstr void rcu_dynticks_eqs_exit(void)
+static noinstr void ct_kernel_enter_state(int offset)
{
int seq;

@@ -105,10 +105,10 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = rcu_dynticks_inc(1);
+ seq = ct_state_inc(offset);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
}

/*
@@ -119,7 +119,7 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
*/
-static void noinstr rcu_eqs_enter(bool user)
+static void noinstr ct_kernel_exit(bool user, int offset)
{
struct context_tracking *ct = this_cpu_ptr(&context_tracking);

@@ -139,13 +139,13 @@ static void noinstr rcu_eqs_enter(bool user)
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rcu_preempt_deferred_qs(current);

- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));

instrumentation_end();
WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
// RCU is watching here ...
- rcu_dynticks_eqs_enter();
+ ct_kernel_exit_state(offset);
// ... but is no longer watching here.
rcu_dynticks_task_enter();
}
@@ -158,7 +158,7 @@ static void noinstr rcu_eqs_enter(bool user)
* allow for the possibility of usermode upcalls messing up our count of
* interrupt nesting level during the busy period that is just now starting.
*/
-static void noinstr rcu_eqs_exit(bool user)
+static void noinstr ct_kernel_enter(bool user, int offset)
{
struct context_tracking *ct = this_cpu_ptr(&context_tracking);
long oldval;
@@ -173,12 +173,12 @@ static void noinstr rcu_eqs_exit(bool user)
}
rcu_dynticks_task_exit();
// RCU is not watching here ...
- rcu_dynticks_eqs_exit();
+ ct_kernel_enter_state(offset);
// ... but is watching here.
instrumentation_begin();

- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));

trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
@@ -192,7 +192,7 @@ static void noinstr rcu_eqs_exit(bool user)
* ct_nmi_exit - inform RCU of exit from NMI context
*
* If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update ct->dynticks and ct->dynticks_nmi_nesting
+ * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
*
@@ -229,12 +229,12 @@ void noinstr ct_nmi_exit(void)
trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+ // instrumentation for the noinstr ct_kernel_exit_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));
instrumentation_end();

// RCU is watching here ...
- rcu_dynticks_eqs_enter();
+ ct_kernel_exit_state(RCU_DYNTICKS_IDX);
// ... but is no longer watching here.

if (!in_nmi())
@@ -244,7 +244,7 @@ void noinstr ct_nmi_exit(void)
/**
* ct_nmi_enter - inform RCU of entry to NMI context
*
- * If the CPU was idle from RCU's viewpoint, update ct->dynticks and
+ * If the CPU was idle from RCU's viewpoint, update ct->state and
* ct->dynticks_nmi_nesting to let the RCU grace-period handling know
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
@@ -275,14 +275,14 @@ void noinstr ct_nmi_enter(void)
rcu_dynticks_task_exit();

// RCU is not watching here ...
- rcu_dynticks_eqs_exit();
+ ct_kernel_enter_state(RCU_DYNTICKS_IDX);
// ... but is watching here.

instrumentation_begin();
// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
- instrument_atomic_read(&ct->dynticks, sizeof(ct->dynticks));
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&ct->dynticks, sizeof(ct->dynticks));
+ instrument_atomic_read(&ct->state, sizeof(ct->state));
+ // instrumentation for the noinstr ct_kernel_enter_state()
+ instrument_atomic_write(&ct->state, sizeof(ct->state));

incby = 1;
} else if (!in_nmi()) {
@@ -315,7 +315,7 @@ void noinstr ct_nmi_enter(void)
void noinstr ct_idle_enter(void)
{
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
- rcu_eqs_enter(false);
+ ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
}
EXPORT_SYMBOL_GPL(ct_idle_enter);

@@ -333,7 +333,7 @@ void noinstr ct_idle_exit(void)
unsigned long flags;

raw_local_irq_save(flags);
- rcu_eqs_exit(false);
+ ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ct_idle_exit);
@@ -490,28 +490,49 @@ void noinstr __ct_user_enter(enum ctx_state state)
* that will fire and reschedule once we resume in user/guest mode.
*/
rcu_irq_work_resched();
+
/*
* Enter RCU idle mode right before resuming userspace. No use of RCU
* is permitted between this call and rcu_eqs_exit(). This way the
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
*/
- rcu_eqs_enter(true);
+ ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Even if context tracking is disabled on this CPU, because it's outside
+ * the full dynticks mask for example, we still have to keep track of the
+ * context transitions and states to prevent inconsistency on those of
+ * other CPUs.
+ * If a task triggers an exception in userspace, sleep on the exception
+ * handler and then migrate to another CPU, that new CPU must know where
+ * the exception returns by the time we call exception_exit().
+ * This information can only be provided by the previous CPU when it called
+ * exception_enter().
+ * OTOH we can spare the calls to vtime and RCU when context_tracking.active
+ * is false because we know that CPU is not tickless.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, state);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_add(state, &ct->state);
+ }
}
- /*
- * Even if context tracking is disabled on this CPU, because it's outside
- * the full dynticks mask for example, we still have to keep track of the
- * context transitions and states to prevent inconsistency on those of
- * other CPUs.
- * If a task triggers an exception in userspace, sleep on the exception
- * handler and then migrate to another CPU, that new CPU must know where
- * the exception returns by the time we call exception_exit().
- * This information can only be provided by the previous CPU when it called
- * exception_enter().
- * OTOH we can spare the calls to vtime and RCU when context_tracking.active
- * is false because we know that CPU is not tickless.
- */
- atomic_set(&ct->state, state);
}
context_tracking_recursion_exit();
}
@@ -583,15 +604,36 @@ void noinstr __ct_user_exit(enum ctx_state state)
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
*/
- rcu_eqs_exit(true);
+ ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
if (state == CONTEXT_USER) {
instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
instrumentation_end();
}
+
+ /*
+ * Special case if we only track user <-> kernel transitions for tickless
+ * cputime accounting but we don't support RCU extended quiescent state.
+ * In this case we don't care about any concurrency/ordering.
+ */
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+
+ } else {
+ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
+ /* Tracking for vtime only, no concurrent RCU EQS accounting */
+ atomic_set(&ct->state, CONTEXT_KERNEL);
+ } else {
+ /*
+ * Tracking for vtime and RCU EQS. Make sure we don't race
+ * with NMIs. OTOH we don't care about ordering here since
+ * RCU only requires RCU_DYNTICKS_IDX increments to be fully
+ * ordered.
+ */
+ atomic_sub(state, &ct->state);
+ }
}
- atomic_set(&ct->state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bd33b7b5cc9d..83a9aeed5409 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -276,9 +276,9 @@ void rcu_softirq_qs(void)
*/
static void rcu_dynticks_eqs_online(void)
{
- if (ct_dynticks() & 0x1)
+ if (ct_dynticks() & RCU_DYNTICKS_IDX)
return;
- rcu_dynticks_inc(1);
+ ct_state_inc(RCU_DYNTICKS_IDX);
}

/*
@@ -297,7 +297,7 @@ static int rcu_dynticks_snap(int cpu)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & 0x1);
+ return !(snap & RCU_DYNTICKS_IDX);
}

/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
@@ -325,8 +325,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
int snap;

// If not quiescent, force back to earlier extended quiescent state.
- snap = ct_dynticks_cpu(cpu) & ~0x1;
-
+ snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
@@ -352,9 +351,9 @@ notrace void rcu_momentary_dyntick_idle(void)
int seq;

raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- seq = rcu_dynticks_inc(2);
+ seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(seq & 0x1));
+ WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 91e4fd4db12d..c3fbbcc09327 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -469,7 +469,7 @@ static void print_cpu_stall_info(int cpu)
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
sprintf(buf, " rcuc=%ld jiffies(starved)", j);
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -478,7 +478,7 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(cpu) & 0xfff,
+ rcu_dynticks_snap(cpu) & 0xffff,
ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
--
2.25.1



2022-06-01 19:51:11

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

On Mon, May 30, 2022 at 08:02:57PM +0200, nicolas saenz julienne wrote:
> Hi Frederic,
>
> On Thu, 2022-05-19 at 16:58 +0200, Frederic Weisbecker wrote:
> > Updating the context tracking state and the RCU dynticks counter
> > atomically in a single operation is a first step towards improving CPU
> > isolation. This makes the context tracking state updates fully ordered
> > and therefore allow for later enhancements such as postponing some work
> > while a task is running isolated in userspace until it ever comes back
> > to the kernel.
> >
> > The state field becomes divided in two parts:
> >
> > 1) Two Lower bits for context tracking state:
> >
> > CONTEXT_KERNEL = 0
> > CONTEXT_IDLE = 1,
> > CONTEXT_USER = 2,
> > CONTEXT_GUEST = 3,
> >
> > 2) Higher bits for RCU eqs dynticks counting:
> >
> > RCU_DYNTICKS_IDX = 4
> >
> > The dynticks counting is always incremented by this value.
> > (state & RCU_DYNTICKS_IDX) means we are NOT in an extended quiescent
> > state. This makes the chance for a collision more likely between two
> > RCU dynticks snapshots but wrapping up 28 bits of eqs dynticks
> > increments still takes some bad luck (also rdp.dynticks_snap could be
> > converted from int to long?)
> >
> > Some RCU eqs functions have been renamed to better reflect their broader
> > scope that now include context tracking state.
> >
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > Cc: Paul E. McKenney <[email protected]>
> > Cc: Peter Zijlstra <[email protected]>
> > Cc: Thomas Gleixner <[email protected]>
> > Cc: Neeraj Upadhyay <[email protected]>
> > Cc: Uladzislau Rezki <[email protected]>
> > Cc: Joel Fernandes <[email protected]>
> > Cc: Boqun Feng <[email protected]>
> > Cc: Nicolas Saenz Julienne <[email protected]>
> > Cc: Marcelo Tosatti <[email protected]>
> > Cc: Xiongfeng Wang <[email protected]>
> > Cc: Yu Liao<[email protected]>
> > Cc: Phil Auld <[email protected]>
> > Cc: Paul Gortmaker<[email protected]>
> > Cc: Alex Belits <[email protected]>
> > ---
>
> While working on a feature on top of this series (IPI deferral stuff) I believe
> I've found a discrepancy on how context state is being updated:
>
> - When servicing an IRQ from user-space, we increment dynticks, and clear the
> ct state to show we're in-kernel.
>
> - When servicing an IRQ from idle/guest or an NMI from any context we only
> increment the dynticks counter. The ct state remains unchanged.

Hmm, an IRQ from userspace does:

ct_user_enter()
//run in user
//-----IRQ
ct_user_exit()
ct_irq_enter()
ct_irq_exit()
ct_user_enter()
//run in user

An IRQ from guest does:

for (;;) {
context_tracking_guest_enter()
//vmrun
//IRQ pending
#VMEXIT
context_tracking_guest_exit()
local_irq_enable()
ct_irq_enter()
ct_irq_exit()
local_irq_disable()
}


(although I see there is an "sti" right before "vmrun", so it looks
possible to have ct_irq_enter() after context_tracking_guest_enter()
if a host IRQ fires between the sti and the vmrun, though I might be
missing some KVM subtlety).

An IRQ from idle does just:

ct_idle_enter()
//IRQ
ct_irq_enter()
ct_irq_exit()
ct_idle_exit()

So guest looks mostly ok to me (except for the little sti before vmrun for
which I have a doubt). But idle at least is an exception and CONTEXT_IDLE will
remain during the interrupt handling. It's not that trivial to handle the idle
case because ct_irq_exit() needs to know that it is called between
ct_idle_enter() and ct_idle_exit().

Thanks.

2022-06-01 20:13:56

by nicolas saenz julienne

[permalink] [raw]
Subject: Re: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

Hi Frederic,

On Thu, 2022-05-19 at 16:58 +0200, Frederic Weisbecker wrote:
> Updating the context tracking state and the RCU dynticks counter
> atomically in a single operation is a first step towards improving CPU
> isolation. This makes the context tracking state updates fully ordered
> and therefore allow for later enhancements such as postponing some work
> while a task is running isolated in userspace until it ever comes back
> to the kernel.
>
> The state field becomes divided in two parts:
>
> 1) Two Lower bits for context tracking state:
>
> CONTEXT_KERNEL = 0
> CONTEXT_IDLE = 1,
> CONTEXT_USER = 2,
> CONTEXT_GUEST = 3,
>
> 2) Higher bits for RCU eqs dynticks counting:
>
> RCU_DYNTICKS_IDX = 4
>
> The dynticks counting is always incremented by this value.
> (state & RCU_DYNTICKS_IDX) means we are NOT in an extended quiescent
> state. This makes the chance for a collision more likely between two
> RCU dynticks snapshots but wrapping up 28 bits of eqs dynticks
> increments still takes some bad luck (also rdp.dynticks_snap could be
> converted from int to long?)
>
> Some RCU eqs functions have been renamed to better reflect their broader
> scope that now include context tracking state.
>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> Cc: Paul E. McKenney <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Neeraj Upadhyay <[email protected]>
> Cc: Uladzislau Rezki <[email protected]>
> Cc: Joel Fernandes <[email protected]>
> Cc: Boqun Feng <[email protected]>
> Cc: Nicolas Saenz Julienne <[email protected]>
> Cc: Marcelo Tosatti <[email protected]>
> Cc: Xiongfeng Wang <[email protected]>
> Cc: Yu Liao<[email protected]>
> Cc: Phil Auld <[email protected]>
> Cc: Paul Gortmaker<[email protected]>
> Cc: Alex Belits <[email protected]>
> ---

While working on a feature on top of this series (IPI deferral stuff) I believe
I've found a discrepancy in how the context state is being updated:

- When servicing an IRQ from user-space, we increment dynticks, and clear the
ct state to show we're in-kernel.

- When servicing an IRQ from idle/guest or an NMI from any context we only
increment the dynticks counter. The ct state remains unchanged.

Regards,
Nicolas

2022-06-01 20:14:50

by nicolas saenz julienne

[permalink] [raw]
Subject: Re: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

On Tue, 2022-05-31 at 16:23 +0200, Frederic Weisbecker wrote:
> On Mon, May 30, 2022 at 08:02:57PM +0200, nicolas saenz julienne wrote:
> > Hi Frederic,
> >
> > On Thu, 2022-05-19 at 16:58 +0200, Frederic Weisbecker wrote:
> > > Updating the context tracking state and the RCU dynticks counter
> > > atomically in a single operation is a first step towards improving CPU
> > > isolation. This makes the context tracking state updates fully ordered
> > > and therefore allow for later enhancements such as postponing some work
> > > while a task is running isolated in userspace until it ever comes back
> > > to the kernel.
> > >
> > > The state field becomes divided in two parts:
> > >
> > > 1) Two Lower bits for context tracking state:
> > >
> > > CONTEXT_KERNEL = 0
> > > CONTEXT_IDLE = 1,
> > > CONTEXT_USER = 2,
> > > CONTEXT_GUEST = 3,
> > >
> > > 2) Higher bits for RCU eqs dynticks counting:
> > >
> > > RCU_DYNTICKS_IDX = 4
> > >
> > > The dynticks counting is always incremented by this value.
> > > (state & RCU_DYNTICKS_IDX) means we are NOT in an extended quiescent
> > > state. This makes the chance for a collision more likely between two
> > > RCU dynticks snapshots but wrapping up 28 bits of eqs dynticks
> > > increments still takes some bad luck (also rdp.dynticks_snap could be
> > > converted from int to long?)
> > >
> > > Some RCU eqs functions have been renamed to better reflect their broader
> > > scope that now include context tracking state.
> > >
> > > Signed-off-by: Frederic Weisbecker <[email protected]>
> > > Cc: Paul E. McKenney <[email protected]>
> > > Cc: Peter Zijlstra <[email protected]>
> > > Cc: Thomas Gleixner <[email protected]>
> > > Cc: Neeraj Upadhyay <[email protected]>
> > > Cc: Uladzislau Rezki <[email protected]>
> > > Cc: Joel Fernandes <[email protected]>
> > > Cc: Boqun Feng <[email protected]>
> > > Cc: Nicolas Saenz Julienne <[email protected]>
> > > Cc: Marcelo Tosatti <[email protected]>
> > > Cc: Xiongfeng Wang <[email protected]>
> > > Cc: Yu Liao<[email protected]>
> > > Cc: Phil Auld <[email protected]>
> > > Cc: Paul Gortmaker<[email protected]>
> > > Cc: Alex Belits <[email protected]>
> > > ---
> >
> > While working on a feature on top of this series (IPI deferral stuff) I believe
> > I've found a discrepancy on how context state is being updated:
> >
> > - When servicing an IRQ from user-space, we increment dynticks, and clear the
> > ct state to show we're in-kernel.
> >
> > - When servicing an IRQ from idle/guest or an NMI from any context we only
> > increment the dynticks counter. The ct state remains unchanged.
>
> Hmm, an IRQ from userspace does:
>
> ct_user_enter()
> //run in user
> //-----IRQ
> ct_user_exit()
> ct_irq_enter()
> ct_irq_exit()
> ct_user_enter()
> //run in user
>
> An IRQ from guest does:
>
> for (;;) {
> context_tracking_guest_enter()
> //vmrun
> //IRQ pending
> #VMEXIT
> context_tracking_guest_exit()
> local_irq_enable()
> ct_irq_enter()
> ct_irq_exit()
> local_irq_disable()
> }
>
>
> (although I see there is an "sti" right before "vmrun" so it looks
> possible to have ct_irq_enter() after context_tracking_guest_enter()
> if a host IRQ fires between the sti and the vmrun though I might be
> missing some kvm subtelty).
>
> An IRQ from idle does just:
>
> ct_idle_enter()
> //IRQ
> ct_irq_enter()
> ct_irq_exit()
> ct_idle_exit()
>
> So guest looks mostly ok to me (except for the little sti before vmrun for
> which I have a doubt).

Yes, shouldn't have mentioned guests. I got carried away.

> But idle at least is an exception and CONTEXT_IDLE will remain during the
> interrupt handling. It's not that trivial to handle the idle case because
> ct_irq_exit() needs to know that it is called between ct_idle_enter() and
> ct_idle_exit().

Just for the record, this behaviour was already here regardless of this series,
so it's not something it needs to fix.

Something like this should work, right?

ct_idle_enter()
//IRQ or NMI
if (__ct_state() == CONTEXT_IDLE)
ct_idle_exit()
ct_irq_enter()
...
ct_irq_exit()
if (needs_update_state()) //using irqentry_state_t for ex.
ct_idle_enter()
ct_idle_exit()

Note that it's not a big issue as we can work around this behaviour by checking
through dynticks whether a CPU is really idle.

Do you think it's worth fixing nonetheless?

Regards,
Nicolas

2022-06-08 14:57:26

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

On Tue, May 31, 2022 at 06:15:36PM +0200, nicolas saenz julienne wrote:
> On Tue, 2022-05-31 at 16:23 +0200, Frederic Weisbecker wrote:
> > But idle at least is an exception and CONTEXT_IDLE will remain during the
> > interrupt handling. It's not that trivial to handle the idle case because
> > ct_irq_exit() needs to know that it is called between ct_idle_enter() and
> > ct_idle_exit().
>
> Just for the record, this behaviour was already here regardless of this series,
> so it's not something it needs to fix.

Right.

>
> Something like this should work, right?
>
> ct_idle_enter()
> //IRQ or NMI
> if (__ct_state() == CONTEXT_IDLE)
> ct_idle_exit()

Right but that's one more costly operation (atomic_add_return())

> ct_irq_enter()

Ideally this should increment by RCU_DYNTICKS_IDX - CONTEXT_IDLE

> ...
> ct_irq_exit()

And this should increment by RCU_DYNTICKS_IDX + CONTEXT_IDLE

I guess the CONTEXT_IDLE state should be remembered in some per-CPU
variable somewhere.

BTW one interesting optimization to do when an idle interrupt leads to
setting need_resched() would be to have:

idle_loop() {
while (!need_resched) {
rcu_idle_enter();
mwait();
//IRQ {
rcu_irq_enter();
do_irq()... //set need_resched()
rcu_irq_exit() // but no need to do the atomic_add_return() here
// since we want to keep RCU watching as we'll
// escape from idle
}
rcu_idle_exit() // and no need to do the atomic_add_return() here either


That's two expensive operations spared for a pretty common event.



> if (needs_update_state()) //using irqentry_state_t for ex.
> ct_idle_entry()
> ct_idle_exit()
>
> Note that it's not a big issue as we can work around this behaviour by checking
> through dynticks whether a CPU is really idle.
>
> Do you think it's worth fixing nonetheless?

Nothing urgent for sure.

>
> Regards,
> Nicolas

2022-06-08 18:19:08

by nicolas saenz julienne

[permalink] [raw]
Subject: Re: [PATCH 20/21] rcu/context_tracking: Merge dynticks counter and context tracking states

Hi Frederic,

On Wed, 2022-06-08 at 16:29 +0200, Frederic Weisbecker wrote:
> On Tue, May 31, 2022 at 06:15:36PM +0200, nicolas saenz julienne wrote:
> > On Tue, 2022-05-31 at 16:23 +0200, Frederic Weisbecker wrote:
> > > But idle at least is an exception and CONTEXT_IDLE will remain during the
> > > interrupt handling. It's not that trivial to handle the idle case because
> > > ct_irq_exit() needs to know that it is called between ct_idle_enter() and
> > > ct_idle_exit().
> >
> > Just for the record, this behaviour was already here regardless of this series,
> > so it's not something it needs to fix.
>
> Right.
>
> >
> > Something like this should work, right?
> >
> > ct_idle_enter()
> > //IRQ or NMI
> > if (__ct_state() == CONTEXT_IDLE)
> > ct_idle_exit()
>
> Right but that's one more costly operation (atomic_add_return())
>
> > ct_irq_enter()
>
> Ideally this should increment by RCU_DYNTICKS_IDX - CONTEXT_IDLE
>
> > ...
> > ct_irq_exit()
>
> And this should increment by RCU_DYNTICKS_IDX + CONTEXT_IDLE
>
> I guess the CONTEXT_IDLE state should be remembered on some per cpu
> variable somewhere.
>
> BTW one interesting optimization to do when an idle interrupt leads to
> setting need_resched() would be to have:
>
> idle_loop() {
> while (!need_resched) {
> rcu_idle_enter();
> mwait();
> //IRQ {
> rcu_irq_enter();
> do_irq()... //set need_resched()
> rcu_irq_exit() // but no need to do the atomic_add_return() here
> // since we want to keep RCU watching as we'll
> // escape from idle
> }
> rcu_idle_exit() // and no need to do the atomic_add_return() here either
>
>
> That's two expensive operations spared for a pretty common event.

>
>
>
> > if (needs_update_state()) //using irqentry_state_t for ex.
> > ct_idle_entry()
> > ct_idle_exit()
> >
> > Note that it's not a big issue as we can work around this behaviour by checking
> > through dynticks whether a CPU is really idle.
> >
> > Do you think it's worth fixing nonetheless?
>
> Nothing urgent for sure.

Thanks for the feedback, I'll think about it.

BTW if you're patient I'll try to make a last test run on v4 of the series next
week.

Regards,
Nicolas