2022-03-16 13:37:08

by Marcelo Tosatti

[permalink] [raw]
Subject: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

If a thread has task isolation activated, is preempted by thread B,
which marks vmstat information dirty, and is preempted back in,
one might return to userspace with vmstat dirty information on the
CPU in question.

To address this problem, add a preempt notifier that transfers vmstat dirty
information to TIF_TASK_ISOL thread flag.

Signed-off-by: Marcelo Tosatti <[email protected]>

---

v12:
- switch from raw_cpu_read to __this_cpu_read (Frederic)

---
include/linux/task_isolation.h | 2 ++
include/linux/vmstat.h | 6 ++++++
kernel/task_isolation.c | 23 +++++++++++++++++++++++
mm/vmstat.c | 7 +++++++
4 files changed, 38 insertions(+)

Index: linux-2.6/kernel/task_isolation.c
===================================================================
--- linux-2.6.orig/kernel/task_isolation.c
+++ linux-2.6/kernel/task_isolation.c
@@ -19,6 +19,7 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
+#include <linux/preempt.h>
#include <linux/task_isolation.h>

void __task_isol_exit(struct task_struct *tsk)
@@ -30,6 +31,9 @@ void __task_isol_exit(struct task_struct
return;

static_key_slow_dec(&vmstat_sync_enabled);
+
+ preempt_notifier_unregister(&i->preempt_notifier);
+ preempt_notifier_dec();
}

void __task_isol_free(struct task_struct *tsk)
@@ -40,6 +44,21 @@ void __task_isol_free(struct task_struct
tsk->task_isol_info = NULL;
}

+static void task_isol_sched_in(struct preempt_notifier *pn, int cpu)
+{
+ vmstat_dirty_to_thread_flag();
+}
+
+static void task_isol_sched_out(struct preempt_notifier *pn,
+ struct task_struct *next)
+{
+}
+
+static __read_mostly struct preempt_ops task_isol_preempt_ops = {
+ .sched_in = task_isol_sched_in,
+ .sched_out = task_isol_sched_out,
+};
+
static struct task_isol_info *task_isol_alloc_context(void)
{
struct task_isol_info *info;
@@ -48,6 +67,10 @@ static struct task_isol_info *task_isol_
if (unlikely(!info))
return ERR_PTR(-ENOMEM);

+ preempt_notifier_inc();
+ preempt_notifier_init(&info->preempt_notifier, &task_isol_preempt_ops);
+ preempt_notifier_register(&info->preempt_notifier);
+
preempt_disable();
init_sync_vmstat();
preempt_enable();
Index: linux-2.6/include/linux/task_isolation.h
===================================================================
--- linux-2.6.orig/include/linux/task_isolation.h
+++ linux-2.6/include/linux/task_isolation.h
@@ -17,6 +17,8 @@ struct task_isol_info {
u64 oneshot_mask;

u8 inherit_mask;
+
+ struct preempt_notifier preempt_notifier;
};

extern void __task_isol_free(struct task_struct *tsk);
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -34,10 +34,16 @@ static inline void sync_vmstat(void)
}

void init_sync_vmstat(void);
+
+void vmstat_dirty_to_thread_flag(void);
#else
static inline void sync_vmstat(void)
{
}
+
+static inline void vmstat_dirty_to_thread_flag(void)
+{
+}
#endif

struct reclaim_stat {
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -353,6 +353,13 @@ void init_sync_vmstat(void)
set_thread_flag(TIF_TASK_ISOL);
}
EXPORT_SYMBOL_GPL(vmstat_dirty);
+
+void vmstat_dirty_to_thread_flag(void)
+{
+ if (__this_cpu_read(vmstat_dirty) == true)
+ set_thread_flag(TIF_TASK_ISOL);
+}
+EXPORT_SYMBOL_GPL(vmstat_dirty_to_thread_flag);
#else
static inline void mark_vmstat_dirty(void)
{



2022-03-17 06:09:22

by Oscar Shiang

[permalink] [raw]
Subject: Re: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

On Mar 15, 2022, at 11:31 PM, Marcelo Tosatti <[email protected]> wrote:
> If a thread has task isolation activated, is preempted by thread B,
> which marks vmstat information dirty, and is preempted back in,
> one might return to userspace with vmstat dirty information on the
> CPU in question.
>
> To address this problem, add a preempt notifier that transfers vmstat dirty
> information to TIF_TASK_ISOL thread flag.
>
> Signed-off-by: Marcelo Tosatti <[email protected]>
>
> ---
>
> v12:
> - switch from raw_cpu_read to __this_cpu_read (Frederic)
>
> ---
> include/linux/task_isolation.h | 2 ++
> include/linux/vmstat.h | 6 ++++++
> kernel/task_isolation.c | 23 +++++++++++++++++++++++
> mm/vmstat.c | 7 +++++++
> 4 files changed, 38 insertions(+)
>
> Index: linux-2.6/kernel/task_isolation.c
> ===================================================================
> --- linux-2.6.orig/kernel/task_isolation.c
> +++ linux-2.6/kernel/task_isolation.c
> @@ -19,6 +19,7 @@
> #include <linux/sched/task.h>
> #include <linux/mm.h>
> #include <linux/vmstat.h>
> +#include <linux/preempt.h>
> #include <linux/task_isolation.h>
>
> void __task_isol_exit(struct task_struct *tsk)
> @@ -30,6 +31,9 @@ void __task_isol_exit(struct task_struct
> return;
>
> static_key_slow_dec(&vmstat_sync_enabled);
> +
> + preempt_notifier_unregister(&i->preempt_notifier);
> + preempt_notifier_dec();
> }
>
> void __task_isol_free(struct task_struct *tsk)
> @@ -40,6 +44,21 @@ void __task_isol_free(struct task_struct
> tsk->task_isol_info = NULL;
> }
>
> +static void task_isol_sched_in(struct preempt_notifier *pn, int cpu)
> +{
> + vmstat_dirty_to_thread_flag();
> +}
> +
> +static void task_isol_sched_out(struct preempt_notifier *pn,
> + struct task_struct *next)
> +{
> +}
> +
> +static __read_mostly struct preempt_ops task_isol_preempt_ops = {
> + .sched_in = task_isol_sched_in,
> + .sched_out = task_isol_sched_out,
> +};
> +
> static struct task_isol_info *task_isol_alloc_context(void)
> {
> struct task_isol_info *info;
> @@ -48,6 +67,10 @@ static struct task_isol_info *task_isol_
> if (unlikely(!info))
> return ERR_PTR(-ENOMEM);
>
> + preempt_notifier_inc();
> + preempt_notifier_init(&info->preempt_notifier, &task_isol_preempt_ops);
> + preempt_notifier_register(&info->preempt_notifier);
> +
> preempt_disable();
> init_sync_vmstat();
> preempt_enable();
> Index: linux-2.6/include/linux/task_isolation.h
> ===================================================================
> --- linux-2.6.orig/include/linux/task_isolation.h
> +++ linux-2.6/include/linux/task_isolation.h
> @@ -17,6 +17,8 @@ struct task_isol_info {
> u64 oneshot_mask;
>
> u8 inherit_mask;
> +
> + struct preempt_notifier preempt_notifier;

Since preempt_notifier is visible only when CONFIG_KVM is enabled,
I think we can add KVM to the dependencies of CONFIG_TASK_ISOLATION.

Or we could encounter a missing definition error while building without
CONFIG_KVM.

Thanks,
Oscar

2022-04-27 10:35:45

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

On Tue, Mar 15 2022 at 12:31, Marcelo Tosatti wrote:
> If a thread has task isolation activated, is preempted by thread B,
> which marks vmstat information dirty, and is preempted back in,
> one might return to userspace with vmstat dirty information on the
> CPU in question.
>
> To address this problem, add a preempt notifier that transfers vmstat dirty
> information to TIF_TASK_ISOL thread flag.

How does this compile with CONFIG_KVM=n?

Thanks,

tglx

2022-04-27 12:34:57

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

On Wed, Apr 27 2022 at 09:11, Thomas Gleixner wrote:
> On Tue, Mar 15 2022 at 12:31, Marcelo Tosatti wrote:
>> If a thread has task isolation activated, is preempted by thread B,
>> which marks vmstat information dirty, and is preempted back in,
>> one might return to userspace with vmstat dirty information on the
>> CPU in question.
>>
>> To address this problem, add a preempt notifier that transfers vmstat dirty
>> information to TIF_TASK_ISOL thread flag.
>
> How does this compile with CONFIG_KVM=n?

Aside of that, the existence of this preempt notifier alone tells me
that this is either a design fail or has no design in the first place.

The state of vmstat does not matter at all at the point where a task is
scheduled in. It matters when an isolated task goes out to user space or
enters a VM.

We already have something similar in the exit to user path:

tick_nohz_user_enter_prepare()

So you can do something like the below and have:

static inline void task_isol_exit_to_user_prepare(void)
{
if (unlikely(current_needs_isol_exit_to_user())
__task_isol_exit_to_user_prepare();
}

where current_needs_isol_exit_to_user() is a simple check of either an
existing mechanism like

task->syscall_work & SYSCALL_WORK_TASK_ISOL_EXIT

or of some new task isolation specific member of task_struct which is
placed so it is cache hot at that point:

task->isol_work & SYSCALL_TASK_ISOL_EXIT

which is going to be almost zero overhead for any non isolated task.

It's trivial enough to encode the real stuff into task->isol_work and
I'm pretty sure, that a 32bit member is sufficient for that. There is
absolutely no need for a potential 64x64 bit feature matrix.

Thanks,

tglx
---
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -142,6 +142,12 @@ void noinstr exit_to_user_mode(void)
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

+static void exit_to_user_update_work(void)
+{
+ tick_nohz_user_enter_prepare();
+ task_isol_exit_to_user_prepare();
+}
+
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
{
@@ -178,8 +184,7 @@ static unsigned long exit_to_user_mode_l
*/
local_irq_disable_exit_to_user();

- /* Check if any of the above work has queued a deferred wakeup */
- tick_nohz_user_enter_prepare();
+ exit_to_user_update_work();

ti_work = read_thread_flags();
}
@@ -194,8 +199,7 @@ static void exit_to_user_mode_prepare(st

lockdep_assert_irqs_disabled();

- /* Flush pending rcuog wakeup before the last need_resched() check */
- tick_nohz_user_enter_prepare();
+ exit_to_user_update_work();

if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);

2022-05-04 17:07:20

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

On Wed, Apr 27, 2022 at 02:09:16PM +0200, Thomas Gleixner wrote:
> On Wed, Apr 27 2022 at 09:11, Thomas Gleixner wrote:
> > On Tue, Mar 15 2022 at 12:31, Marcelo Tosatti wrote:
> >> If a thread has task isolation activated, is preempted by thread B,
> >> which marks vmstat information dirty, and is preempted back in,
> >> one might return to userspace with vmstat dirty information on the
> >> CPU in question.
> >>
> >> To address this problem, add a preempt notifier that transfers vmstat dirty
> >> information to TIF_TASK_ISOL thread flag.
> >
> > How does this compile with CONFIG_KVM=n?
>
> Aside of that, the existance of this preempt notifier alone tells me
> that this is either a design fail or has no design in the first place.
>
> The state of vmstat does not matter at all at the point where a task is
> scheduled in. It matters when an isolated task goes out to user space or
> enters a VM.

If the following happens, with two threads with names that mean whether
a thread has task isolation enabled or not:

Thread-no-task-isol, Thread-task-isol.

Events:

not-runnable Thread-task-isol
runnable Thread-no-task-isol
marks vmstat dirty Thread-no-task-isol (writes to some per-CPU vmstat
counter)
not-runnable Thread-no-task-isol
runnable Thread-task-isol

Then we have to transfer the "vmstat dirty" information from per-CPU
bool to per-thread TIF_TASK_ISOL bit (so that the
task_isolation_process_work thing executes on return to userspace).

> We already have something similar in the exit to user path:
>
> tick_nohz_user_enter_prepare()
>
> So you can do something like the below and have:
>
> static inline void task_isol_exit_to_user_prepare(void)
> {
> if (unlikely(current_needs_isol_exit_to_user())
> __task_isol_exit_to_user_prepare();
> }
>
> where current_needs_isol_exit_to_user() is a simple check of either an
> existing mechanism like
>
> task->syscall_work & SYSCALL_WORK_TASK_ISOL_EXIT
>
> or of some new task isolation specific member of task_struct which is
> placed so it is cache hot at that point:
>
> task->isol_work & SYSCALL_TASK_ISOL_EXIT
>
> which is going to be almost zero overhead for any non isolated task.

Sure, but who sets SYSCALL_WORK_TASK_ISOL_EXIT or SYSCALL_TASK_ISOL_EXIT ?

> It's trivial enough to encode the real stuff into task->isol_work and
> I'm pretty sure, that a 32bit member is sufficient for that. There is
> absolutely no need for a potential 64x64 bit feature matrix.

Well, OK, the meaning of TIF_TASK_ISOL thread flag is ambiguous:

1) We set it when quiescing vmstat feature of task isolation.
2) We set it when switching between tasks A and B, B has
task isolation configured and activated, and per-CPU vmstat information
was dirty.
3) We clear it on return to userspace:

if (test_bit(TIF_TASK_ISOL, thread->flags)) {
clear_bit(TIF_TASK_ISOL, thread->flags);
process_task_isol_work();
}

So you prefer to separate:

Use TIF_TASK_ISOL for "task isolation configured and activated,
quiesce vmstat work on return to userspace" only, and then have
the "is vmstat per-CPU data dirty?" information held on
task->syscall_work or task->isol_work ? (that will probably be two
cachelines).

You'd still need the preempt notifier, though (unless i am missing
something).

Happy with either case.

Thanks for the review!

> Thanks,
>
> tglx
> ---
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -142,6 +142,12 @@ void noinstr exit_to_user_mode(void)
> /* Workaround to allow gradual conversion of architecture code */
> void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
>
> +static void exit_to_user_update_work(void)
> +{
> + tick_nohz_user_enter_prepare();
> + task_isol_exit_to_user_prepare();
> +}
> +
> static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> unsigned long ti_work)
> {
> @@ -178,8 +184,7 @@ static unsigned long exit_to_user_mode_l
> */
> local_irq_disable_exit_to_user();
>
> - /* Check if any of the above work has queued a deferred wakeup */
> - tick_nohz_user_enter_prepare();
> + exit_to_user_update_work();
>
> ti_work = read_thread_flags();
> }
> @@ -194,8 +199,7 @@ static void exit_to_user_mode_prepare(st
>
> lockdep_assert_irqs_disabled();
>
> - /* Flush pending rcuog wakeup before the last need_resched() check */
> - tick_nohz_user_enter_prepare();
> + exit_to_user_update_work();
>
> if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
> ti_work = exit_to_user_mode_loop(regs, ti_work);
>
>


2022-05-06 06:45:30

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v12 09/13] task isolation: add preempt notifier to sync per-CPU vmstat dirty info to thread info

On Wed, May 04 2022 at 13:32, Marcelo Tosatti wrote:
> On Wed, Apr 27, 2022 at 02:09:16PM +0200, Thomas Gleixner wrote:
>> Aside of that, the existence of this preempt notifier alone tells me
>> that this is either a design fail or has no design in the first place.
>>
>> The state of vmstat does not matter at all at the point where a task is
>> scheduled in. It matters when an isolated task goes out to user space or
>> enters a VM.
>
> If the following happens, with two threads with names that mean whether
> a thread has task isolation enabled or not:
>
> Thread-no-task-isol, Thread-task-isol.
>
> Events:
>
> not-runnable Thread-task-isol
> runnable Thread-task-no-isol
> marks vmstat dirty Thread-task-no-isol (writes to some per-CPU vmstat
> counter)
> not-runnable Thread-task-no-isol
> runnable Thread-task-isol
>
> Then we have to transfer the "vmstat dirty" information from per-CPU
> bool to per-thread TIF_TASK_ISOL bit (so that the
> task_isolation_process_work thing executes on return to userspace).

That's absolute nonsense.

sched_out() isolated task
vmstat_dirty()
this_cpu_or(isolwork, VMSTAT);
sched_in() isolated task

return_to_user()
local_irq_disable();
exit_to_user_update_work()
task_isol_exit_to_user_prepare()
if (!isolated_task())
return;
if (this_cpu_read(isolwork) & current->isol_work_mask)
set_thread_flag(TIF_ISOL);

exit_to_user_mode_loop()
do {
local_irq_enable();
handle_TIF_bits();
local_irq_disable();
exit_to_user_update_work();
work = read_thread_flags();
} while (work & EXIT_WORK);

Solves the problem nicely with a minimal overhead for non-isolated
tasks.

Plus some of these isolwork bits could even be handled _after_ returning
from exit_to_user_mode_loop() if they are good to be done in irq
disabled context.

> Sure, but who sets SYSCALL_TASK_ISOL_EXIT or SYSCALL_TASK_ISOL_EXIT ?

It's set once by the prctl() when an isolation feature is enabled for a
task and it's cleared by the prctl() when the last isolation feature is
disabled for the task.

That's then used in:

static inline bool isolated_task()
{
return current->XXXX_work & TASK_ISOL_EXIT;
}

IOW, the return to user path has

- _ONE_ extra cache hot conditional for non-isolated tasks.

- _ONE_ central place to transform the per cpu isolation muck into
the TIF flag.

See? No sprinkling of TIF bits, no preempt notifiers, nothing.

> Use TIF_TASK_ISOL for "task isolation configured and activated,
> quiesce vmstat work on return to userspace" only, and then have
> the "is vmstat per-CPU data dirty?" information held on
> task->syscall_work or task->isol_work ? (that will be probably be two
> cachelines).

See above.

> You'd still need the preempt notifier, though (unless i am missing
> something).

Yes, see above.

Using a preempt notifier is a design fail because it tags information at
a place where this information is absolutely irrelevant and subject to
change.

Aside of that this information is not a task property. vmstat_is_dirty
is a per CPU property. The only point where this per CPU property is
relevant for a task is when the task is isolated and goes out to user
space or enters a VM.

Trying to carry this information in a task flag is fundamentally wrong
for obvious reasons and causes pointless overhead and complexity for
absolutely no value.

Thanks,

tglx