2022-03-16 15:10:12

by Marcelo Tosatti

[permalink] [raw]
Subject: [patch v12 05/13] task isolation: sync vmstats on return to userspace

The logic to disable vmstat worker thread, when entering
nohz full, does not cover all scenarios. For example, it is possible
for the following to happen:

1) enter nohz_full, which calls refresh_cpu_vm_stats, syncing the stats.
2) app runs mlock, which increases counters for mlock'ed pages.
3) start -RT loop

Since refresh_cpu_vm_stats from nohz_full logic can happen _before_
the mlock, vmstat shepherd can restart vmstat worker thread on
the CPU in question.

To fix this, use the task isolation prctl interface to quiesce
deferred actions when returning to userspace.

This patch adds hooks to fork and exit code paths.

Signed-off-by: Marcelo Tosatti <[email protected]>

---
v12: set TIF_TASK_ISOL only when necessary (Frederic)
v11: fold patch to add task_isol_exit hooks (Frederic)
Use _TIF_TASK_ISOL bit on thread flags (Frederic)

v6: modify exit_to_user_mode_loop to cover exceptions and interrupts
v5: no changes
v4: add oneshot mode support

include/linux/task_isolation.h | 16 ++++++++++++++++
include/linux/vmstat.h | 8 ++++++++
kernel/entry/common.c | 15 +++++++++++----
kernel/task_isolation.c | 21 +++++++++++++++++++++
mm/vmstat.c | 21 +++++++++++++++++++++
5 files changed, 77 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/task_isolation.h
===================================================================
--- linux-2.6.orig/include/linux/task_isolation.h
+++ linux-2.6/include/linux/task_isolation.h
@@ -27,6 +27,13 @@ static inline void task_isol_free(struct
__task_isol_free(tsk);
}

+void __task_isol_exit(struct task_struct *tsk);
+static inline void task_isol_exit(struct task_struct *tsk)
+{
+ if (tsk->task_isol_info)
+ __task_isol_exit(tsk);
+}
+
int prctl_task_isol_feat_get(unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5);
int prctl_task_isol_cfg_get(unsigned long arg2, unsigned long arg3,
@@ -40,12 +47,22 @@ int prctl_task_isol_activate_set(unsigne

int __copy_task_isol(struct task_struct *tsk);

+void task_isol_exit_to_user_mode(void);
+
#else

+static inline void task_isol_exit_to_user_mode(void)
+{
+}
+
static inline void task_isol_free(struct task_struct *tsk)
{
}

+static inline void task_isol_exit(struct task_struct *tsk)
+{
+}
+
static inline int prctl_task_isol_feat_get(unsigned long arg2,
unsigned long arg3,
unsigned long arg4,
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -21,6 +21,14 @@ int sysctl_vm_numa_stat_handler(struct c
void *buffer, size_t *length, loff_t *ppos);
#endif

+#if defined(CONFIG_SMP) && defined(CONFIG_TASK_ISOLATION)
+void sync_vmstat(void);
+#else
+static inline void sync_vmstat(void)
+{
+}
+#endif
+
struct reclaim_stat {
unsigned nr_dirty;
unsigned nr_unqueued_dirty;
Index: linux-2.6/kernel/entry/common.c
===================================================================
--- linux-2.6.orig/kernel/entry/common.c
+++ linux-2.6/kernel/entry/common.c
@@ -6,6 +6,7 @@
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
+#include <linux/task_isolation.h>

#include "common.h"

@@ -174,6 +175,9 @@ static unsigned long exit_to_user_mode_l
if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(regs);

+ if (ti_work & _TIF_TASK_ISOL)
+ task_isol_exit_to_user_mode();
+
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);

Index: linux-2.6/kernel/task_isolation.c
===================================================================
--- linux-2.6.orig/kernel/task_isolation.c
+++ linux-2.6/kernel/task_isolation.c
@@ -18,6 +18,12 @@
#include <linux/sysfs.h>
#include <linux/init.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+
+void __task_isol_exit(struct task_struct *tsk)
+{
+}

void __task_isol_free(struct task_struct *tsk)
{
@@ -251,6 +257,11 @@ static int cfg_feat_quiesce_set(unsigned
info->quiesce_mask = i_qctrl->quiesce_mask;
info->oneshot_mask = i_qctrl->quiesce_oneshot_mask;
info->conf_mask |= ISOL_F_QUIESCE;
+
+ if ((info->active_mask & ISOL_F_QUIESCE) &&
+ (info->quiesce_mask & ISOL_F_QUIESCE_VMSTATS))
+ set_thread_flag(TIF_TASK_ISOL);
+
ret = 0;

out_free:
@@ -303,6 +314,7 @@ int __copy_task_isol(struct task_struct
new_info->active_mask = info->active_mask;

tsk->task_isol_info = new_info;
+ set_ti_thread_flag(task_thread_info(tsk), TIF_TASK_ISOL);

return 0;
}
@@ -330,6 +342,10 @@ int prctl_task_isol_activate_set(unsigne
info->active_mask = active_mask;
ret = 0;

+ if ((info->active_mask & ISOL_F_QUIESCE) &&
+ (info->quiesce_mask & ISOL_F_QUIESCE_VMSTATS))
+ set_thread_flag(TIF_TASK_ISOL);
+
out:
return ret;
}
@@ -349,3 +365,24 @@ int prctl_task_isol_activate_get(unsigne

return 0;
}
+
+void task_isol_exit_to_user_mode(void)
+{
+ struct task_isol_info *i;
+
+ clear_thread_flag(TIF_TASK_ISOL);
+
+ i = current->task_isol_info;
+ if (!i)
+ return;
+
+ if (i->active_mask != ISOL_F_QUIESCE)
+ return;
+
+ if (i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS) {
+ sync_vmstat();
+ if (i->oneshot_mask & ISOL_F_QUIESCE_VMSTATS)
+ i->quiesce_mask &= ~ISOL_F_QUIESCE_VMSTATS;
+ }
+}
+EXPORT_SYMBOL_GPL(task_isol_exit_to_user_mode);
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -2018,6 +2018,29 @@ static void vmstat_shepherd(struct work_
round_jiffies_relative(sysctl_stat_interval));
}

+#ifdef CONFIG_TASK_ISOLATION
+void sync_vmstat(void)
+{
+ int cpu;
+
+ cpu = get_cpu();
+
+ refresh_cpu_vm_stats(false);
+ put_cpu();
+
+ /*
+ * If task is migrated to another CPU between put_cpu
+ * and cancel_delayed_work_sync, the code below might
+ * cancel vmstat_update work for a different cpu
+ * (than the one from which the vmstats were flushed).
+ *
+ * However, vmstat shepherd will re-enable it later,
+ * so its harmless.
+ */
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+}
+#endif
+
static void __init start_shepherd_timer(void)
{
int cpu;
Index: linux-2.6/include/linux/entry-common.h
===================================================================
--- linux-2.6.orig/include/linux/entry-common.h
+++ linux-2.6/include/linux/entry-common.h
@@ -60,7 +60,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
- ARCH_EXIT_TO_USER_MODE_WORK)
+ _TIF_TASK_ISOL | ARCH_EXIT_TO_USER_MODE_WORK)

/**
* arch_check_user_regs - Architecture specific sanity check for user mode regs
Index: linux-2.6/kernel/exit.c
===================================================================
--- linux-2.6.orig/kernel/exit.c
+++ linux-2.6/kernel/exit.c
@@ -64,6 +64,7 @@
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
+#include <linux/task_isolation.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -759,6 +760,7 @@ void __noreturn do_exit(long code)
validate_creds_for_do_exit(tsk);

io_uring_files_cancel();
+ task_isol_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */

/* sync mm's RSS info before statistics gathering */
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -2427,6 +2427,7 @@ bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_task_isol:
+ task_isol_exit(p);
task_isol_free(p);
bad_fork_cleanup_thread:
exit_thread(p);



2022-04-25 23:52:07

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v12 05/13] task isolation: sync vmstats on return to userspace

On Tue, Mar 15 2022 at 12:31, Marcelo Tosatti wrote:
> The logic to disable vmstat worker thread, when entering
> nohz full, does not cover all scenarios. For example, it is possible
> for the following to happen:
>
> 1) enter nohz_full, which calls refresh_cpu_vm_stats, syncing the stats.
> 2) app runs mlock, which increases counters for mlock'ed pages.
> 3) start -RT loop
>
> Since refresh_cpu_vm_stats from nohz_full logic can happen _before_
> the mlock, vmstat shepherd can restart vmstat worker thread on
> the CPU in question.
>
> To fix this, use the task isolation prctl interface to quiesce
> deferred actions when returning to userspace.
>
> This patch adds hooks to fork and exit code paths.

git grep 'This patch' Documentation/process/

> +void __task_isol_exit(struct task_struct *tsk);
> +static inline void task_isol_exit(struct task_struct *tsk)

I assume the amount of new lines per patch is restricted somehow, right?

Glueing the __task_isol_exit() declaration to the definition of
task_isol_exit() is just annoyingly disturbing the reading flow.

New lines exist for a reason.

> +{
> + if (tsk->task_isol_info)
> + __task_isol_exit(tsk);
> +}
> #else

but ...

> +static inline void task_isol_exit_to_user_mode(void)
> +{
> +}
> +
> static inline void task_isol_free(struct task_struct *tsk)
> {
> }
>
> +static inline void task_isol_exit(struct task_struct *tsk)
> +{
> +}
> +

here you use plenty of them where it does not matter at all....
What's wrong with:

static inline void task_isol_exit_to_user_mode(void) { }
static inline void task_isol_free(struct task_struct *tsk) { }
static inline void task_isol_exit(struct task_struct *tsk) { }

and spending at least one of the saved newlines for separating the
above:

+ void __task_isol_exit(struct task_struct *tsk);
+
+ static inline void task_isol_exit(struct task_struct *tsk)

Hmm?

> @@ -251,6 +257,11 @@ static int cfg_feat_quiesce_set(unsigned
> info->quiesce_mask = i_qctrl->quiesce_mask;
> info->oneshot_mask = i_qctrl->quiesce_oneshot_mask;
> info->conf_mask |= ISOL_F_QUIESCE;
> +
> + if ((info->active_mask & ISOL_F_QUIESCE) &&
> + (info->quiesce_mask & ISOL_F_QUIESCE_VMSTATS))
> + set_thread_flag(TIF_TASK_ISOL);

Yet more hard coded special purpose muck. Plus the proof of the
inconsistency I described before...

> +void task_isol_exit_to_user_mode(void)
> +{
> + struct task_isol_info *i;

*i is really a descriptive variable name. Is this supposed to be
submitted to the convoluted C-code contest?

Dammit, we are not short of characters here and 'i' is generally used as
iterator variable which is hardly of type struct task_isol_info *.

> + clear_thread_flag(TIF_TASK_ISOL);

What? See below....

> + i = current->task_isol_info;
> + if (!i)
> + return;

That really makes sense. Why can a task which has TIF_TASK_ISOL set,
have current->task_isol_info != NULL?

I'm all for defensive programming, but if you really want to check this
then this should be:

isol_info = current->task_isol_info;
if (WARN_ON_ONCE(!isol_info))
return;
No?

> + if (i->active_mask != ISOL_F_QUIESCE)
> + return;

Yay, more future proof hard coding!

> + if (i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS) {
> + sync_vmstat();
> + if (i->oneshot_mask & ISOL_F_QUIESCE_VMSTATS)
> + i->quiesce_mask &= ~ISOL_F_QUIESCE_VMSTATS;

The point of this exercise is?

To clear quiesce_mask because this code path cannot be reached anymore
due to TIF_TASK_ISOL being cleared above.

Of course the active vs. no subfeature configured inconsistency is
preserved here for consistency reasons. At least something which is
consistent.

> /**
> * arch_check_user_regs - Architecture specific sanity check for user mode regs
> Index: linux-2.6/kernel/exit.c
> ===================================================================
> --- linux-2.6.orig/kernel/exit.c
> +++ linux-2.6/kernel/exit.c
> @@ -64,6 +64,7 @@
> #include <linux/compat.h>
> #include <linux/io_uring.h>
> #include <linux/kprobes.h>
> +#include <linux/task_isolation.h>
>
> #include <linux/uaccess.h>
> #include <asm/unistd.h>
> @@ -759,6 +760,7 @@ void __noreturn do_exit(long code)
> validate_creds_for_do_exit(tsk);
>
> io_uring_files_cancel();
> + task_isol_exit(tsk);

The purpose of this is?

> +static inline void task_isol_exit(struct task_struct *tsk)
> +{
> + if (tsk->task_isol_info)
> + __task_isol_exit(tsk);
> +}

and

>+ void __task_isol_exit(struct task_struct *tsk)
>+ {
>+ }

Makes a lot of sense and is thoroughly explained in the changelog and
comments....

Thanks,

tglx

2022-04-27 10:12:26

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch v12 05/13] task isolation: sync vmstats on return to userspace

On Tue, Mar 15 2022 at 12:31, Marcelo Tosatti wrote:
> --- linux-2.6.orig/kernel/entry/common.c
> +++ linux-2.6/kernel/entry/common.c
>
> @@ -174,6 +175,9 @@ static unsigned long exit_to_user_mode_l
> if (ti_work & _TIF_NOTIFY_RESUME)
> tracehook_notify_resume(regs);
>
> + if (ti_work & _TIF_TASK_ISOL)
> + task_isol_exit_to_user_mode();
> +
> /* Architecture specific TIF work */
> arch_exit_to_user_mode_work(regs, ti_work);

> --- linux-2.6.orig/include/linux/entry-common.h
> +++ linux-2.6/include/linux/entry-common.h
> @@ -60,7 +60,7 @@
> #define EXIT_TO_USER_MODE_WORK \
> (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
> _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
> - ARCH_EXIT_TO_USER_MODE_WORK)
> + _TIF_TASK_ISOL | ARCH_EXIT_TO_USER_MODE_WORK)

How is this supposed to compile when _TIF_TASK_ISOL is not defined by an
architecture?

Hint: Search for _TIF_PATCH_PENDING in the very same header file.

Thanks,

tglx