2021-07-27 10:46:14

by Marcelo Tosatti

[permalink] [raw]
Subject: [patch 1/4] add basic task isolation prctl interface

Add basic prctl task isolation interface, which allows
informing the kernel that application is executing
latency sensitive code (where interruptions are undesired).

Interface is described by task_isolation.rst (added by this patch).

Signed-off-by: Marcelo Tosatti <[email protected]>

Index: linux-2.6-vmstat-update/Documentation/userspace-api/task_isolation.rst
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/Documentation/userspace-api/task_isolation.rst
@@ -0,0 +1,52 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============================
+Task isolation prctl interface
+=============================
+
+Set thread isolation mode and parameters, which allows
+informing the kernel that application is
+executing latency sensitive code (where interruptions
+are undesired).
+
+Its composed of 4 prctl commands (passed as arg1 to
+prctl):
+
+PR_ISOL_SET: set isolation parameters for the task
+
+PR_ISOL_GET: get isolation parameters for the task
+
+PR_ISOL_ENTER: indicate that task should be considered
+ isolated from this point on
+
+PR_ISOL_EXIT: indicate that task should not be considered
+ isolated from this point on
+
+The isolation parameters and mode are not inherited by
+children created by fork(2) and clone(2). The setting is
+preserved across execve(2).
+
+The meaning of isolated is specified as follows, when setting arg2 to
+PR_ISOL_SET or PR_ISOL_GET, with the following arguments passed as arg3.
+
+Isolation mode (PR_ISOL_MODE):
+------------------------------
+
+- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
+ PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
+
+- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
+ and in case of interruption events, the notifications can be collected
+ by BPF programs.
+ In this mode, if system calls are performed, deferred actions initiated
+ by the system call will be executed before return to userspace.
+
+Other modes, which for example send signals upon interruptions events,
+can be implemented.
+
+Example
+=======
+
+The ``samples/task_isolation/`` directory contains a sample
+application.
+
Index: linux-2.6-vmstat-update/include/uapi/linux/prctl.h
===================================================================
--- linux-2.6-vmstat-update.orig/include/uapi/linux/prctl.h
+++ linux-2.6-vmstat-update/include/uapi/linux/prctl.h
@@ -267,4 +267,13 @@ struct prctl_mm_map {
# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
# define PR_SCHED_CORE_MAX 4

+/* Task isolation control */
+#define PR_ISOL_SET 62
+#define PR_ISOL_GET 63
+#define PR_ISOL_ENTER 64
+#define PR_ISOL_EXIT 65
+# define PR_ISOL_MODE 1
+
+# define PR_ISOL_MODE_NONE 0
+# define PR_ISOL_MODE_NORMAL 1
#endif /* _LINUX_PRCTL_H */
Index: linux-2.6-vmstat-update/kernel/Makefile
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/Makefile
+++ linux-2.6-vmstat-update/kernel/Makefile
@@ -132,6 +132,8 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o

+obj-$(CONFIG_CPU_ISOLATION) += task_isolation.o
+
CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
Index: linux-2.6-vmstat-update/kernel/sys.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/sys.c
+++ linux-2.6-vmstat-update/kernel/sys.c
@@ -58,6 +58,7 @@
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
+#include <linux/task_isolation.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
@@ -2567,6 +2568,18 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_ISOL_SET:
+ error = prctl_task_isolation_set(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_GET:
+ error = prctl_task_isolation_get(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_ENTER:
+ error = prctl_task_isolation_enter(arg2, arg3, arg4, arg5);
+ break;
+ case PR_ISOL_EXIT:
+ error = prctl_task_isolation_exit(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
Index: linux-2.6-vmstat-update/samples/task_isolation/task_isolation.c
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/samples/task_isolation/task_isolation.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+int main(void)
+{
+ int ret;
+ void *buf = malloc(4096);
+
+ memset(buf, 1, 4096);
+ ret = mlock(buf, 4096);
+ if (ret) {
+ perror("mlock");
+ return EXIT_FAILURE;
+ }
+
+ ret = prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0);
+ if (ret == -1) {
+ perror("prctl PR_ISOL_SET");
+ return EXIT_FAILURE;
+ }
+
+ ret = prctl(PR_ISOL_ENTER, 0, 0, 0, 0);
+ if (ret == -1) {
+ perror("prctl PR_ISOL_ENTER");
+ exit(0);
+ }
+
+ /* busy loop */
+ while (ret < 99999999) {
+ memset(buf, 0, 10);
+ ret = ret+1;
+ }
+
+ ret = prctl(PR_ISOL_EXIT, 0, 0, 0, 0);
+ if (ret == -1) {
+ perror("prctl PR_ISOL_EXIT");
+ return EXIT_FAILURE;
+ }
+
+ return EXIT_SUCCESS;
+}
+
Index: linux-2.6-vmstat-update/include/linux/sched.h
===================================================================
--- linux-2.6-vmstat-update.orig/include/linux/sched.h
+++ linux-2.6-vmstat-update/include/linux/sched.h
@@ -66,6 +66,7 @@ struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct isol_info;

/*
* Task state bitmask. NOTE! These bits are also
@@ -1400,6 +1401,10 @@ struct task_struct {
struct llist_head kretprobe_instances;
#endif

+#ifdef CONFIG_CPU_ISOLATION
+ struct isol_info *isol_info;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
Index: linux-2.6-vmstat-update/init/init_task.c
===================================================================
--- linux-2.6-vmstat-update.orig/init/init_task.c
+++ linux-2.6-vmstat-update/init/init_task.c
@@ -213,6 +213,9 @@ struct task_struct init_task
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
+#ifdef CONFIG_CPU_ISOLATION
+ .isol_info = NULL,
+#endif
};
EXPORT_SYMBOL(init_task);

Index: linux-2.6-vmstat-update/kernel/fork.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/fork.c
+++ linux-2.6-vmstat-update/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/task_isolation.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -734,6 +735,7 @@ void __put_task_struct(struct task_struc
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);

+ tsk_isol_exit(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
@@ -2084,7 +2086,9 @@ static __latent_entropy struct task_stru
#ifdef CONFIG_BPF_SYSCALL
RCU_INIT_POINTER(p->bpf_storage, NULL);
#endif
-
+#ifdef CONFIG_CPU_ISOLATION
+ p->isol_info = NULL;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
Index: linux-2.6-vmstat-update/include/linux/task_isolation.h
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/include/linux/task_isolation.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_TASK_ISOL_H
+#define __LINUX_TASK_ISOL_H
+
+#ifdef CONFIG_CPU_ISOLATION
+
+struct isol_info {
+ u8 mode;
+ u8 active;
+};
+
+extern void __tsk_isol_exit(struct task_struct *tsk);
+
+static inline void tsk_isol_exit(struct task_struct *tsk)
+{
+ if (tsk->isol_info)
+ __tsk_isol_exit(tsk);
+}
+
+int prctl_task_isolation_get(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_set(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_enter(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+
+int prctl_task_isolation_exit(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5);
+
+
+#else
+
+static inline void tsk_isol_exit(struct task_struct *tsk)
+{
+}
+
+
+static inline int prctl_task_isolation_get(unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_set(unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_enter(unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int prctl_task_isolation_exit(unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5)
+{
+ return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_CPU_ISOLATION */
+
+#endif /* __LINUX_TASK_ISOL_H */
Index: linux-2.6-vmstat-update/kernel/task_isolation.c
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/kernel/task_isolation.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implementation of task isolation.
+ *
+ * Authors:
+ * Chris Metcalf <[email protected]>
+ * Alex Belits <[email protected]>
+ * Yuri Norov <[email protected]>
+ * Marcelo Tosatti <[email protected]>
+ */
+
+#include <linux/sched.h>
+#include <linux/task_isolation.h>
+#include <linux/prctl.h>
+#include <linux/slab.h>
+
+static int tsk_isol_alloc_context(struct task_struct *task)
+{
+ struct isol_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (unlikely(!info))
+ return -ENOMEM;
+
+ task->isol_info = info;
+ return 0;
+}
+
+void __tsk_isol_exit(struct task_struct *tsk)
+{
+ kfree(tsk->isol_info);
+ tsk->isol_info = NULL;
+}
+
+int prctl_task_isolation_get(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (arg2 != PR_ISOL_MODE)
+ return -EOPNOTSUPP;
+
+ if (current->isol_info != NULL)
+ return current->isol_info->mode;
+
+ return PR_ISOL_MODE_NONE;
+}
+
+
+int prctl_task_isolation_set(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ int ret;
+
+ if (arg2 != PR_ISOL_MODE)
+ return -EOPNOTSUPP;
+
+ if (arg3 != PR_ISOL_MODE_NORMAL)
+ return -EINVAL;
+
+ ret = tsk_isol_alloc_context(current);
+ if (ret)
+ return ret;
+
+ current->isol_info->mode = arg3;
+ return 0;
+}
+
+int prctl_task_isolation_enter(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+
+ if (current->isol_info == NULL)
+ return -EINVAL;
+
+ if (current->isol_info->mode != PR_ISOL_MODE_NORMAL)
+ return -EINVAL;
+
+ current->isol_info->active = 1;
+
+ return 0;
+}
+
+int prctl_task_isolation_exit(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ if (current->isol_info == NULL)
+ return -EINVAL;
+
+ if (current->isol_info->mode != PR_ISOL_MODE_NORMAL)
+ return -EINVAL;
+
+ current->isol_info->active = 0;
+
+ return 0;
+}
+
+
Index: linux-2.6-vmstat-update/samples/Kconfig
===================================================================
--- linux-2.6-vmstat-update.orig/samples/Kconfig
+++ linux-2.6-vmstat-update/samples/Kconfig
@@ -223,4 +223,11 @@ config SAMPLE_WATCH_QUEUE
Build example userspace program to use the new mount_notify(),
sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.

+config SAMPLE_TASK_ISOLATION
+ bool "task isolation sample"
+ depends on CC_CAN_LINK && HEADERS_INSTALL
+ help
+ Build example userspace program to use prctl task isolation
+ interface.
+
endif # SAMPLES
Index: linux-2.6-vmstat-update/samples/Makefile
===================================================================
--- linux-2.6-vmstat-update.orig/samples/Makefile
+++ linux-2.6-vmstat-update/samples/Makefile
@@ -30,3 +30,4 @@ obj-$(CONFIG_SAMPLE_INTEL_MEI) += mei/
subdir-$(CONFIG_SAMPLE_WATCHDOG) += watchdog
subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/
+subdir-$(CONFIG_SAMPLE_TASK_ISOLATION) += task_isolation
Index: linux-2.6-vmstat-update/samples/task_isolation/Makefile
===================================================================
--- /dev/null
+++ linux-2.6-vmstat-update/samples/task_isolation/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+userprogs-always-y += task_isolation
+
+userccflags += -I usr/include




2021-07-27 10:52:29

by Nicolas Saenz Julienne

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, 2021-07-27 at 07:38 -0300, Marcelo Tosatti wrote:
> +Isolation mode (PR_ISOL_MODE):
> +------------------------------
> +
> +- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
> + PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
> +
> +- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
> + and in case of interruption events, the notifications can be collected
> + by BPF programs.
> + In this mode, if system calls are performed, deferred actions initiated
> + by the system call will be executed before return to userspace.
> +
> +Other modes, which for example send signals upon interruptions events,
> +can be implemented.

Shouldn't this be a set of flags that enable specific isolation features?
Something the likes of 'PR_ISOL_QUIESCE_ON_EXIT'. Modes seem more restrictive
and too much of a commitment. If we merge MODE_NORMAL as is, we won't be able
to tweak/extend its behaviour in the future.

--
Nicolás Sáenz


2021-07-27 11:04:59

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 12:48:33PM +0200, [email protected] wrote:
> On Tue, 2021-07-27 at 07:38 -0300, Marcelo Tosatti wrote:
> > +Isolation mode (PR_ISOL_MODE):
> > +------------------------------
> > +
> > +- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
> > + PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
> > +
> > +- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
> > + and in case of interruption events, the notifications can be collected
> > + by BPF programs.
> > + In this mode, if system calls are performed, deferred actions initiated
> > + by the system call will be executed before return to userspace.
> > +
> > +Other modes, which for example send signals upon interruptions events,
> > +can be implemented.
>
> Shouldn't this be a set of flags that enable specific isolation features?
> Something the likes of 'PR_ISOL_QUIESCE_ON_EXIT'. Modes seem more restrictive
> and too much of a commitment. If we merge MODE_NORMAL as is, we won't be able
> to tweak/extend its behaviour in the future.

Hi Nicolas,

Well, its assuming PR_ISOL_MODE_NORMAL means "enable all isolation
features on return to userspace".

Later on, if desired, can add extend interface as follows (using
Christoph's idea to not perform automatic quiesce on return to
userspace, but expose which parts need quiescing
so userspace can do it on its own, as an example):

#define PR_ISOL_QUIESCE_ON_EXIT (1<<0)
#define PR_ISOL_VSYSCALL_PAGE (1<<1)
...

unsigned long bitmap = PR_ISOL_VSYSCALL_PAGE;

/* allow system calls */
prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0, 0);

/*
* disable quiescing on exit, enable reporting through
* vsyscall page
*/
prctl(PR_ISOL_SET, PR_ISOL_FEATURES, &bitmap, 0, 0);
/*
* configure vsyscall page
*/
prctl(PR_ISOL_VSYSCALLS, params, ...);

So unless i am missing something, it is possible to tweak/extend the
interface. No?


2021-07-27 12:40:06

by Nicolas Saenz Julienne

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

Hi Marcelo,

On Tue, 2021-07-27 at 08:00 -0300, Marcelo Tosatti wrote:
> On Tue, Jul 27, 2021 at 12:48:33PM +0200, [email protected] wrote:
> > On Tue, 2021-07-27 at 07:38 -0300, Marcelo Tosatti wrote:
> > > +Isolation mode (PR_ISOL_MODE):
> > > +------------------------------
> > > +
> > > +- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
> > > + PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
> > > +
> > > +- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
> > > + and in case of interruption events, the notifications can be collected
> > > + by BPF programs.
> > > + In this mode, if system calls are performed, deferred actions initiated
> > > + by the system call will be executed before return to userspace.
> > > +
> > > +Other modes, which for example send signals upon interruptions events,
> > > +can be implemented.
> >
> > Shouldn't this be a set of flags that enable specific isolation features?
> > Something the likes of 'PR_ISOL_QUIESCE_ON_EXIT'. Modes seem more restrictive
> > and too much of a commitment. If we merge MODE_NORMAL as is, we won't be able
> > to tweak/extend its behaviour in the future.
>
> Hi Nicolas,
>
> Well, its assuming PR_ISOL_MODE_NORMAL means "enable all isolation
> features on return to userspace".
>
> Later on, if desired, can add extend interface as follows (using
> Christoph's idea to not perform automatic quiesce on return to
> userspace, but expose which parts need quiescing
> so userspace can do it on its own, as an example):
>
> #define PR_ISOL_QUIESCE_ON_EXIT (1<<0)
> #define PR_ISOL_VSYSCALL_PAGE (1<<1)
> ...
>
> unsigned long bitmap = PR_ISOL_VSYSCALL_PAGE;
>
> /* allow system calls */
> prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0, 0);
>
> /*
> * disable quiescing on exit, enable reporting through
> * vsyscall page
> */
> prctl(PR_ISOL_SET, PR_ISOL_FEATURES, &bitmap, 0, 0);
> /*
> * configure vsyscall page
> */
> prctl(PR_ISOL_VSYSCALLS, params, ...);
>
> So unless i am missing something, it is possible to tweak/extend the
> interface. No?

OK, sorry if I'm being thick, but what is the benefit of having a distincnt
PR_ISOL_MODE instead expressing everything as PR_ISOL_FEATURES.

PR_ISOL_MODE_NONE == Empty PR_ISOL_FEATURES bitmap

PR_ISOL_MODE_NORMAL == Bitmap of commonly used PR_ISOL_FEATURES
(we could introduce a define)

PR_ISOL_MODE_NORMAL+PR_ISOL_VSYSCALLS == Custom bitmap

Other than that, my rationale is that if you extend PR_ISOL_MODE_NORMAL's
behaviour as new features are merged, wouldn't you be potentially breaking
userspace (i.e. older applications might not like the new default)?

--
Nicolás Sáenz


2021-07-27 13:10:21

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 02:38:15PM +0200, [email protected] wrote:
> Hi Marcelo,
>
> On Tue, 2021-07-27 at 08:00 -0300, Marcelo Tosatti wrote:
> > On Tue, Jul 27, 2021 at 12:48:33PM +0200, [email protected] wrote:
> > > On Tue, 2021-07-27 at 07:38 -0300, Marcelo Tosatti wrote:
> > > > +Isolation mode (PR_ISOL_MODE):
> > > > +------------------------------
> > > > +
> > > > +- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
> > > > + PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
> > > > +
> > > > +- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
> > > > + and in case of interruption events, the notifications can be collected
> > > > + by BPF programs.
> > > > + In this mode, if system calls are performed, deferred actions initiated
> > > > + by the system call will be executed before return to userspace.
> > > > +
> > > > +Other modes, which for example send signals upon interruptions events,
> > > > +can be implemented.
> > >
> > > Shouldn't this be a set of flags that enable specific isolation features?
> > > Something the likes of 'PR_ISOL_QUIESCE_ON_EXIT'. Modes seem more restrictive
> > > and too much of a commitment. If we merge MODE_NORMAL as is, we won't be able
> > > to tweak/extend its behaviour in the future.
> >
> > Hi Nicolas,
> >
> > Well, its assuming PR_ISOL_MODE_NORMAL means "enable all isolation
> > features on return to userspace".
> >
> > Later on, if desired, can add extend interface as follows (using
> > Christoph's idea to not perform automatic quiesce on return to
> > userspace, but expose which parts need quiescing
> > so userspace can do it on its own, as an example):
> >
> > #define PR_ISOL_QUIESCE_ON_EXIT (1<<0)
> > #define PR_ISOL_VSYSCALL_PAGE (1<<1)
> > ...
> >
> > unsigned long bitmap = PR_ISOL_VSYSCALL_PAGE;
> >
> > /* allow system calls */
> > prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0, 0);
> >
> > /*
> > * disable quiescing on exit, enable reporting through
> > * vsyscall page
> > */
> > prctl(PR_ISOL_SET, PR_ISOL_FEATURES, &bitmap, 0, 0);
> > /*
> > * configure vsyscall page
> > */
> > prctl(PR_ISOL_VSYSCALLS, params, ...);
> >
> > So unless i am missing something, it is possible to tweak/extend the
> > interface. No?
>
> OK, sorry if I'm being thick, but what is the benefit of having a distincnt
> PR_ISOL_MODE instead expressing everything as PR_ISOL_FEATURES.
>
> PR_ISOL_MODE_NONE == Empty PR_ISOL_FEATURES bitmap
>
> PR_ISOL_MODE_NORMAL == Bitmap of commonly used PR_ISOL_FEATURES
> (we could introduce a define)
>
> PR_ISOL_MODE_NORMAL+PR_ISOL_VSYSCALLS == Custom bitmap
>
> Other than that, my rationale is that if you extend PR_ISOL_MODE_NORMAL's
> behaviour as new features are merged, wouldn't you be potentially breaking
> userspace (i.e. older applications might not like the new default)?
>
> --
> Nicol?s S?enz

The main reason is that PR_ISOL_MODE would allow for distinct
modes to be implemented (matching each use case). For example:

https://lwn.net/Articles/816298/

"When a task has finished its initialization, it can activate isolation
by using the PR_TASK_ISOLATION operation provided by the prctl()
system call. This operation may fail for either permanent or temporary
reasons. An example of a permanent error is when the task is set up
on a CPU without isolation; in this case, entering isolation mode
is not possible. Temporary errors are indicated by the EAGAIN error
code; examples include a time when the delayed workqueues could not be
stopped. In such cases, the task may retry the operation if it wants to
enter isolation, as it may succeed the next time.

In the prctl() call, the developer may also configure the signal to be
sent to the task when it loses isolation. The additional macro to use is
PR_TASK_ISOLATION_SET_SIG(), passing it the signal to send. The command
then becomes similar to the one in the example code:"



2021-07-27 13:10:29

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 10:06:41AM -0300, Marcelo Tosatti wrote:
> On Tue, Jul 27, 2021 at 02:38:15PM +0200, [email protected] wrote:
> > Hi Marcelo,
> >
> > On Tue, 2021-07-27 at 08:00 -0300, Marcelo Tosatti wrote:
> > > On Tue, Jul 27, 2021 at 12:48:33PM +0200, [email protected] wrote:
> > > > On Tue, 2021-07-27 at 07:38 -0300, Marcelo Tosatti wrote:
> > > > > +Isolation mode (PR_ISOL_MODE):
> > > > > +------------------------------
> > > > > +
> > > > > +- PR_ISOL_MODE_NONE (arg4): no per-task isolation (default mode).
> > > > > + PR_ISOL_EXIT sets mode to PR_ISOL_MODE_NONE.
> > > > > +
> > > > > +- PR_ISOL_MODE_NORMAL (arg4): applications can perform system calls normally,
> > > > > + and in case of interruption events, the notifications can be collected
> > > > > + by BPF programs.
> > > > > + In this mode, if system calls are performed, deferred actions initiated
> > > > > + by the system call will be executed before return to userspace.
> > > > > +
> > > > > +Other modes, which for example send signals upon interruptions events,
> > > > > +can be implemented.
> > > >
> > > > Shouldn't this be a set of flags that enable specific isolation features?
> > > > Something the likes of 'PR_ISOL_QUIESCE_ON_EXIT'. Modes seem more restrictive
> > > > and too much of a commitment. If we merge MODE_NORMAL as is, we won't be able
> > > > to tweak/extend its behaviour in the future.
> > >
> > > Hi Nicolas,
> > >
> > > Well, its assuming PR_ISOL_MODE_NORMAL means "enable all isolation
> > > features on return to userspace".
> > >
> > > Later on, if desired, can add extend interface as follows (using
> > > Christoph's idea to not perform automatic quiesce on return to
> > > userspace, but expose which parts need quiescing
> > > so userspace can do it on its own, as an example):
> > >
> > > #define PR_ISOL_QUIESCE_ON_EXIT (1<<0)
> > > #define PR_ISOL_VSYSCALL_PAGE (1<<1)
> > > ...
> > >
> > > unsigned long bitmap = PR_ISOL_VSYSCALL_PAGE;
> > >
> > > /* allow system calls */
> > > prctl(PR_ISOL_SET, PR_ISOL_MODE, PR_ISOL_MODE_NORMAL, 0, 0, 0);
> > >
> > > /*
> > > * disable quiescing on exit, enable reporting through
> > > * vsyscall page
> > > */
> > > prctl(PR_ISOL_SET, PR_ISOL_FEATURES, &bitmap, 0, 0);
> > > /*
> > > * configure vsyscall page
> > > */
> > > prctl(PR_ISOL_VSYSCALLS, params, ...);
> > >
> > > So unless i am missing something, it is possible to tweak/extend the
> > > interface. No?
> >
> > OK, sorry if I'm being thick, but what is the benefit of having a distincnt
> > PR_ISOL_MODE instead expressing everything as PR_ISOL_FEATURES.
> >
> > PR_ISOL_MODE_NONE == Empty PR_ISOL_FEATURES bitmap
> >
> > PR_ISOL_MODE_NORMAL == Bitmap of commonly used PR_ISOL_FEATURES
> > (we could introduce a define)
> >
> > PR_ISOL_MODE_NORMAL+PR_ISOL_VSYSCALLS == Custom bitmap
> >
> > Other than that, my rationale is that if you extend PR_ISOL_MODE_NORMAL's
> > behaviour as new features are merged, wouldn't you be potentially breaking
> > userspace (i.e. older applications might not like the new default)?
> >
> > --
> > Nicol?s S?enz
>
> The main reason is that PR_ISOL_MODE would allow for distinct
> modes to be implemented (matching each use case). For example:
>
> https://lwn.net/Articles/816298/
>
> "When a task has finished its initialization, it can activate isolation
> by using the PR_TASK_ISOLATION operation provided by the prctl()
> system call. This operation may fail for either permanent or temporary
> reasons. An example of a permanent error is when the task is set up
> on a CPU without isolation; in this case, entering isolation mode
> is not possible. Temporary errors are indicated by the EAGAIN error
> code; examples include a time when the delayed workqueues could not be
> stopped. In such cases, the task may retry the operation if it wants to
> enter isolation, as it may succeed the next time.
>
> In the prctl() call, the developer may also configure the signal to be
> sent to the task when it loses isolation. The additional macro to use is
> PR_TASK_ISOLATION_SET_SIG(), passing it the signal to send. The command
> then becomes similar to the one in the example code:"

But have no strong preference: fine with PR_ISOL_FEATURES as you
describe above, and if that is the consensus, can resubmit.


2021-07-27 13:11:48

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 02:38:15PM +0200, [email protected] wrote:
> Hi Marcelo,
>
> On Tue, 2021-07-27 at 08:00 -0300, Marcelo Tosatti wrote:
> OK, sorry if I'm being thick, but what is the benefit of having a distincnt
> PR_ISOL_MODE instead expressing everything as PR_ISOL_FEATURES.
>
> PR_ISOL_MODE_NONE == Empty PR_ISOL_FEATURES bitmap
>
> PR_ISOL_MODE_NORMAL == Bitmap of commonly used PR_ISOL_FEATURES
> (we could introduce a define)
>
> PR_ISOL_MODE_NORMAL+PR_ISOL_VSYSCALLS == Custom bitmap
>
> Other than that, my rationale is that if you extend PR_ISOL_MODE_NORMAL's
> behaviour as new features are merged, wouldn't you be potentially breaking
> userspace (i.e. older applications might not like the new default)?

I agree with Nicolas, and that was Thomas request too.
Let's leave policy implementation to userspace and take
only the individual isolation features to the kernel.

CPU/Task isolation is a relatively young feature and many users don't
communicate much about their needs. We don't know exactly how finegrained
the ABI will need to be so let's not make too many high level assumptions.

It's easy for userspace to set all isolation bits by itself.

Besides, those bits will be implemented one by one over time, this
means that a prctl() bit saying "isolate everything" will have a different
behaviour as those features get integrated. And we really want well defined
behaviours.

Thanks.

2021-07-27 14:56:41

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 03:09:30PM +0200, Frederic Weisbecker wrote:
> On Tue, Jul 27, 2021 at 02:38:15PM +0200, [email protected] wrote:
> > Hi Marcelo,
> >
> > On Tue, 2021-07-27 at 08:00 -0300, Marcelo Tosatti wrote:
> > OK, sorry if I'm being thick, but what is the benefit of having a distincnt
> > PR_ISOL_MODE instead expressing everything as PR_ISOL_FEATURES.
> >
> > PR_ISOL_MODE_NONE == Empty PR_ISOL_FEATURES bitmap
> >
> > PR_ISOL_MODE_NORMAL == Bitmap of commonly used PR_ISOL_FEATURES
> > (we could introduce a define)
> >
> > PR_ISOL_MODE_NORMAL+PR_ISOL_VSYSCALLS == Custom bitmap
> >
> > Other than that, my rationale is that if you extend PR_ISOL_MODE_NORMAL's
> > behaviour as new features are merged, wouldn't you be potentially breaking
> > userspace (i.e. older applications might not like the new default)?
>
> I agree with Nicolas, and that was Thomas request too.
> Let's leave policy implementation to userspace and take
> only the individual isolation features to the kernel.
>
> CPU/Task isolation is a relatively young feature and many users don't
> communicate much about their needs. We don't know exactly how finegrained
> the ABI will need to be so let's not make too many high level assumptions.
>
> It's easy for userspace to set all isolation bits by itself.
>
> Besides, those bits will be implemented one by one over time, this
> means that a prctl() bit saying "isolate everything" will have a different
> behaviour as those features get integrated. And we really want well defined
> behaviours.
>
> Thanks.
>
>

OK, how about this:

...

The meaning of isolated is specified as follows:

Isolation features
==================

- prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
features as a return value.

- prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
the bitmask.

- prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
enabled features.

The supported features are:

ISOL_F_QUIESCE_ON_URET: quiesce deferred actions on return to userspace.
----------------------

Quiescing of different actions can be performed on return to userspace.

- prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0) returns
the supported actions to be quiesced.

- prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) returns
the currently supported actions to be quiesced.

- prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
the currently enabled actions to be quiesced.

#define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
#define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
#define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)
...


2021-07-27 23:46:43

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> The meaning of isolated is specified as follows:
>
> Isolation features
> ==================
>
> - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> features as a return value.
>
> - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> the bitmask.
>
> - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> enabled features.

So what are the ISOL_FEATURES here? A mode that we enter such as flush
vmstat _everytime_ we resume to userpace after (and including) this prctl() ?

If so I'd rather call that ISOL_MODE because feature is too general.

>
> The supported features are:
>
> ISOL_F_QUIESCE_ON_URET: quiesce deferred actions on return to userspace.
> ----------------------
>
> Quiescing of different actions can be performed on return to userspace.
>
> - prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0) returns
> the supported actions to be quiesced.
>
> - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) returns
> the currently supported actions to be quiesced.
>
> - prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
> the currently enabled actions to be quiesced.
>
> #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)

And then PR_ISOL_QUIESCE_CFG is a oneshot operation that applies only upon
return to this ctrl, right? If so perhaps this should be called just
ISOL_QUIESCE or ISOL_QUIESCE_ONCE or ISOL_REQ ?

But that's just naming debate because otherwise that prctl layout looks good
to me.

Thanks!

2021-07-28 09:58:54

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > The meaning of isolated is specified as follows:
> >
> > Isolation features
> > ==================
> >
> > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > features as a return value.
> >
> > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > the bitmask.
> >
> > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > enabled features.
>
> So what are the ISOL_FEATURES here? A mode that we enter such as flush
> vmstat _everytime_ we resume to userpace after (and including) this prctl() ?

ISOL_FEATURES is just the "command" type (which you can get and set).

The bitmask would include ISOL_F_QUIESCE_ON_URET, so:

- bitmask = ISOL_F_QUIESCE_ON_URET;
- prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
the bitmask.

- quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
(1)

(returns the supported actions to be quiesced).

- prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
the actions to be quiesced (2)

If an application does not modify "quiesce_bitmask" between
points (1) and (2) above, it will enable quiescing of all
"features" the kernel supports.

Application can, however, modify quiesce_bitmap to its preference.

Flushing vmstat _everytime_ you resume to userspace is enabled only
_after_ prctl(PR_ISOL_ENTER, 0, 0, 0, 0) is performed (which happens
only when isolation is fully configured with the PR_ISOL_SET calls).
OK, will better document that.

> If so I'd rather call that ISOL_MODE because feature is too general.

Well, in the first patchset, there was one "mode" implemented (but
it was possible to implement different modes in the future).

This would allow for example easier integration of "full task isolation"
patchset type of functionality, disallowing syscalls.

I think we'd like to keep that, so i'll keep the previous distinct modes
(but allow configuration of individual features on the bitmap).

> >
> > The supported features are:
> >
> > ISOL_F_QUIESCE_ON_URET: quiesce deferred actions on return to userspace.
> > ----------------------
> >
> > Quiescing of different actions can be performed on return to userspace.
> >
> > - prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0) returns
> > the supported actions to be quiesced.
> >
> > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) returns

s/returns/sets/

> > the currently supported actions to be quiesced.
> >
> > - prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
> > the currently enabled actions to be quiesced.
> >
> > #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> > #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> > #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)
>
> And then PR_ISOL_QUIESCE_CFG is a oneshot operation that applies only upon
> return to this ctrl, right? If so perhaps this should be called just
> ISOL_QUIESCE or ISOL_QUIESCE_ONCE or ISOL_REQ ?

There was no one-shot operation implemented in the first patchset. What
application would do to achieve that is:

1. Configure isolation with PR_ISOL_SET (say configure mode which
allows system calls, and when a system call happens, flush all deferred
actions on return to userspace).

2. prctl(PR_ISOL_ENTER, 0, 0, 0, 0) (this actually enables the flushing,
and tags the task_struct as isolated). Here we can transfer this information
from per-task to per-CPU data, for example, to be able to implement
other features such as deferred TLB flushing.

On return from this prctl(), deferrable actions are flushed.

3. latency sensitive loop, with no system calls.

4. some event which requires system calls is noticed:
prctl(PR_ISOL_EXIT, 0, 0, 0, 0)
(this would untag task_struct as isolated).

5. perform system calls A, B, C, D (with no flushing of vmstat,
for example).

6. jmp to 2.

So there is a problem with this logic, which is that one would like
certain isolation functionality to remain enabled between points 4
and 6 (for example, blocking CPU hotplug or other blockable activities
that would cause interruptions).

One way to achieve this would be to replace PR_ISOL_ENTER/PR_ISOL_EXIT
with PR_ISOL_ENABLE, which accepts a bitmask:

1. Configure isolation with PR_ISOL_SET (say configure mode which
allows system calls, and when a system call happens, flush all deferred
actions on return to userspace).

2. enabled_bitmask = ISOL_F_QUIESCE_ON_URET|ISOL_F_BLOCK_INTERRUPTORS;
prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0)

On return from this prctl(), deferrable actions are flushed.

3. latency sensitive loop, with no system calls.

4. some event which requires system calls is noticed:

prctl(PR_ISOL_ENABLE, ISOL_F_BLOCK_INTERRUPTORS, 0, 0, 0)
(this would clear ISOL_F_QUIESCE_ON_URET, so no flushing
is performed on return from system calls).

5. perform system calls A, B, C, D (with no flushing of vmstat).

6. jmp to 2.

...

On exit: prctl(PR_ISOL_ENABLE, 0, 0, 0, 0)

IOW: the one-shot operation does not allow the application
to inform the kernel when the latency sensitive loop has
begun or has ended.

>
> But that's just naming debate because otherwise that prctl layout looks good
> to me.
>
> Thanks!

Thank you for the input!


2021-07-28 11:48:22

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 06:37:07AM -0300, Marcelo Tosatti wrote:
> On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> > On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > > The meaning of isolated is specified as follows:
> > >
> > > Isolation features
> > > ==================
> > >
> > > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > > features as a return value.
> > >
> > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > > the bitmask.
> > >
> > > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > > enabled features.
> >
> > So what are the ISOL_FEATURES here? A mode that we enter such as flush
> > vmstat _everytime_ we resume to userpace after (and including) this prctl() ?
>
> ISOL_FEATURES is just the "command" type (which you can get and set).
>
> The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
>
> - bitmask = ISOL_F_QUIESCE_ON_URET;
> - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> the bitmask.

But does it quiesce once or for every further uret?

>
> - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> (1)
>
> (returns the supported actions to be quiesced).
>
> - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> the actions to be quiesced (2)
>
> If an application does not modify "quiesce_bitmask" between
> points (1) and (2) above, it will enable quiescing of all
> "features" the kernel supports.

I don't get the difference between ISOL_FEATURES and PR_ISOL_QUIESCE_CFG.

>
> Application can, however, modify quiesce_bitmap to its preference.
>
> Flushing vmstat _everytime_ you resume to userspace is enabled only
> _after_ prctl(PR_ISOL_ENTER, 0, 0, 0, 0) is performed (which happens
> only when isolation is fully configured with the PR_ISOL_SET calls).
> OK, will better document that.

Yes please, I'm completely confused :o)

>
> > If so I'd rather call that ISOL_MODE because feature is too general.
>
> Well, in the first patchset, there was one "mode" implemented (but
> it was possible to implement different modes in the future).
>
> This would allow for example easier integration of "full task isolation"
> patchset type of functionality, disallowing syscalls.
>
> I think we'd like to keep that, so i'll keep the previous distinct modes
> (but allow configuration of individual features on the bitmap).

And I also don't see how such modes differ from configuration of individual
features on the bitmap.

> > > - prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
> > > the currently enabled actions to be quiesced.
> > >
> > > #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> > > #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> > > #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)
> >
> > And then PR_ISOL_QUIESCE_CFG is a oneshot operation that applies only upon
> > return to this ctrl, right? If so perhaps this should be called just
> > ISOL_QUIESCE or ISOL_QUIESCE_ONCE or ISOL_REQ ?
>
> There was no one-shot operation implemented in the first patchset. What
> application would do to achieve that is:
>
> 1. Configure isolation with PR_ISOL_SET (say configure mode which
> allows system calls, and when a system call happens, flush all deferred
> actions on return to userspace).
>
> 2. prctl(PR_ISOL_ENTER, 0, 0, 0, 0) (this actually enables the flushing,
> and tags the task_struct as isolated). Here we can transfer this information
> from per-task to per-CPU data, for example, to be able to implement
> other features such as deferred TLB flushing.
>
> On return from this prctl(), deferrable actions are flushed.
>
> 3. latency sensitive loop, with no system calls.
>
> 4. some event which requires system calls is noticed:
> prctl(PR_ISOL_EXIT, 0, 0, 0, 0)
> (this would untag task_struct as isolated).
>
> 5. perform system calls A, B, C, D (with no flushing of vmstat,
> for example).
>
> 6. jmp to 2.
>
> So there is a problem with this logic, which is that one would like
> certain isolation functionality to remain enabled between points 4
> and 6 (for example, blocking CPU hotplug or other blockable activities
> that would cause interruptions).
>
> One way to achieve this would be to replace PR_ISOL_ENTER/PR_ISOL_EXIT
> with PR_ISOL_ENABLE, which accepts a bitmask:
>
> 1. Configure isolation with PR_ISOL_SET (say configure mode which
> allows system calls, and when a system call happens, flush all deferred
> actions on return to userspace).
>
> 2. enabled_bitmask = ISOL_F_QUIESCE_ON_URET|ISOL_F_BLOCK_INTERRUPTORS;
> prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0)
>
> On return from this prctl(), deferrable actions are flushed.
>
> 3. latency sensitive loop, with no system calls.
>
> 4. some event which requires system calls is noticed:
>
> prctl(PR_ISOL_ENABLE, ISOL_F_BLOCK_INTERRUPTORS, 0, 0, 0)
> (this would clear ISOL_F_QUIESCE_ON_URET, so no flushing
> is performed on return from system calls).

So PR_ISOL_ENABLE is a way to perform action when some sort of kernel entry
happens. Then we take actions when that happens (signal, warn, etc...).

I guess we'll need to define what kind of kernel entry, and what kind of
response need to happen. Ok that's a whole issue of its own that we'll need
to handle seperately.

Thanks.

2021-07-28 11:56:36

by Nicolas Saenz Julienne

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

Hi Marcelo,

On Wed, 2021-07-28 at 06:37 -0300, Marcelo Tosatti wrote:
> On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> > On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > > The meaning of isolated is specified as follows:
> > >
> > > Isolation features
> > > ==================
> > >
> > > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > > features as a return value.
> > >
> > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > > the bitmask.
> > >
> > > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > > enabled features.
> >
> > So what are the ISOL_FEATURES here? A mode that we enter such as flush
> > vmstat _everytime_ we resume to userpace after (and including) this prctl() ?
>
> ISOL_FEATURES is just the "command" type (which you can get and set).
>
> The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
>
> - bitmask = ISOL_F_QUIESCE_ON_URET;
> - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> the bitmask.
>
> - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> (1)
>
> (returns the supported actions to be quiesced).
>
> - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> the actions to be quiesced (2)
>
> If an application does not modify "quiesce_bitmask" between
> points (1) and (2) above, it will enable quiescing of all
> "features" the kernel supports.

I think this pattern of enabling all by default might be prone to subtly
breaking things.

For example, let's say we introduce ISOL_F_QUIESCE_DEFER_TLB_FLUSH, this will
defer relatively short IPIs on isolated CPUs in exchange for a longer flush
whenever we enter the kernel (syscall, IRQs, NMI, etc...). A latency sensitive
application might be OK with the former but not with the latter.

--
Nicolás Sáenz


2021-07-28 13:22:48

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 01:55:33PM +0200, [email protected] wrote:
> Hi Marcelo,
>
> On Wed, 2021-07-28 at 06:37 -0300, Marcelo Tosatti wrote:
> > On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> > > On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > > > The meaning of isolated is specified as follows:
> > > >
> > > > Isolation features
> > > > ==================
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > > > features as a return value.
> > > >
> > > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > > > the bitmask.
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > > > enabled features.
> > >
> > > So what are the ISOL_FEATURES here? A mode that we enter such as flush
> > > vmstat _everytime_ we resume to userpace after (and including) this prctl() ?
> >
> > ISOL_FEATURES is just the "command" type (which you can get and set).
> >
> > The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
> >
> > - bitmask = ISOL_F_QUIESCE_ON_URET;
> > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > the bitmask.
> >
> > - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> > (1)
> >
> > (returns the supported actions to be quiesced).
> >
> > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> > the actions to be quiesced (2)
> >
> > If an application does not modify "quiesce_bitmask" between
> > points (1) and (2) above, it will enable quiescing of all
> > "features" the kernel supports.
>
> I think this pattern of enabling all by default might be prone to subtly
> breaking things.

The reasoning behind this pattern is that many latency sensitive applications
(as far as i am aware) prefer "as few interruptions as possible, no
interruptions is preferred".

In that case, the pattern makes sense.

> For example, let's say we introduce ISOL_F_QUIESCE_DEFER_TLB_FLUSH, this will
> defer relatively short IPIs on isolated CPUs in exchange for a longer flush
> whenever we enter the kernel (syscall, IRQs, NMI, etc...).

Why the flush has to be longer when you enter the kernel?

ISOL_F_QUIESCE_DEFER_TLB_FLUSH might collapse multiple IPIs
into a single IPI, so the behaviour might be beneficial
for "standard" types of application as well.

> A latency sensitive
> application might be OK with the former but not with the latter.

Two alternatives:

1) The pattern above, where particular subsystems that might interrupt
the kernel are enabled automatically if the kernel supports it.

Pros:
Applications which implement this only need to be changed once,
and can benefit from new kernel features.

Applications can disable particular features if they turn
out to be problematic.

Cons:
New features might break applications.

2) Force applications to enable each new feature individually.

Pros: Won't cause regressions, kernel behaviour is explicitly
controlled by userspace.

Cons: Apps won't benefit from new features automatically.

---

It seems to me 1) is preferred. Can also add a sysfs control to
have a "default_isolation_feature" flag, which can be changed
by a sysadmin in case a new feature is undesired.

Thoughts?


2021-07-28 13:22:48

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface


On Wed, Jul 28, 2021 at 01:45:48PM +0200, Frederic Weisbecker wrote:
> On Wed, Jul 28, 2021 at 06:37:07AM -0300, Marcelo Tosatti wrote:
> > On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> > > On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > > > The meaning of isolated is specified as follows:
> > > >
> > > > Isolation features
> > > > ==================
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > > > features as a return value.
> > > >
> > > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > > > the bitmask.
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > > > enabled features.
> > >
> > > So what are the ISOL_FEATURES here? A mode that we enter such as flush
> > > vmstat _everytime_ we resume to userpace after (and including) this prctl() ?
> >
> > ISOL_FEATURES is just the "command" type (which you can get and set).
> >
> > The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
> >
> > - bitmask = ISOL_F_QUIESCE_ON_URET;
> > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > the bitmask.
>
> But does it quiesce once or for every further uret?

For every uret, while ISOL_F_QUIESCE_ON_URET is enabled through
prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0).

> > - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> > (1)
> >
> > (returns the supported actions to be quiesced).
> >
> > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> > the actions to be quiesced (2)
> >
> > If an application does not modify "quiesce_bitmask" between
> > points (1) and (2) above, it will enable quiescing of all
> > "features" the kernel supports.
>
> I don't get the difference between ISOL_FEATURES and PR_ISOL_QUIESCE_CFG.

prctl(PR_ISOL_SET, cmd, ...) is intented to accept different types of "command"
variables (including ones for new features which are not known at this
time).

- prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
the bitmask

(which might now be superceded by

prctl(PR_ISOL_ENABLE, ISOL_F_QUIESCE_ON_URET, 0, 0, 0))

- prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, bitmask, 0, 0) configures
quiescing of which subsystem/feature is performed:

#define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
#define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
#define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)


> > Application can, however, modify quiesce_bitmap to its preference.
> >
> > Flushing vmstat _everytime_ you resume to userspace is enabled only
> > _after_ prctl(PR_ISOL_ENTER, 0, 0, 0, 0) is performed (which happens
> > only when isolation is fully configured with the PR_ISOL_SET calls).
> > OK, will better document that.
>
> Yes please, I'm completely confused :o)

OK.

> > > If so I'd rather call that ISOL_MODE because feature is too general.
> >
> > Well, in the first patchset, there was one "mode" implemented (but
> > it was possible to implement different modes in the future).
> >
> > This would allow for example easier integration of "full task isolation"
> > patchset type of functionality, disallowing syscalls.
> >
> > I think we'd like to keep that, so i'll keep the previous distinct modes
> > (but allow configuration of individual features on the bitmap).
>
> And I also don't see how such modes differ from configuration of individual
> features on the bitmap.

Good point, they do not intersect, syscall disablement and notification of
"isolation breakage" are orthogonal to quiescing.

> > > > - prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
> > > > the currently enabled actions to be quiesced.
> > > >
> > > > #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> > > > #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> > > > #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)
> > >
> > > And then PR_ISOL_QUIESCE_CFG is a oneshot operation that applies only upon
> > > return to this ctrl, right? If so perhaps this should be called just
> > > ISOL_QUIESCE or ISOL_QUIESCE_ONCE or ISOL_REQ ?
> >
> > There was no one-shot operation implemented in the first patchset. What
> > application would do to achieve that is:
> >
> > 1. Configure isolation with PR_ISOL_SET (say configure mode which
> > allows system calls, and when a system call happens, flush all deferred
> > actions on return to userspace).
> >
> > 2. prctl(PR_ISOL_ENTER, 0, 0, 0, 0) (this actually enables the flushing,
> > and tags the task_struct as isolated). Here we can transfer this information
> > from per-task to per-CPU data, for example, to be able to implement
> > other features such as deferred TLB flushing.
> >
> > On return from this prctl(), deferrable actions are flushed.
> >
> > 3. latency sensitive loop, with no system calls.
> >
> > 4. some event which requires system calls is noticed:
> > prctl(PR_ISOL_EXIT, 0, 0, 0, 0)
> > (this would untag task_struct as isolated).
> >
> > 5. perform system calls A, B, C, D (with no flushing of vmstat,
> > for example).
> >
> > 6. jmp to 2.
> >
> > So there is a problem with this logic, which is that one would like
> > certain isolation functionality to remain enabled between points 4
> > and 6 (for example, blocking CPU hotplug or other blockable activities
> > that would cause interruptions).
> >
> > One way to achieve this would be to replace PR_ISOL_ENTER/PR_ISOL_EXIT
> > with PR_ISOL_ENABLE, which accepts a bitmask:
> >
> > 1. Configure isolation with PR_ISOL_SET (say configure mode which
> > allows system calls, and when a system call happens, flush all deferred
> > actions on return to userspace).
> >
> > 2. enabled_bitmask = ISOL_F_QUIESCE_ON_URET|ISOL_F_BLOCK_INTERRUPTORS;
> > prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0)
> >
> > On return from this prctl(), deferrable actions are flushed.
> >
> > 3. latency sensitive loop, with no system calls.
> >
> > 4. some event which requires system calls is noticed:
> >
> > prctl(PR_ISOL_ENABLE, ISOL_F_BLOCK_INTERRUPTORS, 0, 0, 0)
> > (this would clear ISOL_F_QUIESCE_ON_URET, so no flushing
> > is performed on return from system calls).
>
> So PR_ISOL_ENABLE is a way to perform action when some sort of kernel entry
> happens. Then we take actions when that happens (signal, warn, etc...).
>
> I guess we'll need to define what kind of kernel entry, and what kind of
> response need to happen. Ok that's a whole issue of its own that we'll need
> to handle seperately.
>
> Thanks.

In fact, why one can't use SECCOMP for syscall blocking?

Thanks.


2021-07-28 17:05:00

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 11:00:01AM -0400, Nitesh Lal wrote:
> > > A latency sensitive
> > > application might be OK with the former but not with the latter.
> >
> > Two alternatives:
> >
> > 1) The pattern above, where particular subsystems that might interrupt
> > the kernel are enabled automatically if the kernel supports it.
> >
> > Pros:
> > Applications which implement this only need to be changed once,
> > and can benefit from new kernel features.
> >
> > Applications can disable particular features if they turn
> > out to be problematic.
> >
> > Cons:
> > New features might break applications.
> >
> > 2) Force applications to enable each new feature individually.
> >
> > Pros: Won't cause regressions, kernel behaviour is explicitly
> > controlled by userspace.
> >
> > Cons: Apps won't benefit from new features automatically.
> >
> > ---
> >
> > It seems to me 1) is preferred. Can also add a sysfs control to
> > have a "default_isolation_feature" flag, which can be changed
> > by a sysadmin in case a new feature is undesired.
> >
> > Thoughts?
> >
> >
> The first option may work specifically with the sysfs interface that you
> mentioned, however, IMHO (2) is safer than regressing the workloads. Also,
> if the previously implemented controls are good enough for the workload
> then there should not be a need to enable the new ones.

OK, can set default_isolation_feature as 0 then, which admin can
configure to a non-default value. This would enable the new
features only if the admin enables them.

Thanks.


2021-07-28 17:06:11

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 10:48:25AM -0400, Nitesh Lal wrote:
> On Wed, Jul 28, 2021 at 5:56 AM Marcelo Tosatti <[email protected]> wrote:
>
> > On Wed, Jul 28, 2021 at 01:45:39AM +0200, Frederic Weisbecker wrote:
> > > On Tue, Jul 27, 2021 at 11:52:09AM -0300, Marcelo Tosatti wrote:
> > > > The meaning of isolated is specified as follows:
> > > >
> > > > Isolation features
> > > > ==================
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_SUP_FEATURES, 0, 0, 0) returns the supported
> > > > features as a return value.
> > > >
> > > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the
> > features in
> > > > the bitmask.
> > > >
> > > > - prctl(PR_ISOL_GET, ISOL_FEATURES, 0, 0, 0) returns the currently
> > > > enabled features.
> > >
> > > So what are the ISOL_FEATURES here? A mode that we enter such as flush
> > > vmstat _everytime_ we resume to userpace after (and including) this
> > prctl() ?
> >
> > ISOL_FEATURES is just the "command" type (which you can get and set).
> >
>
> So, ISOL_FEATURES is really defining when the operations are really going
> to take place for eg. on every uret?

ISOL_F_QUIESCE_ON_URET enables quiescing on userspace return.

> > The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
> >
> >
> When we talk about full/complete isolation

https://lwn.net/Articles/816298/

Nohz and task isolation section

These features reduce interruptions on the isolated CPUs, but do not
fully eliminate them; task isolation is an attempt to finish the job by
removing all interruptions. A process that enters the isolation mode
will be able to run in user space with no interference from the kernel
or other processes.

> then does that translates to
> enabling all possible features supported by something like
> ISOL_F_QUIESCE_ON_URET?

Not necessarily.

If one controls what apps execute on the system (say the system
is completly idle), ISOL_F_QUIESCE_ON_URET with vmstat sync should
be sufficient for complete isolation (one can read events via
rt-trace-bpf.py).

> - bitmask = ISOL_F_QUIESCE_ON_URET;
> > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > the bitmask.
> >
> > - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> > (1)
> >
> > (returns the supported actions to be quiesced).
> >
> > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> > the actions to be quiesced (2)
> >
> > If an application does not modify "quiesce_bitmask" between
> > points (1) and (2) above, it will enable quiescing of all
> > "features" the kernel supports.
> >
> > Application can, however, modify quiesce_bitmap to its preference.
> >
> > Flushing vmstat _everytime_ you resume to userspace is enabled only
> > _after_ prctl(PR_ISOL_ENTER, 0, 0, 0, 0) is performed (which happens
> > only when isolation is fully configured with the PR_ISOL_SET calls).
> >
>
> Will this also happen if I disable ISOL_F_QUIESCE_VMSTAT_SYNC from the
> quiesce_bitmask?

Yes.

> > OK, will better document that.
> >
> > > If so I'd rather call that ISOL_MODE because feature is too general.
> >
> > Well, in the first patchset, there was one "mode" implemented (but
> > it was possible to implement different modes in the future).
> >
> > This would allow for example easier integration of "full task isolation"
> > patchset type of functionality, disallowing syscalls.
> >
> >
> Makes sense to go back to the usage of ISOL_MODE.
> After this change, the ISOL_FEATURES will be replaced with something like
> PR_ISOL_MODE_NORMAL/PR_MODE_ISOL?
>
> I think we'd like to keep that, so i'll keep the previous distinct modes
> > (but allow configuration of individual features on the bitmap).
> >
> > > >
> > > > The supported features are:
> > > >
> > > > ISOL_F_QUIESCE_ON_URET: quiesce deferred actions on return to
> > userspace.
> > > > ----------------------
> > > >
> > > > Quiescing of different actions can be performed on return to userspace.
> > > >
> > > > - prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0) returns
> > > > the supported actions to be quiesced.
> > > >
> > > > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0)
> > returns
> >
> > s/returns/sets/
> >
> > > > the currently supported actions to be quiesced.
> > > >
> > > > - prctl(PR_ISOL_GET, PR_ISOL_QUIESCE_CFG, 0, 0, 0) returns
> > > > the currently enabled actions to be quiesced.
> > > >
> > > > #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> > > > #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> > > > #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)
> > >
> > > And then PR_ISOL_QUIESCE_CFG is a oneshot operation that applies only
> > upon
> > > return to this ctrl, right? If so perhaps this should be called just
> > > ISOL_QUIESCE or ISOL_QUIESCE_ONCE or ISOL_REQ ?
> >
> > There was no one-shot operation implemented in the first patchset. What
> > application would do to achieve that is:
> >
> > 1. Configure isolation with PR_ISOL_SET (say configure mode which
> > allows system calls, and when a system call happens, flush all deferred
> > actions on return to userspace).
> >
> > 2. prctl(PR_ISOL_ENTER, 0, 0, 0, 0) (this actually enables the flushing,
> > and tags the task_struct as isolated). Here we can transfer this
> > information
> > from per-task to per-CPU data, for example, to be able to implement
> > other features such as deferred TLB flushing.
> >
> > On return from this prctl(), deferrable actions are flushed.
> >
> > 3. latency sensitive loop, with no system calls.
> >
> > 4. some event which requires system calls is noticed:
> > prctl(PR_ISOL_EXIT, 0, 0, 0, 0)
> > (this would untag task_struct as isolated).
> >
> > 5. perform system calls A, B, C, D (with no flushing of vmstat,
> > for example).
> >
> > 6. jmp to 2.
> >
> > So there is a problem with this logic, which is that one would like
> > certain isolation functionality to remain enabled between points 4
> > and 6 (for example, blocking CPU hotplug or other blockable activities
> > that would cause interruptions).
> >
> > One way to achieve this would be to replace PR_ISOL_ENTER/PR_ISOL_EXIT
> > with PR_ISOL_ENABLE, which accepts a bitmask:
> >
> > 1. Configure isolation with PR_ISOL_SET (say configure mode which
> > allows system calls, and when a system call happens, flush all deferred
> > actions on return to userspace).
> >
> > 2. enabled_bitmask = ISOL_F_QUIESCE_ON_URET|ISOL_F_BLOCK_INTERRUPTORS;
> > prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0)
> >
> > On return from this prctl(), deferrable actions are flushed.
> >
> > 3. latency sensitive loop, with no system calls.
> >
> > 4. some event which requires system calls is noticed:
> >
> > prctl(PR_ISOL_ENABLE, ISOL_F_BLOCK_INTERRUPTORS, 0, 0, 0)
> > (this would clear ISOL_F_QUIESCE_ON_URET, so no flushing
> > is performed on return from system calls).
> >
>
> FWIU we will still exit before this via prctl(PR_ISOL_EXIT, 0, 0, 0, 0)?
> Because if we are still in a latency-sensitive loop then not flushing
> while returning to the userspace can cause interruptions anyways.

No, PR_ISOL_ENABLE replaces PR_ISOL_ENTER/PR_ISOL_EXIT.

> > 5. perform system calls A, B, C, D (with no flushing of vmstat).
> >
> > 6. jmp to 2.
> >
> > ...
> >
> > On exit: prctl(PR_ISOL_ENABLE, 0, 0, 0, 0)
> >
> > IOW: the one-shot operation does not allow the application
> > to inform the kernel when the latency sensitive loop has
> > begun or has ended.
> >
> > >
> > > But that's just naming debate because otherwise that prctl layout looks
> > good
> > > to me.
> > >
> > > Thanks!
> >
> > Thank you for the input!
> >
> >
>
> --
> Thanks
> Nitesh


2021-07-28 17:12:07

by Nicolas Saenz Julienne

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, 2021-07-28 at 10:16 -0300, Marcelo Tosatti wrote:
> > For example, let's say we introduce ISOL_F_QUIESCE_DEFER_TLB_FLUSH, this will
> > defer relatively short IPIs on isolated CPUs in exchange for a longer flush
> > whenever we enter the kernel (syscall, IRQs, NMI, etc...).
>
> Why the flush has to be longer when you enter the kernel?

What I had in mind was cost of rapid partial flushes (IPIs) vs full flushes on
entry, although I haven't really measured anything so the extra latency cost
might as well be zero.

> ISOL_F_QUIESCE_DEFER_TLB_FLUSH might collapse multiple IPIs
> into a single IPI, so the behaviour might be beneficial
> for "standard" types of application as well.
>
> > A latency sensitive
> > application might be OK with the former but not with the latter.
>
> Two alternatives:
>
> 1) The pattern above, where particular subsystems that might interrupt
> the kernel are enabled automatically if the kernel supports it.
>
> Pros:
> Applications which implement this only need to be changed once,
> and can benefit from new kernel features.
>
> Applications can disable particular features if they turn
> out to be problematic.
>
> Cons:
> New features might break applications.
>
> 2) Force applications to enable each new feature individually.
>
> Pros: Won't cause regressions, kernel behaviour is explicitly
> controlled by userspace.
>
> Cons: Apps won't benefit from new features automatically.
>
> ---
>
> It seems to me 1) is preferred. Can also add a sysfs control to
> have a "default_isolation_feature" flag, which can be changed
> by a sysadmin in case a new feature is undesired.
>
> Thoughts?

I'd still take option 2. Nitesh has a very good point, latency requirements are
hit or miss. What's the benefit of enabling new features on an already valid
application vs the potential regression?

That said I see value in providing means for users that want all
features/modes, but it should be an through an explicit action on their part.

--
Nicolás Sáenz


2021-07-28 21:25:19

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [patch 1/4] add basic task isolation prctl interface

On Wed, Jul 28, 2021 at 10:21:34AM -0300, Marcelo Tosatti wrote:
> > > ISOL_FEATURES is just the "command" type (which you can get and set).
> > >
> > > The bitmask would include ISOL_F_QUIESCE_ON_URET, so:
> > >
> > > - bitmask = ISOL_F_QUIESCE_ON_URET;
> > > - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> > > the bitmask.
> >
> > But does it quiesce once or for every further uret?
>
> For every uret, while ISOL_F_QUIESCE_ON_URET is enabled through
> prctl(PR_ISOL_ENABLE, enabled_bitmask, 0, 0, 0).

Ok.

>
> > > - quiesce_bitmap = prctl(PR_ISOL_GET, PR_ISOL_SUP_QUIESCE_CFG, 0, 0, 0)
> > > (1)
> > >
> > > (returns the supported actions to be quiesced).
> > >
> > > - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, quiesce_bitmask, 0, 0) _sets_
> > > the actions to be quiesced (2)
> > >
> > > If an application does not modify "quiesce_bitmask" between
> > > points (1) and (2) above, it will enable quiescing of all
> > > "features" the kernel supports.
> >
> > I don't get the difference between ISOL_FEATURES and PR_ISOL_QUIESCE_CFG.
>
> prctl(PR_ISOL_SET, cmd, ...) is intented to accept different types of "command"
> variables (including ones for new features which are not known at this
> time).
>
> - prctl(PR_ISOL_SET, ISOL_FEATURES, bitmask, 0, 0) enables the features in
> the bitmask
>
> (which might now be superceded by
>
> prctl(PR_ISOL_ENABLE, ISOL_F_QUIESCE_ON_URET, 0, 0, 0))
>
> - prctl(PR_ISOL_SET, PR_ISOL_QUIESCE_CFG, bitmask, 0, 0) configures
> quiescing of which subsystem/feature is performed:
>
> #define ISOL_F_QUIESCE_VMSTAT_SYNC (1<<0)
> #define ISOL_F_QUIESCE_NOHZ_FULL (1<<1)
> #define ISOL_F_QUIESCE_DEFER_TLB_FLUSH (1<<2)

Ok but...I still don't get the difference between ISOL_FEATURES and
PR_ISOL_QUIESCE_CFG :-)

> > So PR_ISOL_ENABLE is a way to perform action when some sort of kernel entry
> > happens. Then we take actions when that happens (signal, warn, etc...).
> >
> > I guess we'll need to define what kind of kernel entry, and what kind of
> > response need to happen. Ok that's a whole issue of its own that we'll need
> > to handle seperately.
> >
> > Thanks.
>
> In fact, why one can't use SECCOMP for syscall blocking?

Heh! Good point!