2021-04-12 23:48:04

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH v2 4/9] sched: Move SCHED_DEBUG sysctl to debugfs

Stop polluting sysctl with undocumented knobs that really are debug
only; move them all to /debug/sched/, alongside the /debug/sched_*
files that already exist.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
---
include/linux/sched/sysctl.h | 8 +--
kernel/sched/core.c | 4 +
kernel/sched/debug.c | 74 +++++++++++++++++++++++++++++++--
kernel/sched/fair.c | 9 ----
kernel/sched/sched.h | 2
kernel/sysctl.c | 96 -------------------------------------------
6 files changed, 80 insertions(+), 113 deletions(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct
enum { sysctl_hung_task_timeout_secs = 0 };
#endif

+extern unsigned int sysctl_sched_child_runs_first;
+
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;

enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
};
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_tunable_scaling;

extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
@@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancin
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos);
#endif

/*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5504,9 +5504,11 @@ static const struct file_operations sche
.release = single_release,
};

+extern struct dentry *debugfs_sched;
+
static __init int sched_init_debug_dynamic(void)
{
- debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
return 0;
}
late_initcall(sched_init_debug_dynamic);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -169,15 +169,81 @@ static const struct file_operations sche
.release = single_release,
};

+#ifdef CONFIG_SMP
+
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[16];
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ return -EINVAL;
+
+ if (sched_update_scaling())
+ return -EINVAL;
+
+ *ppos += cnt;
+ return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+ return 0;
+}
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+ .open = sched_scaling_open,
+ .write = sched_scaling_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* SMP */
+
__read_mostly bool sched_debug_enabled;

+struct dentry *debugfs_sched;
+
static __init int sched_init_debug(void)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
- &sched_feat_fops);
+ struct dentry __maybe_unused *numa;
+
+ debugfs_sched = debugfs_create_dir("sched", NULL);
+
+ debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
+ debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled);
+
+ debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
+ debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+ debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
+
+#ifdef CONFIG_SMP
+ debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
+ debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
+ debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
+#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ numa = debugfs_create_dir("numa_balancing", debugfs_sched);

- debugfs_create_bool("sched_debug", 0644, NULL,
- &sched_debug_enabled);
+ debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
+ debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
+ debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
+ debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
+#endif

return 0;
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sc
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
*/
-enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

/*
* Minimal preemption granularity for CPU-bound tasks:
@@ -627,15 +627,10 @@ struct sched_entity *__pick_last_entity(
* Scheduling class statistics methods:
*/

-int sched_proc_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+int sched_update_scaling(void)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
unsigned int factor = get_update_sysctl_factor();

- if (ret || !write)
- return ret;
-
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1569,6 +1569,8 @@ static inline void unregister_sched_doma
}
#endif

+extern int sched_update_scaling(void);
+
extern void flush_smp_call_function_from_idle(void);

#else /* !CONFIG_SMP: */
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_wr
int sysctl_legacy_va_layout;
#endif

-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000; /* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
-static int min_wakeup_granularity_ns; /* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
static int max_extfrag_threshold = 1000;
@@ -1659,91 +1648,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
-#ifdef CONFIG_SCHED_DEBUG
- {
- .procname = "sched_min_granularity_ns",
- .data = &sysctl_sched_min_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_latency_ns",
- .data = &sysctl_sched_latency,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_granularity_ns,
- .extra2 = &max_sched_granularity_ns,
- },
- {
- .procname = "sched_wakeup_granularity_ns",
- .data = &sysctl_sched_wakeup_granularity,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_wakeup_granularity_ns,
- .extra2 = &max_wakeup_granularity_ns,
- },
-#ifdef CONFIG_SMP
- {
- .procname = "sched_tunable_scaling",
- .data = &sysctl_sched_tunable_scaling,
- .maxlen = sizeof(enum sched_tunable_scaling),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_tunable_scaling,
- .extra2 = &max_sched_tunable_scaling,
- },
- {
- .procname = "sched_migration_cost_ns",
- .data = &sysctl_sched_migration_cost,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "sched_nr_migrate",
- .data = &sysctl_sched_nr_migrate,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_NUMA_BALANCING
- {
- .procname = "numa_balancing_scan_delay_ms",
- .data = &sysctl_numa_balancing_scan_delay,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_min_ms",
- .data = &sysctl_numa_balancing_scan_period_min,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_period_max_ms",
- .data = &sysctl_numa_balancing_scan_period_max,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "numa_balancing_scan_size_mb",
- .data = &sysctl_numa_balancing_scan_size,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- },
-#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",



2021-04-15 16:31:20

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH] sched/debug: Rename the sched_debug parameter to sched_debug_verbose

On Mon, Apr 12, 2021 at 12:14:25PM +0200, Peter Zijlstra wrote:

> + debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled);

How's this on top of the whole series?

---
Subject: sched/debug: Rename the sched_debug parameter to sched_debug_verbose
From: Peter Zijlstra <[email protected]>
Date: Thu Apr 15 18:23:17 CEST 2021

CONFIG_SCHED_DEBUG is the build-time Kconfig knob; the boot param
sched_debug and the /debug/sched/debug_enabled knobs control the
sched_debug_enabled variable, but what they really do is make
SCHED_DEBUG more verbose, so rename the lot.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
Documentation/admin-guide/kernel-parameters.txt | 3 ++-
Documentation/scheduler/sched-domains.rst | 10 +++++-----
kernel/sched/debug.c | 4 ++--
kernel/sched/topology.c | 10 +++++-----
4 files changed, 14 insertions(+), 13 deletions(-)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4756,7 +4756,8 @@

sbni= [NET] Granch SBNI12 leased line adapter

- sched_debug [KNL] Enables verbose scheduler debug messages.
+ sched_debug_verbose
+ [KNL] Enables verbose scheduler debug messages.

schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,8 +74,8 @@ for a given topology level by creating a
calling set_sched_topology() with this array as the parameter.

The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
-tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
-knob. This enables an error checking parse of the sched domains which should
-catch most possible errors (described above). It also prints out the domain
-structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+forgot to tweak your cmdline, you can also flip the
+/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
+the sched domains which should catch most possible errors (described above). It
+also prints out the domain structure in a visual format.
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -275,7 +275,7 @@ static const struct file_operations sche

#endif /* CONFIG_PREEMPT_DYNAMIC */

-__read_mostly bool sched_debug_enabled;
+__read_mostly bool sched_debug_verbose;

static const struct seq_operations sched_debug_sops;

@@ -300,7 +300,7 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);

debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
- debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled);
+ debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
#ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,7 +14,7 @@ static cpumask_var_t sched_domains_tmpma

static int __init sched_debug_setup(char *str)
{
- sched_debug_enabled = true;
+ sched_debug_verbose = true;

return 0;
}
@@ -22,7 +22,7 @@ early_param("sched_debug", sched_debug_s

static inline bool sched_debug(void)
{
- return sched_debug_enabled;
+ return sched_debug_verbose;
}

#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sc
{
int level = 0;

- if (!sched_debug_enabled)
+ if (!sched_debug_verbose)
return;

if (!sd) {
@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sc
}
#else /* !CONFIG_SCHED_DEBUG */

-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
# define sched_domain_debug(sd, cpu) do { } while (0)
static inline bool sched_debug(void)
{
@@ -2141,7 +2141,7 @@ build_sched_domains(const struct cpumask
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);

- if (rq && sched_debug_enabled) {
+ if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}

Subject: [tip: sched/core] sched: Move SCHED_DEBUG sysctl to debugfs

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 8a99b6833c884fa0e7919030d93fecedc69fc625
Gitweb: https://git.kernel.org/tip/8a99b6833c884fa0e7919030d93fecedc69fc625
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 24 Mar 2021 11:43:21 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00

sched: Move SCHED_DEBUG sysctl to debugfs

Stop polluting sysctl with undocumented knobs that really are debug
only; move them all to /debug/sched/, alongside the /debug/sched_*
files that already exist.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Greg Kroah-Hartman <[email protected]>
Tested-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/sched/sysctl.h | 8 +--
kernel/sched/core.c | 4 +-
kernel/sched/debug.c | 74 +++++++++++++++++++++++++--
kernel/sched/fair.c | 9 +---
kernel/sched/sched.h | 2 +-
kernel/sysctl.c | 96 +-----------------------------------
6 files changed, 80 insertions(+), 113 deletions(-)


2021-04-19 22:18:01

by Josh Don

[permalink] [raw]
Subject: Re: [PATCH] sched/debug: Rename the sched_debug parameter to sched_debug_verbose

Hi Peter,

Looks reasonable to me.

> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4756,7 +4756,8 @@
>
> sbni= [NET] Granch SBNI12 leased line adapter
>
> - sched_debug [KNL] Enables verbose scheduler debug messages.
> + sched_debug_verbose
> + [KNL] Enables verbose scheduler debug messages.

The boot param itself is not renamed from sched_debug below; early_param() still registers "sched_debug".
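
A minimal sketch of what that rename could look like (hypothetical; the posted
patch only updates the documentation, so the parameter name used here is an
assumption, not something from the series):

static int __init sched_debug_setup(char *str)
{
        sched_debug_verbose = true;
        return 0;
}
/* hypothetical: register the boot param under the name the documentation now uses */
early_param("sched_debug_verbose", sched_debug_setup);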

> @@ -22,7 +22,7 @@ early_param("sched_debug", sched_debug_s
>
> static inline bool sched_debug(void)

nit: consider renaming. Or, we can even get rid of this function
entirely, since in the !CONFIG_SCHED_DEBUG case we have
sched_debug_verbose defined to 0.

2021-04-27 15:04:43

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

Peter,

I just realized that we moved the sysctl tunables to debugfs in -next.
We have seen several cases where it was beneficial to set
sched_migration_cost_ns to a lower value. For example with KVM I can
easily get 50% more transactions with 50000 instead of 500000.
Until now it was possible to use tuned or /etc/sysctl.conf to set
these things permanently.

Given that some people do not want to have debugfs mounted all the time
I would consider this a regression. The sysctl tunable was always
available.

I am ok with the "informational" things being in debugfs, but not
the tunables. So how do we proceed here?
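
To make the practical impact concrete, a minimal userspace sketch of what a
tuning tool now has to do instead of writing
/proc/sys/kernel/sched_migration_cost_ns (the debugfs path follows the patch
above and only exists when debugfs is mounted; the 50000 value is the one
mentioned in this mail):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* New location after the move; needs debugfs mounted at /sys/kernel/debug. */
        const char *path = "/sys/kernel/debug/sched/migration_cost_ns";
        const char *val = "50000\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);   /* e.g. ENOENT when debugfs is not mounted */
                return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}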

Christian

2021-04-27 15:10:27

by Steven Rostedt

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On Tue, 27 Apr 2021 16:59:25 +0200
Christian Borntraeger <[email protected]> wrote:

> Peter,
>
> I just realized that we moved the sysctl tunables to debugfs in -next.
> We have seen several cases where it was beneficial to set
> sched_migration_cost_ns to a lower value. For example with KVM I can
> easily get 50% more transactions with 50000 instead of 500000.
> Until now it was possible to use tuned or /etc/sysctl.conf to set
> these things permanently.
>
> Given that some people do not want to have debugfs mounted all the time
> I would consider this a regression. The sysctl tunable was always
> available.
>
> I am ok with the "informational" things being in debugfs, but not
> the tunables. So how do we proceed here?

Should there be a schedfs created?

This is the reason I created the tracefs file system: to get the
tracing code out of debugfs, as debugfs is a catch-all for everything and
can lead to poor and insecure interfaces that people do not want to add on
systems where they still want tracing.

Or perhaps we should add a "tunefs" for tunables that are stable interfaces
that should not be in /proc but also not in debugfs.

-- Steve

2021-04-27 15:32:25

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs



On 27.04.21 17:09, Steven Rostedt wrote:
> On Tue, 27 Apr 2021 16:59:25 +0200
> Christian Borntraeger <[email protected]> wrote:
>
>> Peter,
>>
>> I just realized that we moved the sysctl tunables to debugfs in -next.
>> We have seen several cases where it was beneficial to set
>> sched_migration_cost_ns to a lower value. For example with KVM I can
>> easily get 50% more transactions with 50000 instead of 500000.
>> Until now it was possible to use tuned or /etc/sysctl.conf to set
>> these things permanently.
>>
>> Given that some people do not want to have debugfs mounted all the time
>> I would consider this a regression. The sysctl tunable was always
>> available.
>>
>> I am ok with the "informational" things being in debugfs, but not
>> the tunables. So how do we proceed here?
>
> Should there be a schedfs created?
>
> This is the reason I created the tracefs file system, was to get the
> tracing code out of debugfs, as debugfs is a catch all for everything and
> can lead to poor and insecure interfaces that people do not want to add on
> systems that they still want tracing on.
>
> Or perhaps we should add a "tunefs" for tunables that are stable interfaces
> that should not be in /proc but also not in debugfs.

Yes, a tunefs or schedfs could be considered a replacement for sysctl.
It will still break existing setups with kernel.sched* things in /etc/sysctl.conf,
but at least there is a stable transition path.
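
A rough sketch of what such a transition path could look like on the tooling
side (illustrative only; the two paths are the ones discussed in this thread,
the fallback logic itself is an assumption):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Try the legacy sysctl file first, then the new debugfs location. */
static int write_migration_cost(const char *value)
{
        static const char * const paths[] = {
                "/proc/sys/kernel/sched_migration_cost_ns",  /* pre-move kernels */
                "/sys/kernel/debug/sched/migration_cost_ns", /* post-move kernels */
        };

        for (size_t i = 0; i < sizeof(paths) / sizeof(paths[0]); i++) {
                int fd = open(paths[i], O_WRONLY);
                ssize_t ret;

                if (fd < 0)
                        continue;       /* not present on this kernel, try the next one */
                ret = write(fd, value, strlen(value));
                close(fd);
                return ret < 0 ? -1 : 0;
        }
        return -1;      /* neither interface is available (debugfs not mounted?) */
}

int main(void)
{
        return write_migration_cost("50000\n") ? 1 : 0;
}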


2021-04-28 08:49:47

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
> Peter,
>
> I just realized that we moved the sysctl tunables to debugfs in -next.
> We have seen several cases where it was beneficial to set
> sched_migration_cost_ns to a lower value. For example with KVM I can
> easily get 50% more transactions with 50000 instead of 500000.
> Until now it was possible to use tuned or /etc/sysctl.conf to set
> these things permanently.
>
> Given that some people do not want to have debugfs mounted all the time
> I would consider this a regression. The sysctl tunable was always
> available.
>
> I am ok with the "informational" things being in debugfs, but not
> the tunables. So how do we proceed here?

It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
production performance, and that's your fail.

I very explicitly do not care to support people that poke random values
into those 'tunables'. If people want to do that, they get to keep any
and all pieces.

The right thing to do here is to analyze the situation and determine why
migration_cost needs changing; is that an architectural thing, does s390
benefit from less sticky tasks due to its cache setup (the book caches
could be absorbing some of the penalties here for example). Or is it
something that's workload related, does KVM intrinsically not care about
migrating so much, or is it something else.

Basically, you get to figure out what the actual performance issue is,
and then we can look at what to do about it so that everyone benefits,
and not grow some random tweaks on the interweb that might or might not
actually work for someone else.

2021-04-28 08:51:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On Tue, Apr 27, 2021 at 11:09:26AM -0400, Steven Rostedt wrote:
> Or perhaps we should add a "tunefs" for tunables that are stable interfaces
> that should not be in /proc but also not in debugfs.

As said in that other email, I very emphatically do not want to do that.
I want to actively discourage people from 'tuning' and start to debug.

2021-04-28 08:57:14

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs



On 28.04.21 10:46, Peter Zijlstra wrote:
> On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
>> Peter,
>>
>> I just realized that we moved the sysctl tunables to debugfs in -next.
>> We have seen several cases where it was beneficial to set
>> sched_migration_cost_ns to a lower value. For example with KVM I can
>> easily get 50% more transactions with 50000 instead of 500000.
>> Until now it was possible to use tuned or /etc/sysctl.conf to set
>> these things permanently.
>>
>> Given that some people do not want to have debugfs mounted all the time
>> I would consider this a regression. The sysctl tunable was always
>> available.
>>
>> I am ok with the "informational" things being in debugfs, but not
>> the tunables. So how do we proceed here?
>
> It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
> production performance, and that's your fail.

No, it's not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG.
It was available on all kernels with CONFIG_SMP.

>
> I very explicitly do not care to support people that poke random values
> into those 'tunables'. If people want to do that, they get to keep any
> and all pieces.
>
> The right thing to do here is to analyze the situation and determine why
> migration_cost needs changing; is that an architectural thing, does s390
> benefit from less sticky tasks due to its cache setup (the book caches
> could be absorbing some of the penalties here for example). Or is it
> something that's workload related, does KVM intrinsically not care about
> migrating so much, or is it something else.
>
> Basically, you get to figure out what the actual performance issue is,
> and then we can look at what to do about it so that everyone benefits,
> and not grow some random tweaks on the interweb that might or might not
> actually work for someone else.

Yes, I agree. We have seen the effect of this value recently and we want
to look into that. Still that does not change the fact that you are removing
an interface that was there for ages.

2021-04-28 09:02:23

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs



On 28.04.21 10:54, Christian Borntraeger wrote:
>
>
> On 28.04.21 10:46, Peter Zijlstra wrote:
>> On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
>>> Peter,
>>>
>>> I just realized that we moved the sysctl tunables to debugfs in -next.
>>> We have seen several cases where it was beneficial to set
>>> sched_migration_cost_ns to a lower value. For example with KVM I can
>>> easily get 50% more transactions with 50000 instead of 500000.
>>> Until now it was possible to use tuned or /etc/sysctl.conf to set
>>> these things permanently.
>>>
>>> Given that some people do not want to have debugfs mounted all the time
>>> I would consider this a regression. The sysctl tunable was always
>>> available.
>>>
>>> I am ok with the "informational" things being in debugfs, but not
>>> the tunables. So how do we proceed here?
>>
>> It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
>> production performance, and that's your fail.
>
> No, it's not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG.
> It was available on all kernels with CONFIG_SMP.

I have to correct myself: it was under SCHED_DEBUG in 3.0.
>
>>
>> I very explicitly do not care to support people that poke random values
>> into those 'tunables'. If people want to do that, they get to keep any
>> and all pieces.
>>
>> The right thing to do here is to analyze the situation and determine why
>> migration_cost needs changing; is that an architectural thing, does s390
>> benefit from less sticky tasks due to its cache setup (the book caches
>> could be absorbing some of the penalties here for example). Or is it
>> something that's workload related, does KVM intrinsically not care about
>> migrating so much, or is it something else.
>>
>> Basically, you get to figure out what the actual performance issue is,
>> and then we can look at what to do about it so that everyone benefits,
>> and not grow some random tweaks on the interweb that might or might not
>> actually work for someone else.
>
> Yes, I agree. We have seen the effect of this value recently and we want
> to look into that. Still that does not change the fact that you are removing
> an interface that was there for ages.

2021-04-28 09:29:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On Wed, Apr 28, 2021 at 10:54:37AM +0200, Christian Borntraeger wrote:
>
>
> On 28.04.21 10:46, Peter Zijlstra wrote:
> > On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
> > > Peter,
> > >
> > > I just realized that we moved the sysctl tunables to debugfs in -next.
> > > We have seen several cases where it was beneficial to set
> > > sched_migration_cost_ns to a lower value. For example with KVM I can
> > > easily get 50% more transactions with 50000 instead of 500000.
> > > Until now it was possible to use tuned or /etc/sysctl.conf to set
> > > these things permanently.
> > >
> > > Given that some people do not want to have debugfs mounted all the time
> > > I would consider this a regression. The sysctl tunable was always
> > > available.
> > >
> > > I am ok with the "informational" things being in debugfs, but not
> > > the tunables. So how do we proceed here?
> >
> > It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
> > production performance, and that's your fail.
>
> No, it's not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG.
> It was available on all kernels with CONFIG_SMP.

The relevant section from origin/master:kernel/sysctl.c:

#ifdef CONFIG_SCHED_DEBUG
{
.procname = "sched_min_granularity_ns",
.data = &sysctl_sched_min_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
},
{
.procname = "sched_latency_ns",
.data = &sysctl_sched_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
},
{
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
.mode = 0644,
.proc_handler = sched_proc_update_handler,
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,
},
{
.procname = "sched_migration_cost_ns",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_nr_migrate",
.data = &sysctl_sched_nr_migrate,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing_scan_delay_ms",
.data = &sysctl_numa_balancing_scan_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_min_ms",
.data = &sysctl_numa_balancing_scan_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_size_mb",
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
{
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */

How is migration_cost not under SCHED_DEBUG? The bigger problem is that
world+dog has SCHED_DEBUG=y in their .config.


2021-04-28 09:35:37

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs



On 28.04.21 11:25, Peter Zijlstra wrote:
> On Wed, Apr 28, 2021 at 10:54:37AM +0200, Christian Borntraeger wrote:
>>
>>
>> On 28.04.21 10:46, Peter Zijlstra wrote:
>>> On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
>>>> Peter,
>>>>
>>>> I just realized that we moved the sysctl tunables to debugfs in -next.
>>>> We have seen several cases where it was beneficial to set
>>>> sched_migration_cost_ns to a lower value. For example with KVM I can
>>>> easily get 50% more transactions with 50000 instead of 500000.
>>>> Until now it was possible to use tuned or /etc/sysctl.conf to set
>>>> these things permanently.
>>>>
>>>> Given that some people do not want to have debugfs mounted all the time
>>>> I would consider this a regression. The sysctl tunable was always
>>>> available.
>>>>
>>>> I am ok with the "informational" things being in debugfs, but not
>>>> the tunables. So how do we proceed here?
>>>
>>> It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
>>> production performance, and that's your fail.
>>
>> No, it's not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG.
>> It was available on all kernels with CONFIG_SMP.
>
> The relevant section from origin/master:kernel/sysctl.c:

[...]
> How is migration_cost not under SCHED_DEBUG? The bigger problem is that
> world+dog has SCHED_DEBUG=y in their .config.

Hmm, yes my bad. I disabled it but it was silently reenabled due to a
dependency. So yes you are right, it is under SCHED_DEBUG.

2021-04-28 10:15:11

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs



On 28.04.21 10:46, Peter Zijlstra wrote:
[..]
> The right thing to do here is to analyze the situation and determine why
> migration_cost needs changing; is that an architectural thing, does s390
> benefit from less sticky tasks due to its cache setup (the book caches
> could be absorbing some of the penalties here for example). Or is it
> something that's workload related, does KVM intrinsically not care about
> migrating so much, or is it something else.

So let's focus on the performance issue.

One workload where we have seen this is a transactional workload that is
triggered by external network requests. So every external request
triggered a wakeup of a guest and a wakeup of a process in the guest.
The end result was that KVM was 40% slower than z/VM (in terms of
transactions per second) while we had more idle time.
With smaller sched_migration_cost_ns (e.g. 100000) KVM was as fast
as z/VM.

So to me it looks like the wakeup and reschedule to a free CPU
was just not fast enough. It might also depend on where I/O interrupts
land. Not sure yet.

2021-04-28 14:47:57

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On Wed, Apr 28, 2021 at 11:42:57AM +0200, Christian Borntraeger wrote:
> On 28.04.21 10:46, Peter Zijlstra wrote:
> [..]
> > The right thing to do here is to analyze the situation and determine why
> > migration_cost needs changing; is that an architectural thing, does s390
> > benefit from less sticky tasks due to its cache setup (the book caches
> > could be absorbing some of the penalties here for example). Or is it
> > something that's workload related, does KVM intrinsically not care about
> > migrating so much, or is it something else.
>
> So let's focus on the performance issue.
>
> One workload where we have seen this is a transactional workload that is
> triggered by external network requests. So every external request
> triggered a wakeup of a guest and a wakeup of a process in the guest.
> The end result was that KVM was 40% slower than z/VM (in terms of
> transactions per second) while we had more idle time.
> With smaller sched_migration_cost_ns (e.g. 100000) KVM was as fast
> as z/VM.
>
> So to me it looks like the wakeup and reschedule to a free CPU
> was just not fast enough. It might also depend on where I/O interrupts
> land. Not sure yet.

So there's unfortunately three places where migration_cost is used; one
is in {nohz_,}newidle_balance(), see below. Someone tried removing it
before and that ran into some weird regressions somewhere. But it is worth
checking if this is the thing that matters for your workload.

The other (main) use is in task_hot(), where we try and prevent
migrating tasks that have recently run on a CPU. We already have an
exception for SMT there, because SMT siblings share all cache levels per
definition, so moving it to the sibling should have no ill effect.
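
To make that role concrete, a toy model of the decision (plain userspace C;
this is a paraphrase, not the kernel's task_hot() code, and the names are
illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified model: a task that last ran on the source CPU less than
 * migration_cost_ns ago is treated as cache-hot and the load balancer
 * leaves it where it is; SMT siblings are exempt since they share all
 * cache levels.
 */
static bool task_is_cache_hot(uint64_t now_ns, uint64_t last_ran_ns,
                              uint64_t migration_cost_ns, bool smt_sibling)
{
        if (smt_sibling)
                return false;
        return (now_ns - last_ran_ns) < migration_cost_ns;
}

int main(void)
{
        /*
         * A task that ran 100us ago is "hot" with the 500000ns default,
         * but not with the 50000ns value from the KVM measurements above.
         */
        printf("%d %d\n",
               task_is_cache_hot(1000000, 900000, 500000, false),
               task_is_cache_hot(1000000, 900000, 50000, false));
        return 0;
}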

It could be that the current measure is fundamentally too high for your
machine -- it is basically a random number that was determined many
years ago on some random x86 machine, so it not reflecting reality today
on an entirely different platform is no surprise.

Back in the day, we had some magic code that measured cache latency per
sched_domain and we used that, but that suffered from boot-to-boot
variance and made things rather non-deterministic, but the idea of
having per-domain cost certainly makes sense.

Over the years people have tried bringing parts of that back, but it
never really had convincing numbers justifying the complexity. So that's
another thing you could be looking at I suppose.

And then finally we have an almost random use in rebalance_domains(),
and I can't remember the story behind that one :/


Anyway, TL;DR, try and figure out which of these three is responsible
for your performance woes. If it's the first, the below patch might be a
good candidate. If it's task_hot(), we might need to re-eval per domain
costs. If it's that other thing, I'll have to dig to figure out wth that
was supposed to accomplish ;-)

---

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3bdc41f22909..9189bd78ad8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10557,10 +10557,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
return;

- /* Will wake up very soon. No time for doing anything else*/
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
-
/* Don't need to update blocked load of idle CPUs*/
if (!READ_ONCE(nohz.has_blocked) ||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
@@ -10622,8 +10618,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
*/
rq_unpin_lock(this_rq, rf);

- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !READ_ONCE(this_rq->rd->overload)) {
+ if (!READ_ONCE(this_rq->rd->overload)) {

rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);

2021-04-28 17:35:55

by Christian Borntraeger

[permalink] [raw]
Subject: Re: sched: Move SCHED_DEBUG sysctl to debugfs

On 28.04.21 14:38, Peter Zijlstra wrote:
> On Wed, Apr 28, 2021 at 11:42:57AM +0200, Christian Borntraeger wrote:
>> On 28.04.21 10:46, Peter Zijlstra wrote:
>> [..]
>>> The right thing to do here is to analyze the situation and determine why
>>> migration_cost needs changing; is that an architectural thing, does s390
>>> benefit from less sticky tasks due to its cache setup (the book caches
>>> could be absorbing some of the penalties here for example). Or is it
>>> something that's workload related, does KVM intrinsically not care about
>>> migrating so much, or is it something else.
>>
>> So let's focus on the performance issue.
>>
>> One workload where we have seen this is a transactional workload that is
>> triggered by external network requests. So every external request
>> triggered a wakeup of a guest and a wakeup of a process in the guest.
>> The end result was that KVM was 40% slower than z/VM (in terms of
>> transactions per second) while we had more idle time.
>> With smaller sched_migration_cost_ns (e.g. 100000) KVM was as fast
>> as z/VM.
>>
>> So to me it looks like the wakeup and reschedule to a free CPU
>> was just not fast enough. It might also depend on where I/O interrupts
>> land. Not sure yet.
>
> So there's unfortunately three places where migration_cost is used; one
> is in {nohz_,}newidle_balance(), see below. Someone tried removing it
> before and that ran into some weird regressions somewhere. But it is worth
> checking if this is the thing that matters for your workload.
>
> The other (main) use is in task_hot(), where we try and prevent
> migrating tasks that have recently run on a CPU. We already have an
> exception for SMT there, because SMT siblings share all cache levels per
> definition, so moving it to the sibling should have no ill effect.
>
> It could be that the current measure is fundamentally too high for your
> machine -- it is basically a random number that was determined many
> years ago on some random x86 machine, so it not reflecting reality today
> on an entirely different platform is no surprise.
>
> Back in the day, we had some magic code that measured cache latency per
> sched_domain and we used that, but that suffered from boot-to-boot
> variance and made things rather non-deterministic, but the idea of
> having per-domain cost certainly makes sense.
>
> Over the years people have tried bringing parts of that back, but it
> never really had convincing numbers justifying the complexity. So that's
> another thing you could be looking at I suppose.
>
> And then finally we have an almost random use in rebalance_domains(),
> and I can't remember the story behind that one :/
>
>
> Anyway, TL;DR, try and figure out which of these three is responsible
> for your performance woes. If it's the first, the below patch might be a
> good candidate. If it's task_hot(), we might need to re-eval per domain
> costs. If it's that other thing, I'll have to dig to figure out wth that
> was supposed to accomplish ;-)

Thanks for the insight. I will try to find out which of these areas make
a difference here.
[..]

2021-07-07 12:35:35

by Christian Borntraeger

[permalink] [raw]
Subject: [PATCH 0/1] Improve yield (was: sched: Move SCHED_DEBUG sysctl to debugfs)

I did a reduced testcase and, assuming that the reduced testcase has the
same issue, it turns out that a lower sched_migration_cost_ns does not
solve a specific problem; instead it seems to make a different problem
less problematic. The problem seemed to be worse on KVM hosts
(guest changes did also help, but much less so). In the end what did help
was to improve the behaviour of yield_to from KVM.
See the patch for more details. The problem seems to be real; my
solution might not be the best one - I am open to better ways to
code things.

2021-07-07 12:36:36

by Christian Borntraeger

[permalink] [raw]
Subject: [PATCH 1/1] sched/fair: improve yield_to vs fairness

After some debugging in situations where a smaller sched_latency_ns and
smaller sched_migration_cost settings helped for a KVM host, I was able to
come up with a reduced testcase.
This testcase has 2 vcpus working on a shared memory location and
waiting for mem % 2 == cpu number to then do an add on the shared
memory.
To start simple I pinned all vcpus to one host CPU. Without the
yield_to in KVM the testcase was horribly slow. This is expected as each
vcpu will spin a whole time slice. With the yield_to from KVM things are
much better, but I was still seeing yields being ignored.
In the end pick_next_entity decided to keep the current process running
due to fairness reasons. On this path we really know that there is no
point in continuing current. So let us make things a bit unfairer to
current.
This makes the reduced testcase noticeably faster. It improved a more
realistic test case (many guests on some host CPUs with overcommitment)
even more.
In the end this is similar to the old compat_sched_yield approach with
an important difference:
Instead of doing it for all yields we now only do it for yield_to
a place where we really know that current is waiting for the target.

What are alternative implementations for this patch
- do the same as the old compat_sched_yield:
current->vruntime = rightmost->vruntime+1
- provide a new tunable sched_ns_yield_penalty: how much vruntime to add
(could be per architecture)
- also fiddle with the vruntime of the target
e.g. subtract from the target what we add to the source

Signed-off-by: Christian Borntraeger <[email protected]>
---
kernel/sched/fair.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23663318fb81..4f661a9ed3ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7337,6 +7337,7 @@ static void yield_task_fair(struct rq *rq)
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
+ struct sched_entity *curr = &rq->curr->se;

/* throttled hierarchies are not runnable */
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
@@ -7347,6 +7348,16 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)

yield_task_fair(rq);

+ /*
+ * This path is special and only called from KVM. In contrast to yield,
+ * in yield_to we really know that current is spinning and we know
+ * (s390) or have good heuristics whom are we waiting for. There is
+ * absolutely no point in continuing the current task, even if this
+ * means to become unfairer. Let us give the current process some
+ * "fake" penalty.
+ */
+ curr->vruntime += sched_slice(cfs_rq_of(curr), curr);
+
return true;
}

--
2.31.1
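
For reference, a rough userspace analogue of the ping-pong pattern described in
the commit message above (two pthreads pinned to one CPU stand in for the two
vcpus, and sched_yield() loosely stands in for the guest's yield hint; this
only illustrates the dependency pattern, it is not the actual KVM testcase):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define ROUNDS 100000

static atomic_long mem;

static void *worker(void *arg)
{
        long id = (long)arg;
        cpu_set_t set;

        /* Pin both workers to CPU 0, like the pinned-vCPU setup above. */
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

        for (long i = 0; i < ROUNDS; i++) {
                /* Wait until it is "our" turn: mem % 2 == our id. */
                while (atomic_load(&mem) % 2 != id)
                        sched_yield();  /* an ineffective yield means spinning away the timeslice */
                atomic_fetch_add(&mem, 1);
        }
        return NULL;
}

int main(void)
{
        pthread_t t[2];

        for (long i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (long i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        printf("final: %ld\n", atomic_load(&mem));
        return 0;
}

With both workers stuck on one CPU, progress depends entirely on how
effectively the scheduler hands the CPU over on a yield, which is the
behaviour the patch above tries to improve for the directed yield_to case.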

2021-07-07 18:11:12

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

Hi Christian,

I love your patch! Perhaps something to improve:

[auto build test WARNING on tip/sched/core]
[also build test WARNING on v5.13 next-20210707]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/0day-ci/linux/commits/Christian-Borntraeger/sched-fair-improve-yield_to-vs-fairness/20210707-213440
base: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 031e3bd8986fffe31e1ddbf5264cccfe30c9abd7
config: i386-randconfig-s002-20210707 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.3-341-g8af24329-dirty
# https://github.com/0day-ci/linux/commit/75196412f9c36f51144f4c333b2b02d57bb0ebde
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Christian-Borntraeger/sched-fair-improve-yield_to-vs-fairness/20210707-213440
git checkout 75196412f9c36f51144f4c333b2b02d57bb0ebde
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>


sparse warnings: (new ones prefixed by >>)
kernel/sched/fair.c:830:34: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected struct sched_entity *se @@ got struct sched_entity [noderef] __rcu * @@
kernel/sched/fair.c:830:34: sparse: expected struct sched_entity *se
kernel/sched/fair.c:830:34: sparse: got struct sched_entity [noderef] __rcu *
kernel/sched/fair.c:5458:38: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct task_struct *curr @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/fair.c:5458:38: sparse: expected struct task_struct *curr
kernel/sched/fair.c:5458:38: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/fair.c:7048:38: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct task_struct *curr @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/fair.c:7048:38: sparse: expected struct task_struct *curr
kernel/sched/fair.c:7048:38: sparse: got struct task_struct [noderef] __rcu *curr
kernel/sched/fair.c:7332:38: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct task_struct *curr @@ got struct task_struct [noderef] __rcu *curr @@
kernel/sched/fair.c:7332:38: sparse: expected struct task_struct *curr
kernel/sched/fair.c:7332:38: sparse: got struct task_struct [noderef] __rcu *curr
>> kernel/sched/fair.c:7364:40: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct sched_entity *curr @@ got struct sched_entity [noderef] __rcu * @@
kernel/sched/fair.c:7364:40: sparse: expected struct sched_entity *curr
kernel/sched/fair.c:7364:40: sparse: got struct sched_entity [noderef] __rcu *
kernel/sched/fair.c:5387:35: sparse: sparse: marked inline, but without a definition
kernel/sched/fair.c: note: in included file:
kernel/sched/sched.h:2011:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2011:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2011:25: sparse: struct task_struct *
kernel/sched/sched.h:2169:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2169:9: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2169:9: sparse: struct task_struct *
kernel/sched/sched.h:2011:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2011:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2011:25: sparse: struct task_struct *
kernel/sched/sched.h:2011:25: sparse: sparse: incompatible types in comparison expression (different address spaces):
kernel/sched/sched.h:2011:25: sparse: struct task_struct [noderef] __rcu *
kernel/sched/sched.h:2011:25: sparse: struct task_struct *

vim +7364 kernel/sched/fair.c

7360
7361 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
7362 {
7363 struct sched_entity *se = &p->se;
> 7364 struct sched_entity *curr = &rq->curr->se;
7365
7366 /* throttled hierarchies are not runnable */
7367 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7368 return false;
7369
7370 /* Tell the scheduler that we'd really like pse to run next. */
7371 set_next_buddy(se);
7372
7373 yield_task_fair(rq);
7374
7375 /*
7376 * This path is special and only called from KVM. In contrast to yield,
7377 * in yield_to we really know that current is spinning and we know
7378 * (s390) or have good heuristics whom are we waiting for. There is
7379 * absolutely no point in continuing the current task, even if this
7380 * means to become unfairer. Let us give the current process some
7381 * "fake" penalty.
7382 */
7383 curr->vruntime += sched_slice(cfs_rq_of(curr), curr);
7384
7385 return true;
7386 }
7387

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2021-07-23 09:39:14

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Wed, Jul 07, 2021 at 02:34:02PM +0200, Christian Borntraeger wrote:
> After some debugging in situations where a smaller sched_latency_ns and
> smaller sched_migration_cost settings helped for KVM host, I was able to
> come up with a reduced testcase.
> This testcase has 2 vcpus working on a shared memory location and
> waiting for mem % 2 == cpu number to then do an add on the shared
> memory.
> To start simple I pinned all vcpus to one host CPU. Without the
> yield_to in KVM the testcase was horribly slow. This is expected as each
> vcpu will spin a whole time slice. With the yield_to from KVM things are
> much better, but I was still seeing yields being ignored.
> In the end pick_next_entity decided to keep the current process running
> due to fairness reasons. On this path we really know that there is no
> point in continuing current. So let us make things a bit unfairer to
> current.
> This makes the reduced testcase noticeable faster. It improved a more
> realistic test case (many guests on some host CPUs with overcomitment)
> even more.
> In the end this is similar to the old compat_sched_yield approach with
> an important difference:
> Instead of doing it for all yields we now only do it for yield_to
> a place where we really know that current it waiting for the target.
>
> What are alternative implementations for this patch
> - do the same as the old compat_sched_yield:
> current->vruntime = rightmost->vruntime+1
> - provide a new tunable sched_ns_yield_penalty: how much vruntime to add
> (could be per architecture)
> - also fiddle with the vruntime of the target
> e.g. subtract from the target what we add to the source
>
> Signed-off-by: Christian Borntraeger <[email protected]>

I think this one accidentally fell off everyone's radar, including mine.
At the time this patch was mailed I remember thinking that playing games with
vruntime might have other consequences. For example, what I believe is
the most relevant problem for KVM is that a task spinning to acquire a
lock may be waiting on a vcpu holding the lock that has been
descheduled. Without vcpu pinning, it's possible that the holder is on
the same runqueue as the lock acquirer, so the acquirer is wasting CPU.

In such a case, penalising the acquirer's vcpu may mean that it unfairly
loses CPU time simply because it's a lock acquirer. Vincent, what do you
think? Christian, would you mind testing this as an alternative with your
demonstration test case and, more importantly, the "realistic test case"?

--8<--
sched: Do not select highest priority task to run if it should be skipped

pick_next_entity will consider the "next buddy" over the highest priority
task if it's not unfair to do so (as determined by wakeup_preempt_entity).
The potential problem is that an in-kernel user of yield_to() such as
KVM may explicitly want to yield the current task because it is trying
to acquire a spinlock from a task that is currently descheduled and
potentially on the same runqueue. However, if it's more fair from the
scheduler's perspective to continue running the current task, it'll
continue to spin uselessly waiting on a descheduled task to run.

This patch selects the targeted task to run, even if that is unfair, when
the highest priority task is explicitly marked as "skip".

This was evaluated using a debugging patch to expose yield_to as a system
call. A demonstration program creates N threads and arranges them in a
ring, all updating a shared value in memory. Each thread spins until the
value matches its thread ID, then updates the value to wake the next
thread in the ring, and measures how many times it spins before it gets
its turn (a rough user-space sketch follows the diff below). Without the
patch, the number of spins is highly variable and unstable; with the
patch it is more consistent.

Signed-off-by: Mel Gorman <[email protected]>
---
kernel/sched/fair.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..ddc0212d520f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second;
}

- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ if (cfs_rq->next &&
+ (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
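
For reference, a minimal user-space sketch of the ring test described in the
changelog could look like the program below. It is only an approximation of
the workload shape: plain sched_yield() stands in for the debugging
yield_to() syscall actually used, and NTHREADS/NROUNDS are arbitrary values,
not the ones from the evaluation. Build with "gcc -O2 -pthread ring.c".

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NTHREADS 8
#define NROUNDS  10000

static volatile int token;              /* shared value passed around the ring */
static unsigned long spins[NTHREADS];   /* how often each thread had to spin */

static void *ring_thread(void *arg)
{
        int id = (int)(long)arg;
        int i;

        for (i = 0; i < NROUNDS; i++) {
                /* Spin until the shared value matches our thread ID. */
                while (token != id) {
                        spins[id]++;
                        sched_yield();  /* stand-in for yield_to(next) */
                }
                /* Update the value, effectively waking the next thread. */
                token = (id + 1) % NTHREADS;
        }
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];
        int i;

        for (i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, ring_thread, (void *)(long)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        for (i = 0; i < NTHREADS; i++)
                printf("thread %d spun %lu times\n", i, spins[i]);
        return 0;
}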

--
Mel Gorman
SUSE Labs

2021-07-23 12:39:00

by Christian Borntraeger

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness



On 23.07.21 11:35, Mel Gorman wrote:
> On Wed, Jul 07, 2021 at 02:34:02PM +0200, Christian Borntraeger wrote:
>> After some debugging in situations where a smaller sched_latency_ns and
>> smaller sched_migration_cost settings helped for KVM host, I was able to
>> come up with a reduced testcase.
>> This testcase has 2 vcpus working on a shared memory location and
>> waiting for mem % 2 == cpu number to then do an add on the shared
>> memory.
>> To start simple I pinned all vcpus to one host CPU. Without the
>> yield_to in KVM the testcase was horribly slow. This is expected as each
>> vcpu will spin a whole time slice. With the yield_to from KVM things are
>> much better, but I was still seeing yields being ignored.
>> In the end pick_next_entity decided to keep the current process running
>> due to fairness reasons. On this path we really know that there is no
>> point in continuing current. So let us make things a bit unfairer to
>> current.
>> This makes the reduced testcase noticeable faster. It improved a more
>> realistic test case (many guests on some host CPUs with overcomitment)
>> even more.
>> In the end this is similar to the old compat_sched_yield approach with
>> an important difference:
>> Instead of doing it for all yields we now only do it for yield_to
>> a place where we really know that current it waiting for the target.
>>
>> What are alternative implementations for this patch
>> - do the same as the old compat_sched_yield:
>> current->vruntime = rightmost->vruntime+1
>> - provide a new tunable sched_ns_yield_penalty: how much vruntime to add
>> (could be per architecture)
>> - also fiddle with the vruntime of the target
>> e.g. subtract from the target what we add to the source
>>
>> Signed-off-by: Christian Borntraeger <[email protected]>
>
> I think this one accidentally fell off everyones radar including mine.
> At the time this patch was mailed I remembered thinking that playing games with
> vruntime might have other consequences. For example, what I believe is
> the most relevant problem for KVM is that a task spinning to acquire a
> lock may be waiting on a vcpu holding the lock that has been
> descheduled. Without vcpu pinning, it's possible that the holder is on
> the same runqueue as the lock acquirer so the acquirer is wasting CPU.
>
> In such a case, changing the acquirers vcpu may mean that it unfairly
> loses CPU time simply because it's a lock acquirer. Vincent, what do you
> think? Christian, would you mind testing this as an alternative with your
> demonstration test case and more importantly the "realistic test case"?
>
> --8<--
> sched: Do not select highest priority task to run if it should be skipped
>
> pick_next_entity will consider the "next buddy" over the highest priority
> task if it's not unfair to do so (as determined by wakekup_preempt_entity).
> The potential problem is that an in-kernel user of yield_to() such as
> KVM may explicitly want to yield the current task because it is trying
> to acquire a spinlock from a task that is currently descheduled and
> potentially running on the same runqueue. However, if it's more fair from
> the scheduler perspective to continue running the current task, it'll continue
> to spin uselessly waiting on a descheduled task to run.
>
> This patch will select the targeted task to run even if it's unfair if the
> highest priority task is explicitly marked as "skip".
>
> This was evaluated using a debugging patch to expose yield_to as a system
> call. A demonstration program creates N number of threads and arranges
> them in a ring that are updating a shared value in memory. Each thread
> spins until the value matches the thread ID. It then updates the value
> and wakes the next thread in the ring. It measures how many times it spins
> before it gets its turn. Without the patch, the number of spins is highly
> variable and unstable but with the patch it's more consistent.
>
> Signed-off-by: Mel Gorman <[email protected]>
> ---
> kernel/sched/fair.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 44c452072a1b..ddc0212d520f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> se = second;
> }
>
> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> + if (cfs_rq->next &&
> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
> /*
> * Someone really wants this to run. If it's not unfair, run it.
> */
>

I do see a reduction in ignored yields, but from a performance perspective
this patch does not provide a benefit for my testcases, while the simple
curr->vruntime += sysctl_sched_min_granularity;
does.
I still think that your approach is probably the cleaner one; any chance to
improve it somehow?

FWIW, I recently realized that my patch also does not solve the problem for KVM with libvirt.
My testcase only improves with qemu started by hand. As soon as the cpu cgroup controller is
active, my patch no longer helps either.

If you have any patch to test, I can test it. Meanwhile I will also do some more testing.

2021-07-23 16:24:45

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Fri, Jul 23, 2021 at 02:36:21PM +0200, Christian Borntraeger wrote:
> > sched: Do not select highest priority task to run if it should be skipped
> >
> > <SNIP>
> >
> > index 44c452072a1b..ddc0212d520f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> > se = second;
> > }
> > - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> > + if (cfs_rq->next &&
> > + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
> > /*
> > * Someone really wants this to run. If it's not unfair, run it.
> > */
> >
>
> I do see a reduction in ignored yields, but from a performance aspect for my
> testcases this patch does not provide a benefit, while the the simple
> curr->vruntime += sysctl_sched_min_granularity;
> does.

I'm still not a fan because vruntime gets distorted. From the docs:

Small detail: on "ideal" hardware, at any time all tasks would have the same
p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
would ever get "out of balance" from the "ideal" share of CPU time

If yield_to impacts this "ideal share" then it could have other
consequences.

I think your patch may be performing better in your test case because every
"wrong" task selected that is not the yield_to target gets penalised and
so the yield_to target gets pushed up the list.

> I still think that your approach is probably the cleaner one, any chance to improve this
> somehow?
>

Potentially. The patch was a bit off: while it noticed that skip was not
being obeyed, the fix was clumsy and isolated. The current flow (roughly
sketched below the list) is

1. pick se == left as the candidate
2. try to pick a different se if the "ideal" candidate is a skip candidate
3. Ignore the se update if next or last are set
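
(For readers following along: the three steps correspond roughly to the
condensed sketch below. This is a paraphrase, not the verbatim kernel
source, and pick_second() is a made-up name standing in for the inline
"second" candidate selection in the real code.)

        se = left;                                      /* step 1 */

        if (cfs_rq->skip && cfs_rq->skip == se) {       /* step 2 */
                struct sched_entity *second = pick_second(cfs_rq, se, curr);

                if (second && wakeup_preempt_entity(second, left) < 1)
                        se = second;
        }

        /*
         * Step 3: the buddy hints are compared against "left", not against
         * the possibly updated "se", so the choice made in step 2 can be
         * overridden here.
         */
        if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
                se = cfs_rq->next;
        else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
                se = cfs_rq->last;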

Step 3 looks off because it ignores skip if next or last buddies are set
and I don't think that was intended. Can you try this?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..d56f7772a607 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second;
}

- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
/*
* Someone really wants this to run. If it's not unfair, run it.
*/
se = cfs_rq->next;
- } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
/*
* Prefer last buddy, try to return the CPU to a preempted task.
*/

2021-07-26 18:44:31

by Christian Borntraeger

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness



On 23.07.21 18:21, Mel Gorman wrote:
> On Fri, Jul 23, 2021 at 02:36:21PM +0200, Christian Borntraeger wrote:
>>> sched: Do not select highest priority task to run if it should be skipped
>>>
>>> <SNIP>
>>>
>>> index 44c452072a1b..ddc0212d520f 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>> se = second;
>>> }
>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>>> + if (cfs_rq->next &&
>>> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
>>> /*
>>> * Someone really wants this to run. If it's not unfair, run it.
>>> */
>>>
>>
>> I do see a reduction in ignored yields, but from a performance aspect for my
>> testcases this patch does not provide a benefit, while the the simple
>> curr->vruntime += sysctl_sched_min_granularity;
>> does.
>
> I'm still not a fan because vruntime gets distorted. From the docs
>
> Small detail: on "ideal" hardware, at any time all tasks would have the same
> p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
> would ever get "out of balance" from the "ideal" share of CPU time
>
> If yield_to impacts this "ideal share" then it could have other
> consequences.
>
> I think your patch may be performing better in your test case because every
> "wrong" task selected that is not the yield_to target gets penalised and
> so the yield_to target gets pushed up the list.
>
>> I still think that your approach is probably the cleaner one, any chance to improve this
>> somehow?
>>
>
> Potentially. The patch was a bit off because while it noticed that skip
> was not being obeyed, the fix was clumsy and isolated. The current flow is
>
> 1. pick se == left as the candidate
> 2. try pick a different se if the "ideal" candidate is a skip candidate
> 3. Ignore the se update if next or last are set
>
> Step 3 looks off because it ignores skip if next or last buddies are set
> and I don't think that was intended. Can you try this?
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 44c452072a1b..d56f7772a607 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> se = second;
> }
>
> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
> /*
> * Someone really wants this to run. If it's not unfair, run it.
> */
> se = cfs_rq->next;
> - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
> + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
> /*
> * Prefer last buddy, try to return the CPU to a preempted task.
> */
>

This one alone does not seem to make a difference, neither in ignored
yields nor in performance.

Your first patch does really help in terms of ignored yields when
all threads are pinned to one host CPU. After that we see no ignored
yields, it seems. But it does not affect the performance of my testcase.
I did some more experiments and removed the wakeup_preempt_entity checks in
pick_next_entity, assuming that this would result in the source always being
stopped and the target always being picked. But still, no performance
difference.
As soon as I play with vruntime I do see a difference (but only without the
cpu cgroup controller). I will try to better understand the scheduler logic
and do some more testing. If you have anything that I should test, let me
know.

Christian

2021-07-26 19:34:21

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Mon, Jul 26, 2021 at 08:41:15PM +0200, Christian Borntraeger wrote:
> > Potentially. The patch was a bit off because while it noticed that skip
> > was not being obeyed, the fix was clumsy and isolated. The current flow is
> >
> > 1. pick se == left as the candidate
> > 2. try pick a different se if the "ideal" candidate is a skip candidate
> > 3. Ignore the se update if next or last are set
> >
> > Step 3 looks off because it ignores skip if next or last buddies are set
> > and I don't think that was intended. Can you try this?
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 44c452072a1b..d56f7772a607 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> > se = second;
> > }
> > - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> > + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
> > /*
> > * Someone really wants this to run. If it's not unfair, run it.
> > */
> > se = cfs_rq->next;
> > - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
> > + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
> > /*
> > * Prefer last buddy, try to return the CPU to a preempted task.
> > */
> >
>
> This one alone does not seem to make a difference. Neither in ignored yield, nor
> in performance.
>
> Your first patch does really help in terms of ignored yields when
> all threads are pinned to one host CPU.

Ok, that tells us something. It implies, but does not prove, that the
block above that handles skip is failing either the entity_before()
test or the wakeup_preempt_entity() test. To what degree that should be
relaxed when cfs_rq->next is !NULL is harder to determine.

> After that we do have no ignored yield
> it seems. But it does not affect the performance of my testcase.

Ok, this is the first patch. The second patch is not improving ignored
yields at all, so the above paragraph still applies. It would be nice
if you could instrument with trace_printk, when cfs_rq->next is valid,
whether it's the entity_before() check or wakeup_preempt_entity() that
is preventing the skip. Would that be possible?
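
Illustrative only and untested: one way such instrumentation could look
inside the skip-handling block of pick_next_entity() (names as in the
diffs above); the exact messages and placement are up to whoever runs the
trace.

        if (se == curr) {
                second = __pick_first_entity(cfs_rq);
        } else {
                second = __pick_next_entity(se);
                if (!second || (curr && entity_before(curr, second))) {
                        if (cfs_rq->next)
                                trace_printk("skip: entity_before/empty kept curr\n");
                        second = curr;
                }
        }

        if (second && wakeup_preempt_entity(second, left) < 1)
                se = second;
        else if (cfs_rq->next)
                trace_printk("skip: wakeup_preempt_entity kept left\n");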

I still think the second patch is right independent of whether it helps
your test case, because it makes no sense to me at all that the task after
the skip candidate is ignored if there is a next or last buddy.

> I did some more experiments and I removed the wakeup_preempt_entity checks in
> pick_next_entity - assuming that this will result in source always being stopped
> and target always being picked. But still, no performance difference.
> As soon as I play with vruntime I do see a difference (but only without the cpu cgroup
> controller). I will try to better understand the scheduler logic and do some more
> testing. If you have anything that I should test, let me know.
>

The fact that vruntime tricks only make a difference when cgroups are
involved is interesting. Can you describe roughly how the cgroup
is configured? Similarly, does your config have CONFIG_SCHED_AUTOGROUP
or CONFIG_FAIR_GROUP_SCHED set? I assume FAIR_GROUP_SCHED must be, and
I wonder if the impact of your patch is dropping groups of tasks in
priority as opposed to individual tasks. I'm not that familiar with how
groups are prioritised, unfortunately.

I'm still hesitant to consider the vruntime hammer in case it causes
fairness problems when vruntime is no longer reflecting time spent on
the CPU.

--
Mel Gorman
SUSE Labs

2021-07-27 07:02:05

by Christian Borntraeger

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness



On 26.07.21 21:32, Mel Gorman wrote:
> On Mon, Jul 26, 2021 at 08:41:15PM +0200, Christian Borntraeger wrote:
>>> Potentially. The patch was a bit off because while it noticed that skip
>>> was not being obeyed, the fix was clumsy and isolated. The current flow is
>>>
>>> 1. pick se == left as the candidate
>>> 2. try pick a different se if the "ideal" candidate is a skip candidate
>>> 3. Ignore the se update if next or last are set
>>>
>>> Step 3 looks off because it ignores skip if next or last buddies are set
>>> and I don't think that was intended. Can you try this?
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 44c452072a1b..d56f7772a607 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>> se = second;
>>> }
>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>>> + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
>>> /*
>>> * Someone really wants this to run. If it's not unfair, run it.
>>> */
>>> se = cfs_rq->next;
>>> - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
>>> + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
>>> /*
>>> * Prefer last buddy, try to return the CPU to a preempted task.
>>> */
>>>
>>
>> This one alone does not seem to make a difference. Neither in ignored yield, nor
>> in performance.
>>
>> Your first patch does really help in terms of ignored yields when
>> all threads are pinned to one host CPU.
>
> Ok, that tells us something. It implies, but does not prove, that the
> block above that handles skip is failing either the entity_before()
> test or the wakeup_preempt_entity() test. To what degree that should be
> relaxed when cfs_rq->next is !NULL is harder to determine.
>
>> After that we do have no ignored yield
>> it seems. But it does not affect the performance of my testcase.
>
> Ok, this is the first patch. The second patch is not improving ignored
> yields at all so the above paragraph still applies. It would be nice
> if you could instrument with trace_printk when cfs->rq_next is valid
> whether it's the entity_before() check that is preventing the skip or
> wakeup_preempt_entity. Would that be possible?

I will try that.
>
> I still think the second patch is right independent of it helping your
> test case because it makes no sense to me at all that the task after the
> skip candidate is ignored if there is a next or last buddy.

I agree, this patch makes sense to me as a bug fix.
And I think the first patch also makes sense on its own.
>
>> I did some more experiments and I removed the wakeup_preempt_entity checks in
>> pick_next_entity - assuming that this will result in source always being stopped
>> and target always being picked. But still, no performance difference.
>> As soon as I play with vruntime I do see a difference (but only without the cpu cgroup
>> controller). I will try to better understand the scheduler logic and do some more
>> testing. If you have anything that I should test, let me know.
>>
>
> The fact that vruntime tricks only makes a difference when cgroups are
> involved is interesting. Can you describe roughly what how the cgroup
> is configured?

It's the other way around. My vruntime patch ONLY helps WITHOUT the cpu cgroup controller.
In other words, this example on a 16-CPU host (resulting in 4x overcommitment)
time ( for ((d=0; d<16; d++)) ; do cgexec -g cpu:test$d qemu-system-s390x -enable-kvm -kernel /root/REPOS/kvm-unit-tests/s390x/diag9c.elf -smp 4 -nographic -nodefaults -device sclpconsole,chardev=c2 -chardev file,path=/tmp/log$d.log,id=c2 & done; wait)
does NOT benefit from the vruntime patch, but when I remove the "cgexec -g cpu:test$d" it does:
time ( for ((d=0; d<16; d++)) ; do qemu-system-s390x -enable-kvm -kernel /root/REPOS/kvm-unit-tests/s390x/diag9c.elf -smp 4 -nographic -nodefaults -device sclpconsole,chardev=c2 -chardev file,path=/tmp/log$d.log,id=c2 & done; wait)


> Similarly, does your config have CONFIG_SCHED_AUTOGROUP
> or CONFIG_FAIR_GROUP_SCHED set? I assume FAIR_GROUP_SCHED must be and

Yes, both are set.
> I wonder if the impact of your patch is dropping groups of tasks in
> priority as opposed to individual tasks. I'm not that familiar with how
> groups are handled in terms of how they are prioritised unfortunately.
>
> I'm still hesitant to consider the vruntime hammer in case it causes
> fairness problems when vruntime is no longer reflecting time spent on
> the CPU.

I understand your concerns. What about subtracting the same amount of
vruntime from the target as we add to the yielder? Would that result in
quicker rebalancing while still keeping everything in order?
The reason I am asking is that we initially realized that setting some
tunables lower, e.g.
kernel.sched_latency_ns = 2000000
kernel.sched_migration_cost_ns = 100000
makes things faster in a similar fashion. And that also works with cgroups.
But ideally we find a solution without changing tunables.

2021-07-27 13:39:42

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Fri, Jul 23, 2021 at 05:21:37PM +0100, Mel Gorman wrote:

> I'm still not a fan because vruntime gets distorted. From the docs
>
> Small detail: on "ideal" hardware, at any time all tasks would have the same
> p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
> would ever get "out of balance" from the "ideal" share of CPU time
>
> If yield_to impacts this "ideal share" then it could have other
> consequences.

Note that we already violate this ideal both subtly and explicitly.

For an example of the latter consider pretty much the entirety of
place_entity() with GENTLE_FAIR_SLEEPERS being the most egregious
example.

That said, adding to vruntime will only penalize the task itself, while
subtracting from vruntime will penalize everyone else. In that sense,
adding to vruntime is a safe option.

I've not fully considered the case at hand; just wanted to give some
context.

2021-07-27 13:40:31

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Fri, Jul 23, 2021 at 10:35:23AM +0100, Mel Gorman wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 44c452072a1b..ddc0212d520f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> se = second;
> }
>
> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> + if (cfs_rq->next &&
> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
> /*
> * Someone really wants this to run. If it's not unfair, run it.
> */

With a little more context this function reads like:

se = left;

if (cfs_rq->skip && cfs_rq->skip == se) {
...
+ if (cfs_rq->next && (cfs_rq->skip == left || ...))

If '...' doesn't change @left (afaict it doesn't), then your change (+)
is equivalent to '&& true', or am I reading things wrong?

2021-07-27 14:33:36

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Tue, Jul 27, 2021 at 03:33:00PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 23, 2021 at 10:35:23AM +0100, Mel Gorman wrote:
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 44c452072a1b..ddc0212d520f 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> > se = second;
> > }
> >
> > - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> > + if (cfs_rq->next &&
> > + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
> > /*
> > * Someone really wants this to run. If it's not unfair, run it.
> > */
>
> With a little more context this function reads like:
>
> se = left;
>
> if (cfs_rq->skip && cfs_rq->skip == se) {
> ...
> + if (cfs_rq->next && (cfs_rq->skip == left || ...))
>
> If '...' doesn't change @left (afaict it doesn't), then your change (+)
> is equivalent to '&& true', or am I reading things wrong?

You're not reading it wrong, although the patch is clumsy and may introduce
unfairness that gets incrementally worse if there were repeated yields to
the same task. A second patch was posted that does

- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {

i.e. if the skip hint picks a second alternative, then the next or last
buddies should be compared against that second alternative and not against
"left". It doesn't help indicate whether the skip hint is being ignored
because "second" failed the entity_before() check or the
wakeup_preempt_entity() check; I'm waiting on a trace to see which check
dominates.

That said, I'm still undecided on how to approach this. None of the
proposed patches helps on its own, but the options are

1. Strictly obey the next buddy if the skip hint is the same se as left
(first patch which I'm not very happy with even if it helped the
test case)

2. My second patch which compares next/last with "second" if the skip
hint skips "left". This may be a sensible starting point no matter
what

3. Relaxing how "second" is selected if next or last buddies are set

4. vruntime tricks, even if they punish fairness for the task yielding
the CPU. The advantage of this approach is that if there are multiple
tasks ahead of the task being yielded to, the yield_to task will become
"left" very quickly regardless of any buddy-related hints.

I don't know what "3" would look like yet, it might be very fragile but
lets see what the tracing says. Otherwise, testing 2+4 might be worthwhile
to see if the combination helps Christian's test case when the cpu cgroup
is involved.

--
Mel Gorman
SUSE Labs

2021-07-27 18:58:50

by Benjamin Segall

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

Christian Borntraeger <[email protected]> writes:

> On 23.07.21 18:21, Mel Gorman wrote:
>> On Fri, Jul 23, 2021 at 02:36:21PM +0200, Christian Borntraeger wrote:
>>>> sched: Do not select highest priority task to run if it should be skipped
>>>>
>>>> <SNIP>
>>>>
>>>> index 44c452072a1b..ddc0212d520f 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>>> se = second;
>>>> }
>>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>>>> + if (cfs_rq->next &&
>>>> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
>>>> /*
>>>> * Someone really wants this to run. If it's not unfair, run it.
>>>> */
>>>>
>>>
>>> I do see a reduction in ignored yields, but from a performance aspect for my
>>> testcases this patch does not provide a benefit, while the the simple
>>> curr->vruntime += sysctl_sched_min_granularity;
>>> does.
>> I'm still not a fan because vruntime gets distorted. From the docs
>> Small detail: on "ideal" hardware, at any time all tasks would have the
>> same
>> p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
>> would ever get "out of balance" from the "ideal" share of CPU time
>> If yield_to impacts this "ideal share" then it could have other
>> consequences.
>> I think your patch may be performing better in your test case because every
>> "wrong" task selected that is not the yield_to target gets penalised and
>> so the yield_to target gets pushed up the list.
>>
>>> I still think that your approach is probably the cleaner one, any chance to improve this
>>> somehow?
>>>
>> Potentially. The patch was a bit off because while it noticed that skip
>> was not being obeyed, the fix was clumsy and isolated. The current flow is
>> 1. pick se == left as the candidate
>> 2. try pick a different se if the "ideal" candidate is a skip candidate
>> 3. Ignore the se update if next or last are set
>> Step 3 looks off because it ignores skip if next or last buddies are set
>> and I don't think that was intended. Can you try this?
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 44c452072a1b..d56f7772a607 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>> se = second;
>> }
>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>> + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
>> /*
>> * Someone really wants this to run. If it's not unfair, run it.
>> */
>> se = cfs_rq->next;
>> - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
>> + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
>> /*
>> * Prefer last buddy, try to return the CPU to a preempted task.
>> */
>>
>
> This one alone does not seem to make a difference. Neither in ignored yield, nor
> in performance.
>
> Your first patch does really help in terms of ignored yields when
> all threads are pinned to one host CPU. After that we do have no ignored yield
> it seems. But it does not affect the performance of my testcase.
> I did some more experiments and I removed the wakeup_preempt_entity checks in
> pick_next_entity - assuming that this will result in source always being stopped
> and target always being picked. But still, no performance difference.
> As soon as I play with vruntime I do see a difference (but only without the cpu cgroup
> controller). I will try to better understand the scheduler logic and do some more
> testing. If you have anything that I should test, let me know.
>
> Christian

If both yielder and target are in the same cpu cgroup or the cpu cgroup
is disabled (i.e., if cfs_rq_of(p->se) matches), you could try

if (p->se.vruntime > rq->curr->se.vruntime)
        swap(p->se.vruntime, rq->curr->se.vruntime);

as well as the existing buddy flags, as an entirely fair vruntime boost
to the target.
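
A rough sketch of where that swap could sit, based on the
yield_to_task_fair() quoted earlier in the thread. This is only an
illustration of the idea, not a tested patch: in particular, p->se is
still enqueued in the rbtree at this point, so a real implementation
would also have to keep the tree ordering consistent.

static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
        struct sched_entity *se = &p->se;
        struct sched_entity *curr = &rq->curr->se;

        /* throttled hierarchies are not runnable */
        if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
                return false;

        /* Tell the scheduler that we'd really like pse to run next. */
        set_next_buddy(se);

        yield_task_fair(rq);

        /*
         * Entirely fair boost: trade vruntime between yielder and target
         * rather than inventing any, but only when both entities sit on
         * the same cfs_rq.  NOTE: se is still in the rbtree, so changing
         * its key in place is not enough; the required dequeue/requeue
         * handling is deliberately left out of this sketch.
         */
        if (cfs_rq_of(se) == cfs_rq_of(curr) && se->vruntime > curr->vruntime)
                swap(se->vruntime, curr->vruntime);

        return true;
}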

For when they aren't direct siblings, you /could/ use find_matching_se,
but it's much less clear that's desirable, since it would yield vruntime
for the entire hierarchy to the target's hierarchy.

2021-07-28 16:28:20

by Christian Borntraeger

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness



On 27.07.21 20:57, Benjamin Segall wrote:
> Christian Borntraeger <[email protected]> writes:
>
>> On 23.07.21 18:21, Mel Gorman wrote:
>>> On Fri, Jul 23, 2021 at 02:36:21PM +0200, Christian Borntraeger wrote:
>>>>> sched: Do not select highest priority task to run if it should be skipped
>>>>>
>>>>> <SNIP>
>>>>>
>>>>> index 44c452072a1b..ddc0212d520f 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>>>> se = second;
>>>>> }
>>>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>>>>> + if (cfs_rq->next &&
>>>>> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
>>>>> /*
>>>>> * Someone really wants this to run. If it's not unfair, run it.
>>>>> */
>>>>>
>>>>
>>>> I do see a reduction in ignored yields, but from a performance aspect for my
>>>> testcases this patch does not provide a benefit, while the the simple
>>>> curr->vruntime += sysctl_sched_min_granularity;
>>>> does.
>>> I'm still not a fan because vruntime gets distorted. From the docs
>>> Small detail: on "ideal" hardware, at any time all tasks would have the
>>> same
>>> p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
>>> would ever get "out of balance" from the "ideal" share of CPU time
>>> If yield_to impacts this "ideal share" then it could have other
>>> consequences.
>>> I think your patch may be performing better in your test case because every
>>> "wrong" task selected that is not the yield_to target gets penalised and
>>> so the yield_to target gets pushed up the list.
>>>
>>>> I still think that your approach is probably the cleaner one, any chance to improve this
>>>> somehow?
>>>>
>>> Potentially. The patch was a bit off because while it noticed that skip
>>> was not being obeyed, the fix was clumsy and isolated. The current flow is
>>> 1. pick se == left as the candidate
>>> 2. try pick a different se if the "ideal" candidate is a skip candidate
>>> 3. Ignore the se update if next or last are set
>>> Step 3 looks off because it ignores skip if next or last buddies are set
>>> and I don't think that was intended. Can you try this?
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 44c452072a1b..d56f7772a607 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
>>> se = second;
>>> }
>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
>>> + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
>>> /*
>>> * Someone really wants this to run. If it's not unfair, run it.
>>> */
>>> se = cfs_rq->next;
>>> - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
>>> + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
>>> /*
>>> * Prefer last buddy, try to return the CPU to a preempted task.
>>> */
>>>
>>
>> This one alone does not seem to make a difference. Neither in ignored yield, nor
>> in performance.
>>
>> Your first patch does really help in terms of ignored yields when
>> all threads are pinned to one host CPU. After that we do have no ignored yield
>> it seems. But it does not affect the performance of my testcase.
>> I did some more experiments and I removed the wakeup_preempt_entity checks in
>> pick_next_entity - assuming that this will result in source always being stopped
>> and target always being picked. But still, no performance difference.
>> As soon as I play with vruntime I do see a difference (but only without the cpu cgroup
>> controller). I will try to better understand the scheduler logic and do some more
>> testing. If you have anything that I should test, let me know.
>>
>> Christian
>
> If both yielder and target are in the same cpu cgroup or the cpu cgroup
> is disabled (ie, if cfs_rq_of(p->se) matches), you could try
>
> if (p->se.vruntime > rq->curr->se.vruntime)
> swap(p->se.vruntime, rq->curr->se.vruntime)

I tried that and it does not show the performance benefit. I then played
with my patch (using different values to add) and the benefit seems to
depend on the size being added; maybe when swapping it was just not large
enough.

I have to say it is all a bit unclear what improves performance and why.
It just seems that the cpu cgroup controller accounts for a fair share of
the performance problems.

I also asked the performance people to run some measurements, and the
numbers for a transactional workload under KVM were:
baseline: 11813
with much smaller sched_latency_ns and sched_migration_cost_ns: 16419
with cpu controller disabled: 15962
with cpu controller disabled + my patch: 16782

I will be travelling the next 2 weeks, so I can continue with more debugging
after that.

Thanks for all the ideas and help so far.

Christian

> as well as the existing buddy flags, as an entirely fair vruntime boost
> to the target.
>
> For when they aren't direct siblings, you /could/ use find_matching_se,
> but it's much less clear that's desirable, since it would yield vruntime
> for the entire hierarchy to the target's hierarchy.
>

2021-08-10 11:48:07

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/1] sched/fair: improve yield_to vs fairness

On Wed, 28 Jul 2021 at 18:24, Christian Borntraeger
<[email protected]> wrote:
>
>
>
> On 27.07.21 20:57, Benjamin Segall wrote:
> > Christian Borntraeger <[email protected]> writes:
> >
> >> On 23.07.21 18:21, Mel Gorman wrote:
> >>> On Fri, Jul 23, 2021 at 02:36:21PM +0200, Christian Borntraeger wrote:
> >>>>> sched: Do not select highest priority task to run if it should be skipped
> >>>>>
> >>>>> <SNIP>
> >>>>>
> >>>>> index 44c452072a1b..ddc0212d520f 100644
> >>>>> --- a/kernel/sched/fair.c
> >>>>> +++ b/kernel/sched/fair.c
> >>>>> @@ -4522,7 +4522,8 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> >>>>> se = second;
> >>>>> }
> >>>>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> >>>>> + if (cfs_rq->next &&
> >>>>> + (cfs_rq->skip == left || wakeup_preempt_entity(cfs_rq->next, left) < 1)) {
> >>>>> /*
> >>>>> * Someone really wants this to run. If it's not unfair, run it.
> >>>>> */
> >>>>>
> >>>>
> >>>> I do see a reduction in ignored yields, but from a performance aspect for my
> >>>> testcases this patch does not provide a benefit, while the the simple
> >>>> curr->vruntime += sysctl_sched_min_granularity;
> >>>> does.
> >>> I'm still not a fan because vruntime gets distorted. From the docs
> >>> Small detail: on "ideal" hardware, at any time all tasks would have the
> >>> same
> >>> p->se.vruntime value --- i.e., tasks would execute simultaneously and no task
> >>> would ever get "out of balance" from the "ideal" share of CPU time
> >>> If yield_to impacts this "ideal share" then it could have other
> >>> consequences.
> >>> I think your patch may be performing better in your test case because every
> >>> "wrong" task selected that is not the yield_to target gets penalised and
> >>> so the yield_to target gets pushed up the list.
> >>>
> >>>> I still think that your approach is probably the cleaner one, any chance to improve this
> >>>> somehow?
> >>>>
> >>> Potentially. The patch was a bit off because while it noticed that skip
> >>> was not being obeyed, the fix was clumsy and isolated. The current flow is
> >>> 1. pick se == left as the candidate
> >>> 2. try pick a different se if the "ideal" candidate is a skip candidate
> >>> 3. Ignore the se update if next or last are set
> >>> Step 3 looks off because it ignores skip if next or last buddies are set
> >>> and I don't think that was intended. Can you try this?
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index 44c452072a1b..d56f7772a607 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -4522,12 +4522,12 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
> >>> se = second;
> >>> }
> >>> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
> >>> + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) {
> >>> /*
> >>> * Someone really wants this to run. If it's not unfair, run it.
> >>> */
> >>> se = cfs_rq->next;
> >>> - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
> >>> + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) {
> >>> /*
> >>> * Prefer last buddy, try to return the CPU to a preempted task.
> >>> */
> >>>
> >>
> >> This one alone does not seem to make a difference. Neither in ignored yield, nor
> >> in performance.
> >>
> >> Your first patch does really help in terms of ignored yields when
> >> all threads are pinned to one host CPU. After that we do have no ignored yield
> >> it seems. But it does not affect the performance of my testcase.
> >> I did some more experiments and I removed the wakeup_preempt_entity checks in
> >> pick_next_entity - assuming that this will result in source always being stopped
> >> and target always being picked. But still, no performance difference.
> >> As soon as I play with vruntime I do see a difference (but only without the cpu cgroup
> >> controller). I will try to better understand the scheduler logic and do some more
> >> testing. If you have anything that I should test, let me know.
> >>
> >> Christian
> >
> > If both yielder and target are in the same cpu cgroup or the cpu cgroup
> > is disabled (ie, if cfs_rq_of(p->se) matches), you could try
> >
> > if (p->se.vruntime > rq->curr->se.vruntime)
> > swap(p->se.vruntime, rq->curr->se.vruntime)
>
> I tried that and it does not show the performance benefit. I then played with my
> patch (uses different values to add) and the benefit seems to be depending on the
> size that is being added, maybe when swapping it was just not large enough.
>
> I have to say that this is all a bit unclear what and why performance improves.
> It just seems that the cpu cgroup controller has a fair share of the performance
> problems.
>
> I also asked the performance people to run some measurements and the numbers of
> some transactional workload under KVM was
> baseline: 11813
> with much smaller sched_latency_ns and sched_migration_cost_ns: 16419

Have you also tried increasing sched_wakeup_granularity, which is used
to decide whether we can preempt curr?

> with cpu controller disabled: 15962
> with cpu controller disabled + my patch: 16782

Your patch modifies the vruntime of the task, but the cgroup's sched_entity
stays unchanged. The scheduler compares the vruntime of the group's
sched_entity before it ever reaches your task. That's probably why your
patch doesn't help with cgroups, and it will be difficult to extend to
cgroups, because the yield of one task should not impact the other
entities in the group.
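
For illustration only, the direction Ben hinted at earlier with
find_matching_se() would be a fragment along these lines; whether
adjusting group-level entities is acceptable at all is exactly the open
question raised above.

        struct sched_entity *se = &p->se;
        struct sched_entity *curr_se = &rq->curr->se;

        /* Walk both hierarchies up to the level where they share a cfs_rq,
         * so that any vruntime adjustment targets the entities the pick
         * logic actually compares.
         */
        find_matching_se(&se, &curr_se);
        /* se and curr_se are now on the same cfs_rq; adjust vruntime here */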

>
> I will be travelling the next 2 weeks, so I can continue with more debugging
> after that.
>
> Thanks for all the ideas and help so far.
>
> Christian
>
> > as well as the existing buddy flags, as an entirely fair vruntime boost
> > to the target.
> >
> > For when they aren't direct siblings, you /could/ use find_matching_se,
> > but it's much less clear that's desirable, since it would yield vruntime
> > for the entire hierarchy to the target's hierarchy.
> >