2009-11-30 11:16:50

by Christian Ehrhardt

Subject: [PATCH 0/3] fix rescaling of scheduler tunables v2

*updates in v2*
After some more testing, v2 has a fix and a few minor changes:
- changes to the scaling type now update the normalized values according
  to the new factor
- minor fixes to satisfy checkpatch.pl
-> ready for discussion and/or acceptance now :-)

This patch series is based on ideas from Peter Zijlstra and me in the
discussion of how to fix missing updates to the scheduler tunables on
cpu hot add/remove. The values are scaled at boot time, but not (yet)
when the number of cpus changes at runtime.

Series contains:
[PATCH 1/3] sched: fix missing sched tunable recalculation on cpu add/remove
[PATCH 2/3] sched: make tunable scaling style configurable
[PATCH 3/3] sched: update normalized values on user updates via proc v2

[diffstat]
kernel/sched.c | 41 ++++++++++++++----------
kernel/sched_fair.c | 27 +++++++++++++++
kernel/sysctl.c | 14 +++++---
linux-2.6-git-schedrecalc/include/linux/sched.h | 10 +++++
linux-2.6-git-schedrecalc/kernel/sched.c | 15 ++++++++
linux-2.6-git-schedrecalc/kernel/sched_debug.c | 10 +++++
linux-2.6-git-schedrecalc/kernel/sched_fair.c | 13 +++++++
linux-2.6-git-schedrecalc/kernel/sysctl.c | 14 ++++++++
8 files changed, 121 insertions(+), 23 deletions(-)


2009-11-30 11:16:59

by Christian Ehrhardt

Subject: [PATCH 1/3] sched: fix missing sched tunable recalculation on cpu add/remove

From: Christian Ehrhardt <[email protected]>

Based on Peter Zijlstra's patch suggestion, this enables recalculation of the
scheduler tunables in response to a change in the number of cpus.
It also caps the number of cpus considered in that scaling at eight.

Signed-off-by: Christian Ehrhardt <[email protected]>
---
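
For illustration (not part of the patch), a minimal user-space sketch of how
the log2 scaling behaves as cpus come online; the ilog2() helper below is a
stand-in for the kernel's, and 5ms is the one-cpu default of
sysctl_sched_latency:

#include <stdio.h>

/* integer log2, mirroring the kernel's ilog2() */
static unsigned int ilog2(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int normalized_sched_latency = 5000000; /* 5ms at one cpu */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 8; ncpus *= 2) {
		unsigned int factor = 1 + ilog2(ncpus);

		printf("%u cpus: factor %u -> sched_latency %u ns\n",
		       ncpus, factor, normalized_sched_latency * factor);
	}
	return 0;
}

This prints 5ms for one cpu, 10ms for two, 15ms for four and 20ms for eight,
the point at which the cap introduced here stops further growth.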

[diffstat]
sched.c | 29 ++++++++++++++++-------------
sched_fair.c | 16 ++++++++++++++++
2 files changed, 32 insertions(+), 13 deletions(-)

[diff]
Index: linux-2.6-git-schedrecalc/kernel/sched.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched.c
+++ linux-2.6-git-schedrecalc/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr
* default: 0.25ms
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
* Inject some fuzzyness into changing the per-cpu group shares
@@ -1810,6 +1811,7 @@ static void cfs_rq_set_shares(struct cfs
#endif

static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);

#include "sched_stats.h"
#include "sched_idletask.c"
@@ -7003,22 +7005,23 @@ cpumask_var_t nohz_cpu_mask;
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static inline void sched_init_granularity(void)
+static void update_sysctl(void)
{
- unsigned int factor = 1 + ilog2(num_online_cpus());
- const unsigned long limit = 200000000;
-
- sysctl_sched_min_granularity *= factor;
- if (sysctl_sched_min_granularity > limit)
- sysctl_sched_min_granularity = limit;
-
- sysctl_sched_latency *= factor;
- if (sysctl_sched_latency > limit)
- sysctl_sched_latency = limit;
+ unsigned int cpus = max(num_online_cpus(), 8U);
+ unsigned int factor = 1 + ilog2(cpus);

- sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}

- sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+ update_sysctl();
}

#ifdef CONFIG_SMP
Index: linux-2.6-git-schedrecalc/kernel/sched_fair.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched_fair.c
+++ linux-2.6-git-schedrecalc/kernel/sched_fair.c
@@ -35,12 +35,14 @@
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;

/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;

/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +72,7 @@ unsigned int __read_mostly sysctl_sched_
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

@@ -1850,6 +1853,17 @@ move_one_task_fair(struct rq *this_rq, i

return 0;
}
+
+static void rq_online_fair(struct rq *rq)
+{
+ update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+ update_sysctl();
+}
+
#endif /* CONFIG_SMP */

/*
@@ -1997,6 +2011,8 @@ static const struct sched_class fair_sch

.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
+ .rq_online = rq_online_fair,
+ .rq_offline = rq_offline_fair,
#endif

.set_curr_task = set_curr_task_fair,

2009-11-30 11:17:19

by Christian Ehrhardt

Subject: [PATCH 2/3] sched: make tunable scaling style configurable

From: Christian Ehrhardt <[email protected]>

As scaling now takes place on all kinds of cpu add/remove events, a user who
configures values via proc should be able to choose whether the values he set
are still rescaled or kept unchanged, whatever happens.

As the comments state that log2 was just a second guess that happened to
work, the interface is not designed just for on/off, but to select a scaling
type. Currently this allows none, log and linear, but more importantly it
allows us to keep the interface even if someone has an even better idea of
how to scale the values.

Signed-off-by: Christian Ehrhardt <[email protected]>
---
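
For illustration (not part of the patch), a minimal user-space sketch of the
three scaling styles, mirroring the factor selection that update_sysctl()
gains in this patch; ilog2() is again a stand-in for the kernel helper:

#include <stdio.h>

enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,
	SCHED_TUNABLESCALING_LOG,
	SCHED_TUNABLESCALING_LINEAR,
};

static unsigned int ilog2(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

static unsigned int scale_factor(enum sched_tunable_scaling style,
				 unsigned int cpus)
{
	switch (style) {
	case SCHED_TUNABLESCALING_NONE:
		return 1;
	case SCHED_TUNABLESCALING_LINEAR:
		return cpus;
	case SCHED_TUNABLESCALING_LOG:
	default:
		return 1 + ilog2(cpus);
	}
}

int main(void)
{
	/* at 8 cpus: none -> 1, log -> 4, linear -> 8 */
	printf("none=%u log=%u linear=%u\n",
	       scale_factor(SCHED_TUNABLESCALING_NONE, 8),
	       scale_factor(SCHED_TUNABLESCALING_LOG, 8),
	       scale_factor(SCHED_TUNABLESCALING_LINEAR, 8));
	return 0;
}

With the sysctl entry added below, the style can then be selected at runtime
through /proc/sys/kernel/sched_tunable_scaling (0 = none, 1 = log,
2 = linear).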

[diffstat]
include/linux/sched.h | 10 +++++++++-
kernel/sched.c | 15 +++++++++++++++
kernel/sched_debug.c | 10 ++++++++++
kernel/sched_fair.c | 13 +++++++++++++
kernel/sysctl.c | 14 ++++++++++++++
5 files changed, 61 insertions(+), 1 deletion(-)

[diff]
Index: linux-2.6-git-schedrecalc/include/linux/sched.h
===================================================================
--- linux-2.6-git-schedrecalc.orig/include/linux/sched.h
+++ linux-2.6-git-schedrecalc/include/linux/sched.h
@@ -1899,6 +1899,14 @@ extern unsigned int sysctl_sched_wakeup_
extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;
+
+enum sched_tunable_scaling {
+ SCHED_TUNABLESCALING_NONE,
+ SCHED_TUNABLESCALING_LOG,
+ SCHED_TUNABLESCALING_LINEAR,
+ SCHED_TUNABLESCALING_END,
+};
+extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
@@ -1906,7 +1914,7 @@ extern unsigned int sysctl_sched_nr_migr
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;

-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
#endif
Index: linux-2.6-git-schedrecalc/kernel/sched.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched.c
+++ linux-2.6-git-schedrecalc/kernel/sched.c
@@ -7010,6 +7010,21 @@ static void update_sysctl(void)
unsigned int cpus = max(num_online_cpus(), 8U);
unsigned int factor = 1 + ilog2(cpus);

+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }
+
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
Index: linux-2.6-git-schedrecalc/kernel/sched_fair.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched_fair.c
+++ linux-2.6-git-schedrecalc/kernel/sched_fair.c
@@ -21,6 +21,7 @@
*/

#include <linux/latencytop.h>
+#include <linux/sched.h>

/*
* Targeted preemption latency for CPU-bound tasks:
@@ -38,6 +39,18 @@ unsigned int sysctl_sched_latency = 5000
unsigned int normalized_sysctl_sched_latency = 5000000ULL;

/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+ = SCHED_TUNABLESCALING_LOG;
+
+/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
Index: linux-2.6-git-schedrecalc/kernel/sysctl.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sysctl.c
+++ linux-2.6-git-schedrecalc/kernel/sysctl.c
@@ -248,6 +248,8 @@ static int min_sched_granularity_ns = 10
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif

static struct ctl_table kern_table[] = {
@@ -303,6 +305,18 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_tunable_scaling",
+ .data = &sysctl_sched_tunable_scaling,
+ .maxlen = sizeof(enum sched_tunable_scaling),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_tunable_scaling,
+ .extra2 = &max_sched_tunable_scaling,
+ },
+
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_shares_thresh",
.data = &sysctl_sched_shares_thresh,
.maxlen = sizeof(unsigned int),
Index: linux-2.6-git-schedrecalc/kernel/sched_debug.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched_debug.c
+++ linux-2.6-git-schedrecalc/kernel/sched_debug.c
@@ -305,6 +305,12 @@ static void print_cpu(struct seq_file *m
print_rq(m, rq, cpu);
}

+static const char *sched_tunable_scaling_names[] = {
+ "none",
+ "logaritmic",
+ "linear"
+};
+
static int sched_debug_show(struct seq_file *m, void *v)
{
u64 now = ktime_to_ns(ktime_get());
@@ -330,6 +336,10 @@ static int sched_debug_show(struct seq_f
#undef PN
#undef P

+ SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ sysctl_sched_tunable_scaling,
+ sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+
for_each_online_cpu(cpu)
print_cpu(m, cpu);

2009-11-30 11:16:48

by Christian Ehrhardt

Subject: [PATCH 3/3] sched: update normalized values on user updates via proc v2

From: Christian Ehrhardt <[email protected]>

*updates in v2*
The normalized values are now also recalculated in case the scaling factor
changes.

This patch updates the internally used scheduler tuning values that are
normalized to one cpu whenever a user sets new values via /proc.

Together with patch 2 of this series, this allows user-configured values
to scale (or not) with cpu add/remove events taking place later.

Signed-off-by: Christian Ehrhardt <[email protected]>
---
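
For illustration (not part of the patch), a minimal user-space sketch of the
write-side normalization added here: on a proc write the one-cpu normalized
value is recomputed from the user's value and the current factor (as
WRT_SYSCTL does), so a later cpu add/remove rescales from the user's
baseline rather than from the old default:

#include <stdio.h>

int main(void)
{
	unsigned int factor = 3;           /* e.g. 4 cpus with log scaling */
	unsigned int user_value = 9000000; /* user writes 9ms via proc */
	unsigned int normalized = user_value / factor; /* stored: 3ms */
	unsigned int hotplug_factor = 4;   /* e.g. 8 cpus come online */

	printf("effective after hotplug: %u ns\n",
	       normalized * hotplug_factor); /* 12ms, scaled from 9ms */
	return 0;
}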

[diffstat]
sched.c | 12 +++++++++---
sched_fair.c | 11 ++++++++++-
sysctl.c | 14 +++++++++-----
3 files changed, 28 insertions(+), 9 deletions(-)

[diff]
Index: linux-2.6-git-schedrecalc/kernel/sched.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched.c
+++ linux-2.6-git-schedrecalc/kernel/sched.c
@@ -1812,6 +1812,7 @@ static void cfs_rq_set_shares(struct cfs

static void calc_load_account_active(struct rq *this_rq);
static void update_sysctl(void);
+static int get_update_sysctl_factor(void);

#include "sched_stats.h"
#include "sched_idletask.c"
@@ -7005,11 +7006,9 @@ cpumask_var_t nohz_cpu_mask;
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static void update_sysctl(void)
+static int get_update_sysctl_factor(void)
{
unsigned int cpus = max(num_online_cpus(), 8U);
- unsigned int factor = 1 + ilog2(cpus);
-
unsigned int factor;

switch (sysctl_sched_tunable_scaling) {
@@ -7025,6 +7024,13 @@ static void update_sysctl(void)
break;
}

+ return factor;
+}
+
+static void update_sysctl(void)
+{
+ unsigned int factor = get_update_sysctl_factor();
+
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
Index: linux-2.6-git-schedrecalc/kernel/sched_fair.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sched_fair.c
+++ linux-2.6-git-schedrecalc/kernel/sched_fair.c
@@ -399,11 +399,12 @@ static struct sched_entity *__pick_last_
*/

#ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ int factor = get_update_sysctl_factor();

if (ret || !write)
return ret;
@@ -411,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);

+#define WRT_SYSCTL(name) \
+ (normalized_sysctl_##name = sysctl_##name / (factor))
+ WRT_SYSCTL(sched_min_granularity);
+ WRT_SYSCTL(sched_latency);
+ WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
return 0;
}
#endif
Index: linux-2.6-git-schedrecalc/kernel/sysctl.c
===================================================================
--- linux-2.6-git-schedrecalc.orig/kernel/sysctl.c
+++ linux-2.6-git-schedrecalc/kernel/sysctl.c
@@ -250,6 +250,8 @@ static int min_wakeup_granularity_ns;
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif

static struct ctl_table kern_table[] = {
@@ -268,7 +270,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_min_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &sched_nr_latency_handler,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
@@ -279,7 +281,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &sched_nr_latency_handler,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
@@ -290,7 +292,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
@@ -301,7 +303,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_shares_ratelimit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &sched_proc_update_handler,
+ .extra1 = &min_sched_shares_ratelimit,
+ .extra2 = &max_sched_shares_ratelimit,
},
{
.ctl_name = CTL_UNNUMBERED,
@@ -309,7 +313,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,

2009-12-04 09:49:20

by Peter Zijlstra

Subject: Re: [PATCH 3/3] sched: update normalized values on user updates via proc v2

On Mon, 2009-11-30 at 12:16 +0100, [email protected] wrote:
> @@ -301,7 +303,9 @@ static struct ctl_table kern_table[] = {
> .data = &sysctl_sched_shares_ratelimit,
> .maxlen = sizeof(unsigned int),
> .mode = 0644,
> - .proc_handler = &proc_dointvec,
> + .proc_handler = &sched_proc_update_handler,
> + .extra1 = &min_sched_shares_ratelimit,
> + .extra2 = &max_sched_shares_ratelimit,
> },

While I don't object to that change, it really should have been a
separate patch.

And at the very least the changelog should have said something about it.

Anyway, took all 3 patches.

2009-12-04 09:49:23

by Peter Zijlstra

Subject: Re: [PATCH 2/3] sched: make tunable scaling style configurable

On Mon, 2009-11-30 at 12:16 +0100, [email protected] wrote:

> +++ linux-2.6-git-schedrecalc/kernel/sched.c
> @@ -7010,6 +7010,21 @@ static void update_sysctl(void)
> unsigned int cpus = max(num_online_cpus(), 8U);
> unsigned int factor = 1 + ilog2(cpus);
>
> + unsigned int factor;
> +

Now that won't compile, will it ;-)

The 3rd patch removes the extra factor again; fixed it up properly.

2009-12-04 09:49:35

by Peter Zijlstra

Subject: Re: [PATCH 1/3] sched: fix missing sched tunable recalculation on cpu add/remove

On Mon, 2009-11-30 at 12:16 +0100, [email protected] wrote:

> + unsigned int cpus = max(num_online_cpus(), 8U);

Hehe, you copied my bug too ;-)

I made that min() now.
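
To spell it out (a trivial sketch, not from the thread):

#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	/* the changelog wants the considered cpu count capped at eight */
	printf("max(2, 8) = %d\n", max(2, 8)); /* 8: never goes below eight */
	printf("min(2, 8) = %d\n", min(2, 8)); /* 2: capped at eight */
	return 0;
}

With max() a two-cpu box would scale its tunables as if it had eight cpus;
min() gives the intended cap-at-eight behaviour.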

2009-12-09 09:59:00

by Christian Ehrhardt

Subject: [tip:sched/urgent] sched: Fix missing sched tunable recalculation on cpu add/remove

Commit-ID: 0bcdcf28c979869f44e05121b96ff2cfb05bd8e6
Gitweb: http://git.kernel.org/tip/0bcdcf28c979869f44e05121b96ff2cfb05bd8e6
Author: Christian Ehrhardt <[email protected]>
AuthorDate: Mon, 30 Nov 2009 12:16:46 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 9 Dec 2009 10:03:58 +0100

sched: Fix missing sched tunable recalculation on cpu add/remove

Based on Peter Zijlstra's patch suggestion, this enables recalculation of
the scheduler tunables in response to a change in the number of cpus. It
also caps the number of cpus considered in that scaling at eight.

Signed-off-by: Christian Ehrhardt <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 29 ++++++++++++++++-------------
kernel/sched_fair.c | 16 ++++++++++++++++
2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 3878f50..b54ecf8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
* default: 0.25ms
*/
unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
* Inject some fuzzyness into changing the per-cpu group shares
@@ -1814,6 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#endif

static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -7028,22 +7030,23 @@ cpumask_var_t nohz_cpu_mask;
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static inline void sched_init_granularity(void)
+static void update_sysctl(void)
{
- unsigned int factor = 1 + ilog2(num_online_cpus());
- const unsigned long limit = 200000000;
-
- sysctl_sched_min_granularity *= factor;
- if (sysctl_sched_min_granularity > limit)
- sysctl_sched_min_granularity = limit;
-
- sysctl_sched_latency *= factor;
- if (sysctl_sched_latency > limit)
- sysctl_sched_latency = limit;
+ unsigned int cpus = min(num_online_cpus(), 8U);
+ unsigned int factor = 1 + ilog2(cpus);

- sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+ SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}

- sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+ update_sysctl();
}

#ifdef CONFIG_SMP
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c163a28..71b3458 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,12 +35,14 @@
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;

/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;

/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +72,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
* have immediate wakeup/sleep latencies.
*/
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

@@ -1890,6 +1893,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,

return 0;
}
+
+static void rq_online_fair(struct rq *rq)
+{
+ update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+ update_sysctl();
+}
+
#endif /* CONFIG_SMP */

/*
@@ -2035,6 +2049,8 @@ static const struct sched_class fair_sched_class = {

.load_balance = load_balance_fair,
.move_one_task = move_one_task_fair,
+ .rq_online = rq_online_fair,
+ .rq_offline = rq_offline_fair,
#endif

.set_curr_task = set_curr_task_fair,

2009-12-09 09:56:58

by Christian Ehrhardt

Subject: [tip:sched/urgent] sched: Make tunable scaling style configurable

Commit-ID: 1983a922a1bc843806b9a36cf3a370b242783140
Gitweb: http://git.kernel.org/tip/1983a922a1bc843806b9a36cf3a370b242783140
Author: Christian Ehrhardt <[email protected]>
AuthorDate: Mon, 30 Nov 2009 12:16:47 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 9 Dec 2009 10:04:01 +0100

sched: Make tunable scaling style configurable

As scaling now takes place on all kinds of cpu add/remove events, a user
who configures values via proc should be able to choose whether the values
he set are still rescaled or kept unchanged, whatever happens.

As the comments state that log2 was just a second guess that happened to
work, the interface is not designed just for on/off, but to select a
scaling type. Currently this allows none, log and linear, but more
importantly it allows us to keep the interface even if someone has an even
better idea of how to scale the values.

Signed-off-by: Christian Ehrhardt <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 11 ++++++++++-
kernel/sched.c | 15 ++++++++++++++-
kernel/sched_debug.c | 10 ++++++++++
kernel/sched_fair.c | 13 +++++++++++++
kernel/sysctl.c | 14 ++++++++++++++
5 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b1ebd3..ee9f200 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1902,13 +1902,22 @@ extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_shares_ratelimit;
extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;
+
+enum sched_tunable_scaling {
+ SCHED_TUNABLESCALING_NONE,
+ SCHED_TUNABLESCALING_LOG,
+ SCHED_TUNABLESCALING_LINEAR,
+ SCHED_TUNABLESCALING_END,
+};
+extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;

-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index b54ecf8..116efed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7033,7 +7033,20 @@ cpumask_var_t nohz_cpu_mask;
static void update_sysctl(void)
{
unsigned int cpus = min(num_online_cpus(), 8U);
- unsigned int factor = 1 + ilog2(cpus);
+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }

#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5fda666..0fc5287 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
print_rq(m, rq, cpu);
}

+static const char *sched_tunable_scaling_names[] = {
+ "none",
+ "logaritmic",
+ "linear"
+};
+
static int sched_debug_show(struct seq_file *m, void *v)
{
u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P

+ SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ sysctl_sched_tunable_scaling,
+ sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+
for_each_online_cpu(cpu)
print_cpu(m, cpu);

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 71b3458..455106d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
*/

#include <linux/latencytop.h>
+#include <linux/sched.h>

/*
* Targeted preemption latency for CPU-bound tasks:
@@ -38,6 +39,18 @@ unsigned int sysctl_sched_latency = 5000000ULL;
unsigned int normalized_sysctl_sched_latency = 5000000ULL;

/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+ = SCHED_TUNABLESCALING_LOG;
+
+/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e5cc535..d10406e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -251,6 +251,8 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif

static struct ctl_table kern_table[] = {
@@ -306,6 +308,18 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_tunable_scaling",
+ .data = &sysctl_sched_tunable_scaling,
+ .maxlen = sizeof(enum sched_tunable_scaling),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_sched_tunable_scaling,
+ .extra2 = &max_sched_tunable_scaling,
+ },
+
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_shares_thresh",
.data = &sysctl_sched_shares_thresh,
.maxlen = sizeof(unsigned int),

2009-12-09 09:57:08

by Christian Ehrhardt

Subject: [tip:sched/urgent] sched: Update normalized values on user updates via proc

Commit-ID: acb4a848da821a095ae9e4d8b22ae2d9633ba5cd
Gitweb: http://git.kernel.org/tip/acb4a848da821a095ae9e4d8b22ae2d9633ba5cd
Author: Christian Ehrhardt <[email protected]>
AuthorDate: Mon, 30 Nov 2009 12:16:48 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 9 Dec 2009 10:04:02 +0100

sched: Update normalized values on user updates via proc

The normalized values are now also recalculated in case the scaling factor
changes.

This patch updates the internally used scheduler tuning values that are
normalized to one cpu whenever a user sets new values via /proc.

Together with patch 2 of this series, this allows user-configured values
to scale (or not) with cpu add/remove events taking place later.

Signed-off-by: Christian Ehrhardt <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
[ v2: fix warning ]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched.c | 12 ++++++++++--
kernel/sched_fair.c | 11 ++++++++++-
kernel/sysctl.c | 14 +++++++++-----
3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 116efed..0a60e8e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1816,6 +1816,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)

static void calc_load_account_active(struct rq *this_rq);
static void update_sysctl(void);
+static int get_update_sysctl_factor(void);

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -7030,9 +7031,9 @@ cpumask_var_t nohz_cpu_mask;
*
* This idea comes from the SD scheduler of Con Kolivas:
*/
-static void update_sysctl(void)
+static int get_update_sysctl_factor(void)
{
- unsigned int cpus = min(num_online_cpus(), 8U);
+ unsigned int cpus = min(num_online_cpus(), 8);
unsigned int factor;

switch (sysctl_sched_tunable_scaling) {
@@ -7048,6 +7049,13 @@ static void update_sysctl(void)
break;
}

+ return factor;
+}
+
+static void update_sysctl(void)
+{
+ unsigned int factor = get_update_sysctl_factor();
+
#define SET_SYSCTL(name) \
(sysctl_##name = (factor) * normalized_sysctl_##name)
SET_SYSCTL(sched_min_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 455106d..804a411 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -399,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
*/

#ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ int factor = get_update_sysctl_factor();

if (ret || !write)
return ret;
@@ -411,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
sysctl_sched_min_granularity);

+#define WRT_SYSCTL(name) \
+ (normalized_sysctl_##name = sysctl_##name / (factor))
+ WRT_SYSCTL(sched_min_granularity);
+ WRT_SYSCTL(sched_latency);
+ WRT_SYSCTL(sched_wakeup_granularity);
+ WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
return 0;
}
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d10406e..b9e5a45 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -253,6 +253,8 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif

static struct ctl_table kern_table[] = {
@@ -271,7 +273,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_min_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &sched_nr_latency_handler,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
@@ -282,7 +284,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_latency,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &sched_nr_latency_handler,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
@@ -293,7 +295,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
@@ -304,7 +306,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_shares_ratelimit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &sched_proc_update_handler,
+ .extra1 = &min_sched_shares_ratelimit,
+ .extra2 = &max_sched_shares_ratelimit,
},
{
.ctl_name = CTL_UNNUMBERED,
@@ -312,7 +316,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_tunable_scaling,
.maxlen = sizeof(enum sched_tunable_scaling),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &sched_proc_update_handler,
.strategy = &sysctl_intvec,
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,