It was found that recently the flags of sched domain could
not be customized via sysctl, which might make it a little
inconvenient for performance tuning/debugging.
echo 343 > /proc/sys/kernel/sched_domain/cpu0/domain0/flags
bash: flags: Permission denied
343 stands for:
(SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE)
As mentioned in Commit 9818427c6270 ("sched/debug: Make sd->flags
sysctl read-only"), the flags field is made read-only, due to the
concerns that the sd flags and the per cpu cache domain pointer are
not coherent.
This trial patch tries to address the issue by updating the cache
domain pointer once the flag has been modified. So that the sd->flags
could be changed via sysctl.
I'm not sure if there is other purpose that we've set the flags to
read-only, but it seems that keeping the sd->flags writable could
help diagnose the system easier. Any comment would be appreciated.
Chen Yu (2):
sched/topology: Add update_domain_cpu()
sched/debug: Make sd->flags sysctl writable again
include/linux/sched/topology.h | 5 +++++
kernel/sched/debug.c | 30 +++++++++++++++++++++++++++---
kernel/sched/topology.c | 11 +++++++++++
3 files changed, 43 insertions(+), 3 deletions(-)
--
2.17.1
Introduce update_domain_cpu(), which is a wrapper of
update_top_cache_domain(). In update_domain_cpu() the
cpu hotplug lock is to protect against the rebuild of
sched domain, and the rcu read lock is to protect against
the dereference of the domain tree (rq->sd) in update_top_cache_domain().
This patch is to prepare for the next patch to update the
flags of sched domain via sysctl.
No intentional functional impact.
Signed-off-by: Chen Yu <[email protected]>
---
include/linux/sched/topology.h | 5 +++++
kernel/sched/topology.c | 11 +++++++++++
2 files changed, 16 insertions(+)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..dc81736090e3 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -161,6 +161,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
bool cpus_share_cache(int this_cpu, int that_cpu);
+void update_domain_cpu(int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
@@ -214,6 +215,10 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
return true;
}
+static inline void update_domain_cpu(int cpu)
+{
+}
+
#endif /* !CONFIG_SMP */
#ifndef arch_scale_cpu_capacity
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..495b65367a12 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -651,6 +651,17 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}
+void update_domain_cpu(int cpu)
+{
+ /* Protect against sched domain rebuild. */
+ get_online_cpus();
+ /* Guard read-side sched domain dereference. */
+ rcu_read_lock();
+ update_top_cache_domain(cpu);
+ rcu_read_unlock();
+ put_online_cpus();
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
--
2.17.1
The flags of sched domain in sysctl was limited to read-only in a
recent change. However this might bring inconvenience to some performance
tuning/function tests which are sensitive to load balance. Since
the user might want to evaluate the impact of different combination
of sched domain flags on the workloads.
Per Commit 9818427c6270 ("sched/debug: Make sd->flags sysctl read-only"),
we can make the flags writable if we update the cached pointers after
the flag of sched domain has been modified.
Leverage update_domain_cpu() to update the corresponding cached pointer of that
CPU once the flags have been changed.
Signed-off-by: Chen Yu <[email protected]>
---
kernel/sched/debug.c | 30 +++++++++++++++++++++++++++---
1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..bfeaf547d4af 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -227,6 +227,8 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
sd_free_ctl_entry(&entry->child);
if (entry->proc_handler == NULL)
kfree(entry->procname);
+ if (entry->extra1)
+ kfree(entry->extra1);
}
kfree(*tablep);
@@ -245,20 +247,42 @@ set_table_entry(struct ctl_table *entry,
entry->proc_handler = proc_handler;
}
+static int sysctl_sd_flags_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (!ret && write)
+ update_domain_cpu(*(int *)(table->extra1));
+
+ return ret;
+}
+
static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
+sd_alloc_ctl_domain_table(struct sched_domain *sd, int cpu)
{
struct ctl_table *table = sd_alloc_ctl_entry(9);
+ int *cpu_param;
if (table == NULL)
return NULL;
+ /* Pass the CPU index to the handler. */
+ cpu_param = kmalloc(sizeof(int), GFP_KERNEL);
+ if (!cpu_param) {
+ kfree(table);
+ return NULL;
+ }
+ *cpu_param = cpu;
+
set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, sysctl_sd_flags_handler);
+ table[5].extra1 = (void *)cpu_param;
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
@@ -284,7 +308,7 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
snprintf(buf, 32, "domain%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
- entry->child = sd_alloc_ctl_domain_table(sd);
+ entry->child = sd_alloc_ctl_domain_table(sd, cpu);
entry++;
i++;
}
--
2.17.1
On Tue, Jul 07, 2020 at 03:36:13AM +0800, Chen Yu wrote:
> It was found that recently the flags of sched domain could
> not be customized via sysctl, which might make it a little
> inconenient for performance tuning/debugging.
What specific goals do you have? This is a debug interface.
On 06/07/20 21:00, Peter Zijlstra wrote:
> On Tue, Jul 07, 2020 at 03:36:13AM +0800, Chen Yu wrote:
>> It was found that recently the flags of sched domain could
>> not be customized via sysctl, which might make it a little
>> inconenient for performance tuning/debugging.
>
> What specific goals do you have? This is a debug interface.
Also, while the update_top_cache_domain() call on sysctl write may work,
you're back to square one as soon as you go through a hotplug cycle, which
is icky.
That said, I second Peter in that I'm curious as to what you're really
using this interface for. Manually hacking the default / arch topology
flags is a bit tedious, but it's doable.
Hi Peter,
On Mon, Jul 06, 2020 at 10:00:49PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 07, 2020 at 03:36:13AM +0800, Chen Yu wrote:
> > It was found that recently the flags of sched domain could
> > not be customized via sysctl, which might make it a little
> > inconenient for performance tuning/debugging.
>
> What specific goals do you have? This is a debug interface.
The origin motivation is for debugging purpose during daily work,
I was trying to evaluate the load balance behavior on some specific
platforms.
Another motivation was inspired by a previous lkp performance regression
report that, after the rework of load balance, some workloads might get
performance downgrading. And it is suspected that the system after the
rework is more likely to spread tasks onto more CPUs, which brings more
idle time -> and deeper cstate -> lower performance. So in order to
evaluate which flag might help mitigate the spreading (disable PREFER_SIBLING?),
it might be an applicable option to make the flags writable, so lkp could
launch the workload with different flags settings so we can gather more clues.
But yes, I'm ok if the flags are kept read-only, and we can hack into
the code temporarily to turn it into writable for debugging.
thx,
Chenyu
Hi Valentin,
On Mon, Jul 06, 2020 at 11:11:32PM +0100, Valentin Schneider wrote:
>
> On 06/07/20 21:00, Peter Zijlstra wrote:
> > On Tue, Jul 07, 2020 at 03:36:13AM +0800, Chen Yu wrote:
> >> It was found that recently the flags of sched domain could
> >> not be customized via sysctl, which might make it a little
> >> inconenient for performance tuning/debugging.
> >
> > What specific goals do you have? This is a debug interface.
>
> Also, while the update_top_cache_domain() call on sysctl write may work,
> you're back to square one as soon as you go through a hotplug cycle, which
> is icky.
Do you mean, after the hotplug, all the settings of flags are lost? Yes,
it is, but in our testing environment we do not do hotplug often : )
>
> That said, I second Peter in that I'm curious as to what you're really
> using this interface for. Manually hacking the default / arch topology
> flags is a bit tedious, but it's doable.
Agree, but since we do monitor performance testings automatically,
it might save more time for us to not have to reboot every time we
change the flags. So I guess we can hack the code to make that
flags field temporarily writable. I guess the concern here is that it
looks like a little overkill for us to invoke update_top_cache_domain(),
I'm okay with current read-only attribute.
Thanks,
Chenyu
Hi Chen,
On 07/07/20 12:56, Chen Yu wrote:
> Hi Valentin,
> On Mon, Jul 06, 2020 at 11:11:32PM +0100, Valentin Schneider wrote:
>>
>> On 06/07/20 21:00, Peter Zijlstra wrote:
>> > On Tue, Jul 07, 2020 at 03:36:13AM +0800, Chen Yu wrote:
>> >> It was found that recently the flags of sched domain could
>> >> not be customized via sysctl, which might make it a little
>> >> inconenient for performance tuning/debugging.
>> >
>> > What specific goals do you have? This is a debug interface.
>>
>> Also, while the update_top_cache_domain() call on sysctl write may work,
>> you're back to square one as soon as you go through a hotplug cycle, which
>> is icky.
> Do you mean, after the hotplug, all the settings of flags are lost? Yes,
> it is, but in our testing environment we do not do hotplug offen : )
Right, and neither do I, but sadly hotplug is one of those things that we
have to take into account :(
>>
>> That said, I second Peter in that I'm curious as to what you're really
>> using this interface for. Manually hacking the default / arch topology
>> flags is a bit tedious, but it's doable.
> Agree, but since we do monitor performance testings automatically,
> it might save more time for us to not have to reboot everytime we
> change the flags. So I guess we can hack the code to make that
> flags field temporarily writable. I guess the concern here is that it
> looks a little overkilled for us to invoke update_top_cache_domain(),
> I'm okay with current read-only attribute.
>
To properly update the SD landscape (i.e. the cached pointers), we *need*
that update_top_cache_domain() call. But then there's also the hotplug
thing, so we would need to e.g. write back the flags values into the right
topology level of sched_domain_topology.
I can see value in it for testing, but I'm not convinced it is something we
want to open up again. At least it's not too complicated to automate
testing of different flag combinations - you just need to commit some
change to the topology flags (either default_topology or whatever your arch
uses).
> Thanks,
> Chenyu