For reference: https://lore.kernel.org/lkml/[email protected]/
And the latest proposal: https://lore.kernel.org/lkml/[email protected]/
Pcp cache draining isolation needs a function that abstracts checking
if a CPU is isolated through isolcpus= or nohz_full=. Take advantage
of that to do some cleanups.
Frederic Weisbecker (2):
sched/isolation: Merge individual nohz_full features into a common
housekeeping flag
sched/isolation: Add cpu_is_isolated() API
arch/x86/kvm/x86.c | 2 +-
drivers/char/random.c | 2 +-
drivers/pci/pci-driver.c | 2 +-
include/linux/sched/isolation.h | 13 +++++++------
include/net/ip_vs.h | 4 ++--
kernel/cpu.c | 4 ++--
kernel/kthread.c | 4 ++--
kernel/rcu/tasks.h | 2 +-
kernel/rcu/tree_plugin.h | 6 +++---
kernel/sched/core.c | 12 ++++++------
kernel/sched/fair.c | 6 +++---
kernel/sched/isolation.c | 22 ++++++----------------
kernel/watchdog.c | 2 +-
kernel/workqueue.c | 2 +-
net/core/net-sysfs.c | 2 +-
net/netfilter/ipvs/ip_vs_ctl.c | 2 +-
16 files changed, 39 insertions(+), 48 deletions(-)
--
2.34.1
The individual isolation features turned on by nohz_full were initially
split in order for each of them to be tunable through cpusets. However
plans have changed in favour of an interface (be it cpusets or sysctl)
grouping all these features to be turned on/off altogether. Then should
the need ever arise, the interface can still be expanded to handle the
individual isolation features.
Therefore the current isolation split between tick/timer/workqueue/rcu/
kthreads/misc doesn't make sense anymore. Gather them all into a common
flag gathering the kernel noise that is turned off through nohz_full=
and later runtime interface.
Signed-off-by: Frederic Weisbecker <[email protected]>
---
arch/x86/kvm/x86.c | 2 +-
drivers/char/random.c | 2 +-
drivers/pci/pci-driver.c | 2 +-
include/linux/sched/isolation.h | 7 +------
include/net/ip_vs.h | 4 ++--
kernel/cpu.c | 4 ++--
kernel/kthread.c | 4 ++--
kernel/rcu/tasks.h | 2 +-
kernel/rcu/tree_plugin.h | 6 +++---
kernel/sched/core.c | 12 ++++++------
kernel/sched/fair.c | 6 +++---
kernel/sched/isolation.c | 22 ++++++----------------
kernel/watchdog.c | 2 +-
kernel/workqueue.c | 2 +-
net/core/net-sysfs.c | 2 +-
net/netfilter/ipvs/ip_vs_ctl.c | 2 +-
16 files changed, 33 insertions(+), 48 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da4bbd043a7b..e3e989eed872 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9347,7 +9347,7 @@ int kvm_arch_init(void *opaque)
}
if (pi_inject_timer == -1)
- pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
+ pi_inject_timer = housekeeping_enabled(HK_TYPE_KERNEL_NOISE);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index ce3ccd172cc8..d6b2145712bd 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1300,7 +1300,7 @@ static void __cold try_to_generate_entropy(void)
preempt_disable();
/* Only schedule callbacks on timer CPUs that are online. */
- cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask);
+ cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), cpu_online_mask);
num_cpus = cpumask_weight(&timer_cpus);
/* In very bizarre case of misconfiguration, fallback to all online. */
if (unlikely(num_cpus == 0)) {
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index a2ceeacc33eb..e8711ec206d9 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -378,7 +378,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
goto out;
}
cpumask_and(wq_domain_mask,
- housekeeping_cpumask(HK_TYPE_WQ),
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
housekeeping_cpumask(HK_TYPE_DOMAIN));
cpu = cpumask_any_and(cpumask_of_node(node),
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 8c15abd67aed..b645cc81fe01 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,15 +6,10 @@
#include <linux/tick.h>
enum hk_type {
- HK_TYPE_TIMER,
- HK_TYPE_RCU,
- HK_TYPE_MISC,
+ HK_TYPE_KERNEL_NOISE,
HK_TYPE_SCHED,
- HK_TYPE_TICK,
HK_TYPE_DOMAIN,
- HK_TYPE_WQ,
HK_TYPE_MANAGED_IRQ,
- HK_TYPE_KTHREAD,
HK_TYPE_MAX
};
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index c6c61100d244..41773b13577b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1177,7 +1177,7 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
if (ipvs->est_cpulist_valid)
return ipvs->sysctl_est_cpulist;
else
- return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ return housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
}
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
@@ -1284,7 +1284,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
{
- return housekeeping_cpumask(HK_TYPE_KTHREAD);
+ return housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
}
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6c0a92ca6bb5..023d5ef8f3fd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1524,8 +1524,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
- primary = housekeeping_any_cpu(HK_TYPE_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_KERNEL_NOISE))
+ primary = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f97fd01a2932..0e52ae05fdba 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -355,7 +355,7 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -722,7 +722,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index fe9840d90e96..e2f74ace62a2 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -537,7 +537,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_TYPE_RCU);
+ housekeeping_affine(current, HK_TYPE_KERNEL_NOISE);
WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7b0fe741a088..29276afeb8d3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1241,9 +1241,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
if (outgoingcpu >= 0)
cpumask_clear_cpu(outgoingcpu, cm);
}
@@ -1301,5 +1301,5 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_TYPE_RCU);
+ housekeeping_affine(current, HK_TYPE_KERNEL_NOISE);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..d4b822c8387f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1067,13 +1067,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -1089,7 +1089,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -5553,7 +5553,7 @@ void scheduler_tick(void)
unsigned long thermal_pressure;
u64 resched_latency;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
sched_clock_tick();
@@ -5679,7 +5679,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5700,7 +5700,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..03f70ee6314f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11021,7 +11021,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ * - HK_TYPE_KERNEL_NOISE CPUs are used for this task, because HK_TYPE_SCHED not set
* anywhere yet.
*/
@@ -11030,7 +11030,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;
- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -11046,7 +11046,7 @@ static inline int find_new_ilb(void)
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 373d42c707bc..2353ec381c96 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -9,15 +9,10 @@
*/
enum hk_flags {
- HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
- HK_FLAG_RCU = BIT(HK_TYPE_RCU),
- HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
- HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
- HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -88,7 +83,7 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overridden);
- if (housekeeping.flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -111,7 +106,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
cpumask_var_t non_housekeeping_mask, housekeeping_staging;
int err = 0;
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@@ -163,7 +158,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
tick_nohz_full_setup(non_housekeeping_mask);
housekeeping.flags |= flags;
@@ -179,12 +174,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned long flags;
-
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
- HK_FLAG_MISC | HK_FLAG_KTHREAD;
-
- return housekeeping_setup(str, flags);
+ return housekeeping_setup(str, HK_FLAG_KERNEL_NOISE);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);
@@ -198,7 +188,7 @@ static int __init housekeeping_isolcpus_setup(char *str)
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK;
+ flags |= HK_FLAG_KERNEL_NOISE;
continue;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33..fc40a0ed8d04 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -859,7 +859,7 @@ void __init lockup_detector_init(void)
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask,
- housekeeping_cpumask(HK_TYPE_TIMER));
+ housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 07895deca271..f58b4544e7f7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6003,7 +6003,7 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ca55dd747d6c..4cd630884201 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -853,7 +853,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
if (!cpumask_empty(mask)) {
cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
if (cpumask_empty(mask)) {
free_cpumask_var(mask);
return -EINVAL;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2a5ed71c82c3..2896ad3d8b59 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1991,7 +1991,7 @@ static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer,
if (ipvs->est_cpulist_valid)
mask = *valp;
else
- mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
+ mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
mutex_unlock(&ipvs->est_mutex);
--
2.34.1
Provide this new API to check if a CPU has been isolated either through
isolcpus= or nohz_full= kernel parameter.
It aims at avoiding kernel load deemed to be safely spared on CPUs
running sensitive workload that can't bear any disturbance, such as
pcp cache draining.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/sched/isolation.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index b645cc81fe01..088672f08469 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -53,4 +53,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
return true;
}
+static inline bool cpu_is_isolated(int cpu)
+{
+ return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
+ !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
+}
+
#endif /* _LINUX_SCHED_ISOLATION_H */
--
2.34.1
On 2/3/23 18:24, Frederic Weisbecker wrote:
> Provide this new API to check if a CPU has been isolated either through
> isolcpus= or nohz_full= kernel parameter.
>
> It aims at avoiding kernel load deemed to be safely spared on CPUs
> running sensitive workload that can't bear any disturbance, such as
> pcp cache draining.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Frederic Weisbecker <[email protected]>
> ---
> include/linux/sched/isolation.h | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
> index b645cc81fe01..088672f08469 100644
> --- a/include/linux/sched/isolation.h
> +++ b/include/linux/sched/isolation.h
> @@ -53,4 +53,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
> return true;
> }
>
> +static inline bool cpu_is_isolated(int cpu)
> +{
> + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
> + !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
> +}
> +
> #endif /* _LINUX_SCHED_ISOLATION_H */
CPUs in an isolated cpuset partition is similar to HK_TYPE_DOMAIN CPUs
as load balancing is disabled. I can add an API to access the cpumask
and add to this API. However, that list is dynamic as it can be changed
at run time. Will that be a problem? Or should that be used separately?
Cheers,
Longman
Hi Frederic,
I love your patch! Yet something to improve:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on horms-ipvs-next/master horms-ipvs/master linus/master v6.2-rc6 next-20230203]
[cannot apply to paulmck-rcu/dev]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Frederic-Weisbecker/sched-isolation-Merge-individual-nohz_full-features-into-a-common-housekeeping-flag/20230204-072510
patch link: https://lore.kernel.org/r/20230203232409.163847-3-frederic%40kernel.org
patch subject: [PATCH 2/2] sched/isolation: Add cpu_is_isolated() API
config: alpha-defconfig (https://download.01.org/0day-ci/archive/20230204/[email protected]/config)
compiler: alpha-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/89596a035dc10e00cb66d4e75e49d69b75413807
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Frederic-Weisbecker/sched-isolation-Merge-individual-nohz_full-features-into-a-common-housekeeping-flag/20230204-072510
git checkout 89596a035dc10e00cb66d4e75e49d69b75413807
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=alpha olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=alpha SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from drivers/char/random.c:56:
include/linux/sched/isolation.h: In function 'cpu_is_isolated':
>> include/linux/sched/isolation.h:58:17: error: implicit declaration of function 'housekeeping_test_cpu'; did you mean 'housekeeping_any_cpu'? [-Werror=implicit-function-declaration]
58 | return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
| ^~~~~~~~~~~~~~~~~~~~~
| housekeeping_any_cpu
cc1: some warnings being treated as errors
--
In file included from init/main.c:56:
include/linux/sched/isolation.h: In function 'cpu_is_isolated':
>> include/linux/sched/isolation.h:58:17: error: implicit declaration of function 'housekeeping_test_cpu'; did you mean 'housekeeping_any_cpu'? [-Werror=implicit-function-declaration]
58 | return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
| ^~~~~~~~~~~~~~~~~~~~~
| housekeeping_any_cpu
init/main.c: At top level:
init/main.c:775:20: warning: no previous prototype for 'arch_post_acpi_subsys_init' [-Wmissing-prototypes]
775 | void __init __weak arch_post_acpi_subsys_init(void) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~
init/main.c:787:20: warning: no previous prototype for 'mem_encrypt_init' [-Wmissing-prototypes]
787 | void __init __weak mem_encrypt_init(void) { }
| ^~~~~~~~~~~~~~~~
init/main.c:789:20: warning: no previous prototype for 'poking_init' [-Wmissing-prototypes]
789 | void __init __weak poking_init(void) { }
| ^~~~~~~~~~~
cc1: some warnings being treated as errors
--
In file included from kernel/sched/fair.c:38:
include/linux/sched/isolation.h: In function 'cpu_is_isolated':
>> include/linux/sched/isolation.h:58:17: error: implicit declaration of function 'housekeeping_test_cpu'; did you mean 'housekeeping_any_cpu'? [-Werror=implicit-function-declaration]
58 | return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
| ^~~~~~~~~~~~~~~~~~~~~
| housekeeping_any_cpu
kernel/sched/fair.c: At top level:
kernel/sched/fair.c:688:5: warning: no previous prototype for 'sched_update_scaling' [-Wmissing-prototypes]
688 | int sched_update_scaling(void)
| ^~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:6067:6: warning: no previous prototype for 'init_cfs_bandwidth' [-Wmissing-prototypes]
6067 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
| ^~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12493:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
12493 | void free_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12495:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
12495 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12500:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
12500 | void online_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:12502:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
12502 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +58 include/linux/sched/isolation.h
55
56 static inline bool cpu_is_isolated(int cpu)
57 {
> 58 return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
59 !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
60 }
61
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
Hi Frederic,
I love your patch! Yet something to improve:
[auto build test ERROR on tip/sched/core]
[also build test ERROR on horms-ipvs-next/master horms-ipvs/master linus/master v6.2-rc6 next-20230203]
[cannot apply to paulmck-rcu/dev]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Frederic-Weisbecker/sched-isolation-Merge-individual-nohz_full-features-into-a-common-housekeeping-flag/20230204-072510
patch link: https://lore.kernel.org/r/20230203232409.163847-3-frederic%40kernel.org
patch subject: [PATCH 2/2] sched/isolation: Add cpu_is_isolated() API
config: x86_64-randconfig-a001 (https://download.01.org/0day-ci/archive/20230205/[email protected]/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/89596a035dc10e00cb66d4e75e49d69b75413807
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Frederic-Weisbecker/sched-isolation-Merge-individual-nohz_full-features-into-a-common-housekeeping-flag/20230204-072510
git checkout 89596a035dc10e00cb66d4e75e49d69b75413807
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
In file included from drivers/pci/pci-driver.c:15:
>> include/linux/sched/isolation.h:58:10: error: implicit declaration of function 'housekeeping_test_cpu' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
^
include/linux/sched/isolation.h:58:10: note: did you mean 'housekeeping_any_cpu'?
include/linux/sched/isolation.h:27:19: note: 'housekeeping_any_cpu' declared here
static inline int housekeeping_any_cpu(enum hk_type type)
^
1 error generated.
--
In file included from kernel/sched/fair.c:38:
>> include/linux/sched/isolation.h:58:10: error: implicit declaration of function 'housekeeping_test_cpu' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
^
include/linux/sched/isolation.h:58:10: note: did you mean 'housekeeping_any_cpu'?
include/linux/sched/isolation.h:27:19: note: 'housekeeping_any_cpu' declared here
static inline int housekeeping_any_cpu(enum hk_type type)
^
kernel/sched/fair.c:688:5: warning: no previous prototype for function 'sched_update_scaling' [-Wmissing-prototypes]
int sched_update_scaling(void)
^
kernel/sched/fair.c:688:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
int sched_update_scaling(void)
^
static
1 warning and 1 error generated.
vim +/housekeeping_test_cpu +58 include/linux/sched/isolation.h
55
56 static inline bool cpu_is_isolated(int cpu)
57 {
> 58 return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
59 !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
60 }
61
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
Hello.
On Fri, Feb 03, 2023 at 10:53:46PM -0500, Waiman Long <[email protected]> wrote:
> CPUs in an isolated cpuset partition is similar to HK_TYPE_DOMAIN CPUs as
> load balancing is disabled. I can add an API to access the cpumask and add
> to this API. However, that list is dynamic as it can be changed at run time.
> Will that be a problem?
I can see a problem already -- as a CPU can be dynamically switched to
"isolated" mode so should all dependent operations support that (switch)
too, i.e. the CPUs local PCP caches would have to be drained when the
CPU enters isolation.
> Or should that be used separately?
It'd be nice to have both (cpuset and cmdline flags) eventually unified.
Alas, it only leads me conservatively to:
#ifndef CONFIG_CPUSETS
// the proposed implementaion
else
static inline bool cpu_is_isolated(int cpu) {
return true;
}
#endif
My 0.02€,
Michal
Hello Frederic.
On Sat, Feb 04, 2023 at 12:24:08AM +0100, Frederic Weisbecker <[email protected]> wrote:
> The individual isolation features turned on by nohz_full were initially
> split in order for each of them to be tunable through cpusets. However
> plans have changed in favour of an interface (be it cpusets or sysctl)
> grouping all these features to be turned on/off altogether.
> Then should the need ever arise, the interface can still be expanded
> to handle the individual isolation features.
>
> Therefore the current isolation split between tick/timer/workqueue/rcu/
> kthreads/misc doesn't make sense anymore.
Why it doesn't make sense? I think it's a useful annotation of
respective operations wrt CPU isolation.
The grouping you did into HK_TYPE_KERNEL_NOISE (or even coarser) should
IMO be done at the place where it'll be exposed into the favored
interface (like it's with nohz_full=).
Thanks,
Michal
On 2/6/23 10:47, Michal Koutný wrote:
> Hello.
>
> On Fri, Feb 03, 2023 at 10:53:46PM -0500, Waiman Long <[email protected]> wrote:
>> CPUs in an isolated cpuset partition is similar to HK_TYPE_DOMAIN CPUs as
>> load balancing is disabled. I can add an API to access the cpumask and add
>> to this API. However, that list is dynamic as it can be changed at run time.
>> Will that be a problem?
> I can see a problem already -- as a CPU can be dynamically switched to
> "isolated" mode so should all dependent operations support that (switch)
> too, i.e. the CPUs local PCP caches would have to be drained when the
> CPU enters isolation.
I see the long term goal is to have more isolation capability to be done
dynamically. However, we are not there yet. There is still a lot of work
to do to achieve that.
>
>> Or should that be used separately?
> It'd be nice to have both (cpuset and cmdline flags) eventually unified.
>
> Alas, it only leads me conservatively to:
>
> #ifndef CONFIG_CPUSETS
> // the proposed implementaion
> else
> static inline bool cpu_is_isolated(int cpu) {
> return true;
> }
> #endif
That is too conservative from my point of view. We can have further
discussion when a patch is ready.
Cheers,
Longman
On Mon, Feb 06, 2023 at 04:51:09PM +0100, Michal Koutn? wrote:
> Hello Frederic.
>
> On Sat, Feb 04, 2023 at 12:24:08AM +0100, Frederic Weisbecker <[email protected]> wrote:
> > The individual isolation features turned on by nohz_full were initially
> > split in order for each of them to be tunable through cpusets. However
> > plans have changed in favour of an interface (be it cpusets or sysctl)
> > grouping all these features to be turned on/off altogether.
> > Then should the need ever arise, the interface can still be expanded
> > to handle the individual isolation features.
> >
> > Therefore the current isolation split between tick/timer/workqueue/rcu/
> > kthreads/misc doesn't make sense anymore.
>
> Why it doesn't make sense? I think it's a useful annotation of
> respective operations wrt CPU isolation.
But what do we need these annotations for? The only outcome I've ever
seen with these is that it confuses everyone.
>
> The grouping you did into HK_TYPE_KERNEL_NOISE (or even coarser) should
> IMO be done at the place where it'll be exposed into the favored
> interface (like it's with nohz_full=).
That being said I should reserve the grouping to HK_TYPE_KERNEL_NOISE when
I'll introduce the cpuset interface. This way I can add the support for
each part smoothly. For example first patch moves HK_TYPE_TIMER to
HK_TYPE_KERNEL_NOISE and unbound timers are supported by cpuset.kernel_noise,
second patch moves HK_TYPE_WQ to HK_TYPE_KERNEL_NOISE and unbound workqueues
are supported by cpuset.kernel_noise, etc until all of them turned by nohz_full=
are supported... This is what I'm doing in fact but I'm so slow to write this patchset...
Thanks.
On Fri, Feb 03, 2023 at 10:53:46PM -0500, Waiman Long wrote:
> On 2/3/23 18:24, Frederic Weisbecker wrote:
> > Provide this new API to check if a CPU has been isolated either through
> > isolcpus= or nohz_full= kernel parameter.
> >
> > It aims at avoiding kernel load deemed to be safely spared on CPUs
> > running sensitive workload that can't bear any disturbance, such as
> > pcp cache draining.
> >
> > Suggested-by: Michal Hocko <[email protected]>
> > Signed-off-by: Frederic Weisbecker <[email protected]>
> > ---
> > include/linux/sched/isolation.h | 6 ++++++
> > 1 file changed, 6 insertions(+)
> >
> > diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
> > index b645cc81fe01..088672f08469 100644
> > --- a/include/linux/sched/isolation.h
> > +++ b/include/linux/sched/isolation.h
> > @@ -53,4 +53,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
> > return true;
> > }
> > +static inline bool cpu_is_isolated(int cpu)
> > +{
> > + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
> > + !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
> > +}
> > +
> > #endif /* _LINUX_SCHED_ISOLATION_H */
>
> CPUs in an isolated cpuset partition is similar to HK_TYPE_DOMAIN CPUs as
> load balancing is disabled. I can add an API to access the cpumask and add
> to this API. However, that list is dynamic as it can be changed at run time.
> Will that be a problem? Or should that be used separately?
So that's what I intended first but the dynamic part of cpuset made me
postpone that to better days.
But yes ideally it should look like:
static inline bool cpu_is_isolated(int cpu)
{
return !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE) ||
on_null_domain(cpu_rq(cpu));
}
And there should be a hook in something like detach_destroy_domains()
to flush the pcp cache when a CPU is attached to a NULL domain.
All that with proper RCU synchronization:
UPDATE READER
------ ------
rcu_assign_pointer(cpu_rq(cpu)->sd, NULL); rcu_read_lock();
synchronize_rcu(); if (!cpu_is_isolated(cpu))
stock = &per_cpu(memcg_stock, cpu); schedule_work_on(cpu, &stock->work);
flush_work(&stock->work); rcu_read_unlock()
Thanks.
On Tue, Feb 07, 2023 at 12:49:41PM +0100, Frederic Weisbecker <[email protected]> wrote:
> But what do we need these annotations for? The only outcome I've ever
> seen with these is that it confuses everyone.
Take that as a note of a lone actor then who found it useful documenting
relations between various parts of the code.
> This way I can add the support for each part smoothly.
Yeah, that makes sense.
> For example first patch moves HK_TYPE_TIMER to HK_TYPE_KERNEL_NOISE
> and unbound timers are supported by cpuset.kernel_noise, second patch
> moves HK_TYPE_WQ to HK_TYPE_KERNEL_NOISE and unbound workqueues are
> supported by cpuset.kernel_noise, etc until all of them turned by
> nohz_full= are supported...
So does this mean you'll re-introduce the finer grained HK_* flags
again?
The idea (not only mine?) is that this would extend
cpuset.cpus.partition that only allows HK_TYPE_DOMAIN analogy. The
mapping to individual flags may not be exposed to users. The graduality
could be achieved by adding more flags under user_exposed_term.
Just to be on the same page -- that's how I understand it, the original
HK_* resolution turned out impractical for users and that's why the
direction is towards some loose combinations representing user
intentions. Is that right?
Cheers,
Michal
On 2/7/23 07:59, Michal Koutný wrote:
> On Tue, Feb 07, 2023 at 12:49:41PM +0100, Frederic Weisbecker <[email protected]> wrote:
>> But what do we need these annotations for? The only outcome I've ever
>> seen with these is that it confuses everyone.
> Take that as a note of a lone actor then who found it useful documenting
> relations between various parts of the code.
>
>> This way I can add the support for each part smoothly.
> Yeah, that makes sense.
>
>> For example first patch moves HK_TYPE_TIMER to HK_TYPE_KERNEL_NOISE
>> and unbound timers are supported by cpuset.kernel_noise, second patch
>> moves HK_TYPE_WQ to HK_TYPE_KERNEL_NOISE and unbound workqueues are
>> supported by cpuset.kernel_noise, etc until all of them turned by
>> nohz_full= are supported...
> So does this mean you'll re-introduce the finer grained HK_* flags
> again?
>
> The idea (not only mine?) is that this would extend
> cpuset.cpus.partition that only allows HK_TYPE_DOMAIN analogy. The
> mapping to individual flags may not be exposed to users. The graduality
> could be achieved by adding more flags under user_exposed_term.
>
> Just to be on the same page -- that's how I understand it, the original
> HK_* resolution turned out impractical for users and that's why the
> direction is towards some loose combinations representing user
> intentions. Is that right?
What I am envisioning is to have additional isolation attributes to an
isolated partition that correspond to what a user can specify on the
kernel command line today. That requires the minimal amount of learning
from the user community. Any finer grained separation of isolation
features will just confuse user. I don't see a problem with a generic
HK_TYPE_KERNEL_NOISE that gets enabled when an attribute that correspond
to nohz_full is used.
Cheers,
Longman
On Sat 04-02-23 00:24:09, Frederic Weisbecker wrote:
> Provide this new API to check if a CPU has been isolated either through
> isolcpus= or nohz_full= kernel parameter.
>
> It aims at avoiding kernel load deemed to be safely spared on CPUs
> running sensitive workload that can't bear any disturbance, such as
> pcp cache draining.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Frederic Weisbecker <[email protected]>
Is there any locking required? I do not think so as these should be
boot time configured AFAIR. From the discussion around this I have
understood that this might change in the future once cpusets gain a
better isolation support. Maybe this should be documented at this stage?
Thanks!
> ---
> include/linux/sched/isolation.h | 6 ++++++
> 1 file changed, 6 insertions(+)
>
> diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
> index b645cc81fe01..088672f08469 100644
> --- a/include/linux/sched/isolation.h
> +++ b/include/linux/sched/isolation.h
> @@ -53,4 +53,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
> return true;
> }
>
> +static inline bool cpu_is_isolated(int cpu)
> +{
> + return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
> + !housekeeping_test_cpu(cpu, HK_TYPE_KERNEL_NOISE);
> +}
> +
> #endif /* _LINUX_SCHED_ISOLATION_H */
> --
> 2.34.1
--
Michal Hocko
SUSE Labs