2023-12-21 16:40:35

by Vincent Guittot

Subject: [PATCH] sched: Fix tg->load when offlining a CPU

When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
load may remain and impact the calculation of the share of the online
CPUs.
Clear the contribution of an offlining CPU to task groups' load and skip
its contribution while it is inactive.
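For context, each CPU's share of a task group's weight is derived from the
ratio of that CPU's contribution to the group-wide sum. Roughly, simplifying
calc_group_shares() in kernel/sched/fair.c and ignoring its clamping and
scaling:

        shares = tg->shares * cfs_rq->avg.load_avg /
                 atomic_long_read(&tg->load_avg);

A stale contribution left in tg->load_avg by an offline CPU therefore
inflates the denominator and shrinks the share of every online CPU.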

Reported-by: Imran Khan <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 07200b4751b3..07edb7dce671 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4094,6 +4094,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;

+ /* rq has been offline and doesn't contribute anymore to the share */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
/*
* For migration heavy workloads, access to tg->load_avg can be
* unbound. Limit the update rate to at most once per ms.
@@ -4110,6 +4114,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
}

+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* cpu offline callback */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
@@ -12423,6 +12470,9 @@ static void rq_offline_fair(struct rq *rq)

/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share */
+ clear_tg_offline_cfs_rqs(rq);
}

#endif /* CONFIG_SMP */
--
2.34.1
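[ For reference: tg->load_avg is normally kept in sync by
  update_tg_load_avg(), which publishes the delta between the cfs_rq's
  current load and its last published contribution. A simplified sketch of
  that existing logic, with the once-per-ms rate limit from the first hunk
  elided:

        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

        if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }

  The cpu_active() bail-out above stops an offlined CPU from publishing new
  deltas, and clear_tg_load_avg() retracts whatever it last published. ]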



2023-12-21 17:05:22

by Imran Khan

Subject: Re: [PATCH] sched: Fix tg->load when offlining a CPU

Thanks again for the fix, Vincent.

On 22/12/2023 3:40 am, Vincent Guittot wrote:
> When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
> load may remain and impact the calculation of the share of the online
> CPUs.
> Clear the contribution of an offlining CPU to task groups' load and skip
> its contribution while it is inactive.
>
> Reported-by: Imran Khan <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>

Reviewed-and-tested-by: Imran Khan <[email protected]>

> ---
> kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 50 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 07200b4751b3..07edb7dce671 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4094,6 +4094,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> if (cfs_rq->tg == &root_task_group)
> return;
>
> + /* rq has been offline and doesn't contribute anymore to the share */
> + if (!cpu_active(cpu_of(rq_of(cfs_rq))))
> + return;
> +
> /*
> * For migration heavy workloads, access to tg->load_avg can be
> * unbound. Limit the update rate to at most once per ms.
> @@ -4110,6 +4114,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> }
> }
>
> +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
> +{
> + long delta;
> + u64 now;
> +
> + /*
> + * No need to update load_avg for root_task_group as it is not used.
> + */
> + if (cfs_rq->tg == &root_task_group)
> + return;
> +
> + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
> + delta = 0 - cfs_rq->tg_load_avg_contrib;
> + atomic_long_add(delta, &cfs_rq->tg->load_avg);
> + cfs_rq->tg_load_avg_contrib = 0;
> + cfs_rq->last_update_tg_load_avg = now;
> +}
> +
> +/* cpu offline callback */
> +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
> +{
> + struct task_group *tg;
> +
> + lockdep_assert_rq_held(rq);
> +
> + /*
> + * The rq clock has already been updated in the
> + * set_rq_offline(), so we should skip updating
> + * the rq clock again in unthrottle_cfs_rq().
> + */
> + rq_clock_start_loop_update(rq);
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(tg, &task_groups, list) {
> + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +
> + clear_tg_load_avg(cfs_rq);
> + }
> + rcu_read_unlock();
> +
> + rq_clock_stop_loop_update(rq);
> +}
> +
> /*
> * Called within set_task_rq() right before setting a task's CPU. The
> * caller only guarantees p->pi_lock is held; no other assumptions,
> @@ -12423,6 +12470,9 @@ static void rq_offline_fair(struct rq *rq)
>
> /* Ensure any throttled groups are reachable by pick_next_task */
> unthrottle_offline_cfs_rqs(rq);
> +
> + /* Ensure that we remove rq contribution to group share */
> + clear_tg_offline_cfs_rqs(rq);
> }
>
> #endif /* CONFIG_SMP */

2023-12-22 23:42:15

by kernel test robot

Subject: Re: [PATCH] sched: Fix tg->load when offlining a CPU

Hi Vincent,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on peterz-queue/sched/core linus/master v6.7-rc6 next-20231222]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Vincent-Guittot/sched-Fix-tg-load-when-offlining-a-CPU/20231222-172438
base: tip/sched/core
patch link: https://lore.kernel.org/r/20231221164014.447122-1-vincent.guittot%40linaro.org
patch subject: [PATCH] sched: Fix tg->load when offlining a CPU
config: hexagon-defconfig (https://download.01.org/0day-ci/archive/20231223/[email protected]/config)
compiler: clang version 18.0.0git (https://github.com/llvm/llvm-project d3ef86708241a3bee902615c190dead1638c4e09)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231223/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

In file included from kernel/sched/fair.c:28:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:337:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
547 | val = __raw_readb(PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
560 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
| ^
In file included from kernel/sched/fair.c:28:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:337:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
573 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
| ^
In file included from kernel/sched/fair.c:28:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:337:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
584 | __raw_writeb(value, PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
594 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
604 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
>> kernel/sched/fair.c:12498:2: error: call to undeclared function 'clear_tg_offline_cfs_rqs'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
12498 | clear_tg_offline_cfs_rqs(rq);
| ^
kernel/sched/fair.c:12498:2: note: did you mean 'unthrottle_offline_cfs_rqs'?
kernel/sched/fair.c:6620:20: note: 'unthrottle_offline_cfs_rqs' declared here
6620 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
| ^
kernel/sched/fair.c:13124:6: warning: no previous prototype for function 'free_fair_sched_group' [-Wmissing-prototypes]
13124 | void free_fair_sched_group(struct task_group *tg) { }
| ^
kernel/sched/fair.c:13124:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
13124 | void free_fair_sched_group(struct task_group *tg) { }
| ^
| static
kernel/sched/fair.c:13126:5: warning: no previous prototype for function 'alloc_fair_sched_group' [-Wmissing-prototypes]
13126 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^
kernel/sched/fair.c:13126:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
13126 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^
| static
kernel/sched/fair.c:13131:6: warning: no previous prototype for function 'online_fair_sched_group' [-Wmissing-prototypes]
13131 | void online_fair_sched_group(struct task_group *tg) { }
| ^
kernel/sched/fair.c:13131:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
13131 | void online_fair_sched_group(struct task_group *tg) { }
| ^
| static
kernel/sched/fair.c:13133:6: warning: no previous prototype for function 'unregister_fair_sched_group' [-Wmissing-prototypes]
13133 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^
kernel/sched/fair.c:13133:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
13133 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^
| static
10 warnings and 1 error generated.


vim +/clear_tg_offline_cfs_rqs +12498 kernel/sched/fair.c

12489
12490 static void rq_offline_fair(struct rq *rq)
12491 {
12492 update_sysctl();
12493
12494 /* Ensure any throttled groups are reachable by pick_next_task */
12495 unthrottle_offline_cfs_rqs(rq);
12496
12497 /* Ensure that we remove rq contribution to group share */
12498 clear_tg_offline_cfs_rqs(rq);
12499 }
12500

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
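[ Both robot reports, this one and the arc/gcc one that follows, point at the
  same root cause: clear_tg_offline_cfs_rqs() is defined in the
  CONFIG_FAIR_GROUP_SCHED section of fair.c but called unconditionally from
  rq_offline_fair(). A plausible fix, mirroring the existing
  unthrottle_offline_cfs_rqs() stub that the compiler note points at (a
  sketch, not a posted v2):

        #else /* !CONFIG_FAIR_GROUP_SCHED */

        static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}

        #endif /* CONFIG_FAIR_GROUP_SCHED */
  ]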

2023-12-23 07:06:24

by kernel test robot

Subject: Re: [PATCH] sched: Fix tg->load when offlining a CPU

Hi Vincent,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on peterz-queue/sched/core linus/master v6.7-rc6 next-20231222]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Vincent-Guittot/sched-Fix-tg-load-when-offlining-a-CPU/20231222-172438
base: tip/sched/core
patch link: https://lore.kernel.org/r/20231221164014.447122-1-vincent.guittot%40linaro.org
patch subject: [PATCH] sched: Fix tg->load when offlining a CPU
config: arc-defconfig (https://download.01.org/0day-ci/archive/20231223/[email protected]/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231223/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

kernel/sched/fair.c: In function 'rq_offline_fair':
>> kernel/sched/fair.c:12498:9: error: implicit declaration of function 'clear_tg_offline_cfs_rqs'; did you mean 'unthrottle_offline_cfs_rqs'? [-Werror=implicit-function-declaration]
12498 | clear_tg_offline_cfs_rqs(rq);
| ^~~~~~~~~~~~~~~~~~~~~~~~
| unthrottle_offline_cfs_rqs
kernel/sched/fair.c: At top level:
kernel/sched/fair.c:13124:6: warning: no previous prototype for 'free_fair_sched_group' [-Wmissing-prototypes]
13124 | void free_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:13126:5: warning: no previous prototype for 'alloc_fair_sched_group' [-Wmissing-prototypes]
13126 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
| ^~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:13131:6: warning: no previous prototype for 'online_fair_sched_group' [-Wmissing-prototypes]
13131 | void online_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~
kernel/sched/fair.c:13133:6: warning: no previous prototype for 'unregister_fair_sched_group' [-Wmissing-prototypes]
13133 | void unregister_fair_sched_group(struct task_group *tg) { }
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors


vim +12498 kernel/sched/fair.c

12489
12490 static void rq_offline_fair(struct rq *rq)
12491 {
12492 update_sysctl();
12493
12494 /* Ensure any throttled groups are reachable by pick_next_task */
12495 unthrottle_offline_cfs_rqs(rq);
12496
12497 /* Ensure that we remove rq contribution to group share */
12498 clear_tg_offline_cfs_rqs(rq);
12499 }
12500

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-12-23 17:30:54

by tip-bot2 for Vincent Guittot

Subject: [tip: sched/urgent] sched/fair: Fix tg->load when offlining a CPU

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: 4c0a74642c4260bd8e5b6eb4d029a390435ac8eb
Gitweb: https://git.kernel.org/tip/4c0a74642c4260bd8e5b6eb4d029a390435ac8eb
Author: Vincent Guittot <[email protected]>
AuthorDate: Thu, 21 Dec 2023 17:40:14 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Sat, 23 Dec 2023 17:16:14 +01:00

sched/fair: Fix tg->load when offlining a CPU

When a CPU is taken offline, the contribution of its cfs_rqs to task_groups'
load may remain and will negatively impact the calculation of the share of
the online CPUs.

To fix this bug, clear the contribution of an offlining CPU to task groups'
load and skip its contribution while it is inactive.

Here's the reproducer of the anomaly, by Imran Khan:

"So far I have encountered only one rather lengthy way of reproducing this issue,
which is as follows:

1. Take a KVM guest (booted with 4 CPUs and scalable up to 124 CPUs) and
create 2 custom cgroups: /sys/fs/cgroup/cpu/test_group_1 and
/sys/fs/cgroup/cpu/test_group_2

2. Assign a CPU-intensive workload to each of these cgroups and start the
workload.

For my tests I am using the following app:

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
        unsigned long count, i, val;

        if (argc != 2) {
                printf("usage: ./a.out <number of random nums to generate> \n");
                return 0;
        }

        count = strtoul(argv[1], NULL, 10);

        printf("Generating %lu random numbers \n", count);
        for (i = 0; i < count; i++) {
                val = rand();
                val = val % 2;
                //usleep(1);
        }
        printf("Generated %lu random numbers \n", count);
        return 0;
}

Also, since the system is booted with 4 CPUs, in order to completely load the
system I am also launching 4 instances of the same test app under:

/sys/fs/cgroup/cpu/
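
One way to wire this up, assuming the cgroup v1 cpu controller; the file and
binary names here are illustrative, not taken from the report:

        gcc -o cpu_hogger cpu_hogger.c
        ./cpu_hogger 4000000000 &
        echo $! > /sys/fs/cgroup/cpu/test_group_1/tasks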

3. We can see that both of the cgroups get similar CPU time:

# systemd-cgtop --depth 1
Path             Tasks   %CPU   Memory Input/s Output/s
/                  659      -     5.5G       -        -
/system.slice        -      -     5.7G       -        -
/test_group_1        4      -        -       -        -
/test_group_2        3      -        -       -        -
/user.slice         31      -    56.5M       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                  659  394.6     5.5G       -        -
/test_group_2        3   65.7        -       -        -
/user.slice         29   55.1    48.0M       -        -
/test_group_1        4   47.3        -       -        -
/system.slice        -    2.2     5.7G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                  659  394.8     5.5G       -        -
/test_group_1        4   62.9        -       -        -
/user.slice         28   44.9    54.2M       -        -
/test_group_2        3   44.7        -       -        -
/system.slice        -    0.9     5.7G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                  659  394.4     5.5G       -        -
/test_group_2        3   58.8        -       -        -
/test_group_1        4   51.9        -       -        -
/user.slice         30   39.3    59.6M       -        -
/system.slice        -    1.9     5.7G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                  659  394.7     5.5G       -        -
/test_group_1        4   60.9        -       -        -
/test_group_2        3   57.9        -       -        -
/user.slice         28   43.5    36.9M       -        -
/system.slice        -    3.0     5.7G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                  659  395.0     5.5G       -        -
/test_group_1        4   66.8        -       -        -
/test_group_2        3   56.3        -       -        -
/user.slice         29   43.1    51.8M       -        -
/system.slice        -    0.7     5.7G       -        -

4. Now move systemd-udevd to one of these test groups, say test_group_1, and
perform a scale-up to 124 CPUs followed by a scale-down back to 4 CPUs from
the host side.
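
(One possible way to drive the scale up/down, assuming a libvirt-managed
guest; the domain name is a placeholder:

        virsh setvcpus guest1 124 --live    # host side: scale up
        virsh setvcpus guest1 4 --live      # host side: scale back down

Individual CPUs can equivalently be offlined from inside the guest with
'echo 0 > /sys/devices/system/cpu/cpuN/online'.)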

5. Run the same workload, i.e. 4 instances of the CPU hogger under /sys/fs/cgroup/cpu
and one instance of the CPU hogger each in /sys/fs/cgroup/cpu/test_group_1 and
/sys/fs/cgroup/cpu/test_group_2.

It can be seen that test_group_1 (the one where systemd-udevd was moved) is getting
much less CPU time than test_group_2, even though at this point both of
these groups only have the CPU hogger running:

# systemd-cgtop --depth 1
Path             Tasks   %CPU   Memory Input/s Output/s
/                 1219      -     5.4G       -        -
/system.slice        -      -     5.6G       -        -
/test_group_1        4      -        -       -        -
/test_group_2        3      -        -       -        -
/user.slice         26      -    91.3M       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  394.3     5.4G       -        -
/test_group_2        3   82.7        -       -        -
/test_group_1        4   14.3        -       -        -
/system.slice        -    0.8     5.6G       -        -
/user.slice         26    0.4    91.2M       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  394.6     5.4G       -        -
/test_group_2        3   67.4        -       -        -
/system.slice        -   24.6     5.6G       -        -
/test_group_1        4   12.5        -       -        -
/user.slice         26    0.4    91.2M       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  395.2     5.4G       -        -
/test_group_2        3   60.9        -       -        -
/system.slice        -   27.9     5.6G       -        -
/test_group_1        4   12.2        -       -        -
/user.slice         26    0.4    91.2M       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  395.2     5.4G       -        -
/test_group_2        3   69.4        -       -        -
/test_group_1        4   13.9        -       -        -
/user.slice         28    1.6    92.0M       -        -
/system.slice        -    1.0     5.6G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  395.6     5.4G       -        -
/test_group_2        3   59.3        -       -        -
/test_group_1        4   14.1        -       -        -
/user.slice         28    1.3    92.2M       -        -
/system.slice        -    0.7     5.6G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  395.5     5.4G       -        -
/test_group_2        3   67.2        -       -        -
/test_group_1        4   11.5        -       -        -
/user.slice         28    1.3    92.5M       -        -
/system.slice        -    0.6     5.6G       -        -

Path             Tasks   %CPU   Memory Input/s Output/s
/                 1221  395.1     5.4G       -        -
/test_group_2        3   76.8        -       -        -
/test_group_1        4   12.9        -       -        -
/user.slice         28    1.3    92.8M       -        -
/system.slice        -    1.2     5.6G       -        -

From the sched_debug data it can be seen that in the bad case the load.weight
of the per-CPU sched entities corresponding to test_group_1 has reduced
significantly, and the load_avg of test_group_1 remains much higher than that
of test_group_2, even though systemd-udevd stopped running a long time back
and at this point both cgroups just have the CPU hogger app as their running
entity."
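
[ The per-group PELT fields mentioned above can be inspected through the
  scheduler debug interface: /proc/sched_debug on older kernels,
  /sys/kernel/debug/sched/debug on newer ones. An illustrative way to read
  them, with placeholder values rather than output from this report:

        grep -A20 'cfs_rq\[0\]:/test_group_1' /proc/sched_debug
        ...
        .tg_load_avg_contrib  : <this CPU's published contribution>
        .tg_load_avg          : <group-wide sum; stays inflated in the bad case>
  ]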

[ mingo: Added details from the original discussion, plus minor edits to the patch. ]

Reported-by: Imran Khan <[email protected]>
Tested-by: Imran Khan <[email protected]>
Tested-by: Aaron Lu <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63..cda7726 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4096,6 +4096,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
if (cfs_rq->tg == &root_task_group)
return;

+ /* rq has been offline and doesn't contribute to the share anymore: */
+ if (!cpu_active(cpu_of(rq_of(cfs_rq))))
+ return;
+
/*
* For migration heavy workloads, access to tg->load_avg can be
* unbound. Limit the update rate to at most once per ms.
@@ -4112,6 +4116,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
}
}

+static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
+{
+ long delta;
+ u64 now;
+
+ /*
+ * No need to update load_avg for root_task_group, as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
+ now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+ delta = 0 - cfs_rq->tg_load_avg_contrib;
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = 0;
+ cfs_rq->last_update_tg_load_avg = now;
+}
+
+/* CPU offline callback: */
+static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
+{
+ struct task_group *tg;
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * The rq clock has already been updated in
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ clear_tg_load_avg(cfs_rq);
+ }
+ rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
+}
+
/*
* Called within set_task_rq() right before setting a task's CPU. The
* caller only guarantees p->pi_lock is held; no other assumptions,
@@ -12413,6 +12460,9 @@ static void rq_offline_fair(struct rq *rq)

/* Ensure any throttled groups are reachable by pick_next_task */
unthrottle_offline_cfs_rqs(rq);
+
+ /* Ensure that we remove rq contribution to group share: */
+ clear_tg_offline_cfs_rqs(rq);
}

#endif /* CONFIG_SMP */