When using sysbench to benchmark Postgres in a single docker instance
with sysbench's nr_threads set to nr_cpu, it is observed that there are times
when update_cfs_group() and update_load_avg() show noticeable overhead on
cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
While cpus of the other node normally see a lower cycle percent:
4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
Annotate shows the cycles are mostly spent on accessing tg->load_avg
with update_load_avg() being the write side and update_cfs_group() being
the read side.
The reason why only cpus of one node have the bigger overhead is: task_group
is allocated on demand from a slab, and whichever cpu happens to do the
allocation, the allocated tg will be located on that node. Accessing
tg->load_avg therefore has a lower cost for cpus on the same node and
a higher cost for cpus of the remote node.
Tim Chen told me that PeterZ once mentioned a way to solve a similar
problem by making a counter per node, so do the same for tg->load_avg.
After this change, the worst numbers I saw during a 5-minute run from
both nodes are:
2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
Another observation of this workload is: it has a lot of wakeup-time
task migrations, and that is the reason why update_load_avg() and
update_cfs_group() show noticeable cost. Running this workload in an
N-instance setup where N >= 2, with sysbench's nr_threads set to 1/N nr_cpu,
task migrations at wakeup time are greatly reduced and the overhead from
the two above mentioned functions also drops a lot. It's not yet clear to
me why running in multiple instances reduces task migrations on the
wakeup path.
Reported-by: Nitin Tekchandani <[email protected]>
Signed-off-by: Aaron Lu <[email protected]>
---
kernel/sched/core.c | 24 +++++++++++++++++-------
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 5 +++--
kernel/sched/sched.h | 32 ++++++++++++++++++++++++--------
4 files changed, 45 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a4918a1faa9..531d465038d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9759,9 +9759,6 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
-
-/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
#endif
void __init sched_init(void)
@@ -9820,8 +9817,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
- task_group_cache = KMEM_CACHE(task_group, 0);
-
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
@@ -10219,7 +10214,6 @@ static void sched_free_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kmem_cache_free(task_group_cache, tg);
}
static void sched_free_group_rcu(struct rcu_head *rcu)
@@ -10241,11 +10235,27 @@ static void sched_unregister_group(struct task_group *tg)
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
+ size_t size = sizeof(struct task_group);
+ int __maybe_unused i, nodes;
struct task_group *tg;
- tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ nodes = num_possible_nodes();
+ size += nodes * sizeof(void *);
+ tg = kzalloc(size, GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_node(i) {
+ tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
+ if (!tg->node_info[i])
+ return ERR_PTR(-ENOMEM);
+ }
+#else
+ tg = kzalloc(size, GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
+#endif
if (!alloc_fair_sched_group(tg, parent))
goto err;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..2f20728aa093 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
- atomic_long_read(&cfs_rq->tg->load_avg));
+ tg_load_avg(cfs_rq->tg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..68ac015fab6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3439,7 +3439,7 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
- tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight = tg_load_avg(tg);
/* Ensure tg_weight >= load */
tg_weight -= cfs_rq->tg_load_avg_contrib;
@@ -3608,6 +3608,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ int node = cpu_to_node(cfs_rq->rq->cpu);
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3616,7 +3617,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
return;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
- atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..11a1aed4e8f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,6 +365,14 @@ struct cfs_bandwidth {
#endif
};
+struct tg_node_info {
+ /*
+ * load_avg can be heavily contended at clock tick and task
+ * enqueue/dequeue time, so put it in its own cacheline.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
+};
+
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -379,14 +387,6 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
-#ifdef CONFIG_SMP
- /*
- * load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
- * will also be accessed at each tick.
- */
- atomic_long_t load_avg ____cacheline_aligned;
-#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -418,8 +418,24 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ struct tg_node_info *node_info[];
+#endif
};
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+static inline long tg_load_avg(struct task_group *tg)
+{
+ long load_avg = 0;
+ int i;
+
+ for_each_node(i)
+ load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
+
+ return load_avg;
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
--
2.39.2
On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
> When using sysbench to benchmark Postgres in a single docker instance
> with sysbench's nr_threads set to nr_cpu, it is observed there are times
> update_cfs_group() and update_load_avg() shows noticeable overhead on
> cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
>
> 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
>
> While cpus of the other node normally sees a lower cycle percent:
>
> 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
>
> Annotate shows the cycles are mostly spent on accessing tg->load_avg
> with update_load_avg() being the write side and update_cfs_group() being
> the read side.
>
> The reason why only cpus of one node has bigger overhead is: task_group
> is allocated on demand from a slab and whichever cpu happens to do the
> allocation, the allocated tg will be located on that node and accessing
> to tg->load_avg will have a lower cost for cpus on the same node and
> a higer cost for cpus of the remote node.
>
> Tim Chen told me that PeterZ once mentioned a way to solve a similar
> problem by making a counter per node so do the same for tg->load_avg.
> After this change, the worst number I saw during a 5 minutes run from
> both nodes are:
>
> 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
>
> Another observation of this workload is: it has a lot of wakeup time
> task migrations and that is the reason why update_load_avg() and
> update_cfs_group() shows noticeable cost. Running this workload in N
> instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> task migrations on wake up time are greatly reduced and the overhead from
> the two above mentioned functions also dropped a lot. It's not clear to
> me why running in multiple instances can reduce task migrations on
> wakeup path yet.
>
Looks interesting. When sysbench is run as 1 instance with nr_threads = nr_cpu,
versus when more than 1 instance of sysbench is launched with nr_threads set
to 1/N * nr_cpu, do both cases have similar CPU utilization? Currently the
task wakeup path inhibits migration if the system is overloaded.
[...]
> struct task_group *sched_create_group(struct task_group *parent)
> {
> + size_t size = sizeof(struct task_group);
> + int __maybe_unused i, nodes;
> struct task_group *tg;
>
> - tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
> +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> + nodes = num_possible_nodes();
> + size += nodes * sizeof(void *);
> + tg = kzalloc(size, GFP_KERNEL);
> + if (!tg)
> + return ERR_PTR(-ENOMEM);
> +
> + for_each_node(i) {
> + tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
> + if (!tg->node_info[i])
> + return ERR_PTR(-ENOMEM);
Do we need to free tg above to avoid a memory leak?
thanks,
Chenyu
Hi Yu,
Thanks for taking a look.
On Mon, Mar 27, 2023 at 10:45:56PM +0800, Chen Yu wrote:
> On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
> > When using sysbench to benchmark Postgres in a single docker instance
> > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> >
> > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> >
> > While cpus of the other node normally sees a lower cycle percent:
> >
> > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> >
> > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > with update_load_avg() being the write side and update_cfs_group() being
> > the read side.
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> >
> > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > problem by making a counter per node so do the same for tg->load_avg.
> > After this change, the worst number I saw during a 5 minutes run from
> > both nodes are:
> >
> > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
> >
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
> >
> Looks interesting, when the sysbench is 1 instance and nr_threads = nr_cpu,
> and when the launches more than 1 instance of sysbench, while nr_threads set
> to 1/N * nr_cpu, do both cases have similar CPU utilization? Currently the
> task wakeup inhibits migration wakeup if the system is overloaded.
I think this is a good point. I did notice during a run that when CPU util
goes up, the migration number drops. And the 4-instance setup generally
has higher CPU util than the 1-instance setup.
I should also add that in the vanilla kernel, if tg is allocated on node 0,
then task migrations happening on the remote node are the deciding factor in
the increased cost of update_cfs_group() and update_load_avg(), because the
remote node has a higher cost of accessing tg->load_avg.
> [...]
> > struct task_group *sched_create_group(struct task_group *parent)
> > {
> > + size_t size = sizeof(struct task_group);
> > + int __maybe_unused i, nodes;
> > struct task_group *tg;
> >
> > - tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
> > +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> > + nodes = num_possible_nodes();
> > + size += nodes * sizeof(void *);
> > + tg = kzalloc(size, GFP_KERNEL);
> > + if (!tg)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + for_each_node(i) {
> > + tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
> > + if (!tg->node_info[i])
> > + return ERR_PTR(-ENOMEM);
> Do we need to free tg above in case of memory leak?
Good catch, will fix this in next posting, thanks!
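For reference, an untested sketch of the kind of cleanup being asked about,
i.e. on node_info allocation failure, free whatever was already allocated
before returning (the actual fix in the next posting may look different):

	for_each_node(i) {
		tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info),
						GFP_KERNEL, i);
		if (!tg->node_info[i])
			goto err_free;
	}
	...
err_free:
	/* tg was kzalloc()ed, so not-yet-allocated node_info slots are NULL and kfree(NULL) is a no-op */
	for_each_node(i)
		kfree(tg->node_info[i]);
	kfree(tg);
	return ERR_PTR(-ENOMEM);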
On 27/03/2023 07:39, Aaron Lu wrote:
> When using sysbench to benchmark Postgres in a single docker instance
> with sysbench's nr_threads set to nr_cpu, it is observed there are times
> update_cfs_group() and update_load_avg() shows noticeable overhead on
> cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
>
> 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
>
> While cpus of the other node normally sees a lower cycle percent:
>
> 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
>
> Annotate shows the cycles are mostly spent on accessing tg->load_avg
> with update_load_avg() being the write side and update_cfs_group() being
> the read side.
>
> The reason why only cpus of one node has bigger overhead is: task_group
> is allocated on demand from a slab and whichever cpu happens to do the
> allocation, the allocated tg will be located on that node and accessing
> to tg->load_avg will have a lower cost for cpus on the same node and
> a higer cost for cpus of the remote node.
>
> Tim Chen told me that PeterZ once mentioned a way to solve a similar
> problem by making a counter per node so do the same for tg->load_avg.
> After this change, the worst number I saw during a 5 minutes run from
> both nodes are:
>
> 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
>
> Another observation of this workload is: it has a lot of wakeup time
> task migrations and that is the reason why update_load_avg() and
> update_cfs_group() shows noticeable cost. Running this workload in N
> instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> task migrations on wake up time are greatly reduced and the overhead from
> the two above mentioned functions also dropped a lot. It's not clear to
> me why running in multiple instances can reduce task migrations on
> wakeup path yet.
>
> Reported-by: Nitin Tekchandani <[email protected]>
> Signed-off-by: Aaron Lu <[email protected]>
I'm so far not seeing this issue on my Arm64 server.
$ numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
44 45 46 47
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
68 69 70 71
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
92 93 94 95
node distances:
node 0 1 2 3
0: 10 12 20 22
1: 12 10 22 24
2: 20 22 10 12
3: 22 24 12 10
sysbench --table-size=100000 --tables=24 --threads=96 ...
/usr/share/sysbench/oltp_read_write.lua run
perf report | grep kernel | head
9.12% sysbench [kernel.vmlinux] [k] _raw_spin_unlock_irqrestore
5.26% sysbench [kernel.vmlinux] [k] finish_task_switch
1.56% sysbench [kernel.vmlinux] [k] __do_softirq
1.22% sysbench [kernel.vmlinux] [k] arch_local_irq_restore
1.12% sysbench [kernel.vmlinux] [k] __arch_copy_to_user
1.12% sysbench [kernel.vmlinux] [k] el0_svc_common.constprop.1
0.95% sysbench [kernel.vmlinux] [k] __fget_light
0.94% sysbench [kernel.vmlinux] [k] rwsem_spin_on_owner
0.85% sysbench [kernel.vmlinux] [k] tcp_ack
0.56% sysbench [kernel.vmlinux] [k] do_sys_poll
Is your postgres/sysbench running in a cgroup with cpu controller
attached? Mine isn't.
Maybe I'm doing something else differently?
Hi Dietmar,
Thanks for taking a look.
On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> On 27/03/2023 07:39, Aaron Lu wrote:
> > When using sysbench to benchmark Postgres in a single docker instance
> > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> >
> > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> >
> > While cpus of the other node normally sees a lower cycle percent:
> >
> > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> >
> > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > with update_load_avg() being the write side and update_cfs_group() being
> > the read side.
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> >
> > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > problem by making a counter per node so do the same for tg->load_avg.
> > After this change, the worst number I saw during a 5 minutes run from
> > both nodes are:
> >
> > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
> >
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
> >
> > Reported-by: Nitin Tekchandani <[email protected]>
> > Signed-off-by: Aaron Lu <[email protected]>
>
> I'm so far not seeing this issue on my Arm64 server.
>
> $ numactl -H
> available: 4 nodes (0-3)
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
> node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
> 44 45 46 47
> node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
> 68 69 70 71
> node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
> 92 93 94 95
> node distances:
> node 0 1 2 3
> 0: 10 12 20 22
> 1: 12 10 22 24
> 2: 20 22 10 12
> 3: 22 24 12 10
>
> sysbench --table-size=100000 --tables=24 --threads=96 ...
> /usr/share/sysbench/oltp_read_write.lua run
>
> perf report | grep kernel | head
>
> 9.12% sysbench [kernel.vmlinux] [k] _raw_spin_unlock_irqrestore
> 5.26% sysbench [kernel.vmlinux] [k] finish_task_switch
> 1.56% sysbench [kernel.vmlinux] [k] __do_softirq
> 1.22% sysbench [kernel.vmlinux] [k] arch_local_irq_restore
> 1.12% sysbench [kernel.vmlinux] [k] __arch_copy_to_user
> 1.12% sysbench [kernel.vmlinux] [k] el0_svc_common.constprop.1
> 0.95% sysbench [kernel.vmlinux] [k] __fget_light
> 0.94% sysbench [kernel.vmlinux] [k] rwsem_spin_on_owner
> 0.85% sysbench [kernel.vmlinux] [k] tcp_ack
> 0.56% sysbench [kernel.vmlinux] [k] do_sys_poll
Did you test with a v6.3-rc based kernel?
I encountered another problem on those kernels and had to temporarily use
a v6.2 based kernel, maybe you have to do the same:
https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
>
> Is your postgres/sysbench running in a cgroup with cpu controller
> attached? Mine isn't.
Yes, I had postgres and sysbench running in the same cgroup with cpu
controller enabled. docker created the cgroup directory under
/sys/fs/cgroup/system.slice/docker-XXX and cgroup.controllers has cpu
there.
>
> Maybe I'm doing something else differently?
Maybe. You didn't mention how you started postgres; if you start it from
the same session as sysbench and autogroup is enabled, then all those
tasks would be in the same autogroup taskgroup, which should have the
same effect as my setup.
Anyway, you can try the following steps to see if you can reproduce this
problem on your Arm64 server:
1 docker pull postgres
2 sudo docker run --rm --name postgres-instance -e POSTGRES_PASSWORD=mypass -e POSTGRES_USER=sbtest -d postgres -c shared_buffers=80MB -c max_connections=250
3 go inside the container
sudo docker exec -it $the_just_started_container_id bash
4 install sysbench inside container
apt update and apt install sysbench
5 prepare
root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua prepare
6 run
root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua run
Note that I used 224 threads where this problem is visible. I also tried
96, and update_cfs_group() and update_load_avg() cost about 1% of cycles then.
On 28/03/2023 14:56, Aaron Lu wrote:
> Hi Dietmar,
>
> Thanks for taking a look.
>
> On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
>> On 27/03/2023 07:39, Aaron Lu wrote:
[...]
> Did you test with a v6.3-rc based kernel?
> I encountered another problem on those kernels and had to temporarily use
> a v6.2 based kernel, maybe you have to do the same:
> https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
No, I'm also on v6.2.
>> Is your postgres/sysbench running in a cgroup with cpu controller
>> attached? Mine isn't.
>
> Yes, I had postgres and sysbench running in the same cgroup with cpu
> controller enabled. docker created the cgroup directory under
> /sys/fs/cgroup/system.slice/docker-XXX and cgroup.controllers has cpu
> there.
I'm running postgresql service directly on the machine. I boot now with
'cgroup_no_v1=all systemd.unified_cgroup_hierarchy=1' so I can add the
cpu controller to:
system.slice/system-postgresql.slice/[email protected]
where the 96 postgres threads run and to
user.slice/user-1005.slice/session-4.scope
where the 96 sysbench threads run.
Checked with systemd-cgls and `cat /sys/kernel/debug/sched/debug` that
those threads are really running there.
Still not seeing `update_load_avg` or `update_cfs_group` in perf report,
only some very low values for `update_blocked_averages`.
Also added CFS BW throttling to both cgroups. No change.
Then I moved session-4.scope's shell into `[email protected]`
so that `postgres` and `sysbench` threads run in the same cgroup.
Didn't change much.
>> Maybe I'm doing something else differently?
>
> Maybe, you didn't mention how you started postgres, if you start it from
> the same session as sysbench and if autogroup is enabled, then all those
> tasks would be in the same autogroup taskgroup then it should have the
> same effect as my setup.
This should now be close to my setup, which runs `postgres` and `sysbench`
in `[email protected]`.
> Anyway, you can try the following steps to see if you can reproduce this
> problem on your Arm64 server:
>
> 1 docker pull postgres
> 2 sudo docker run --rm --name postgres-instance -e POSTGRES_PASSWORD=mypass -e POSTGRES_USER=sbtest -d postgres -c shared_buffers=80MB -c max_connections=250
> 3 go inside the container
> sudo docker exec -it $the_just_started_container_id bash
> 4 install sysbench inside container
> apt update and apt install sysbench
> 5 prepare
> root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua prepare
> 6 run
> root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua run
I would have to find time to learn how to set up docker on my machine
... But I use very similar values for the setup and sysbench test.
> Note that I used 224 threads where this problem is visible. I also tried
> 96 and update_cfs_group() and update_load_avg() cost about 1% cycles then.
True, I was hoping to see at least the 1% ;-)
On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> On 28/03/2023 14:56, Aaron Lu wrote:
> > Hi Dietmar,
> >
> > Thanks for taking a look.
> >
> > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> >> On 27/03/2023 07:39, Aaron Lu wrote:
>
> [...]
>
> > Did you test with a v6.3-rc based kernel?
> > I encountered another problem on those kernels and had to temporarily use
> > a v6.2 based kernel, maybe you have to do the same:
> > https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
>
> No, I'm also on v6.2.
>
> >> Is your postgres/sysbench running in a cgroup with cpu controller
> >> attached? Mine isn't.
> >
> > Yes, I had postgres and sysbench running in the same cgroup with cpu
> > controller enabled. docker created the cgroup directory under
> > /sys/fs/cgroup/system.slice/docker-XXX and cgroup.controllers has cpu
> > there.
>
> I'm running postgresql service directly on the machine. I boot now with
> 'cgroup_no_v1=all systemd.unified_cgroup_hierarchy=1' so I can add the
> cpu controller to:
>
> system.slice/system-postgresql.slice/[email protected]
>
> where the 96 postgres threads run and to
>
> user.slice/user-1005.slice/session-4.scope
>
> where the 96 sysbench threads run.
>
> Checked with systemd-cgls and `cat /sys/kernel/debug/sched/debug` that
> those threads are really running there.
>
> Still not seeing `update_load_avg` or `update_cfs_group` in perf report,
> only some very low values for `update_blocked_averages`.
>
> Also added CFS BW throttling to both cgroups. No change.
>
> Then I moved session-4.scope's shell into `[email protected]`
> so that `postgres` and `sysbench` threads run in the same cgroup.
>
> Didn't change much.
>
> >> Maybe I'm doing something else differently?
> >
> > Maybe, you didn't mention how you started postgres, if you start it from
> > the same session as sysbench and if autogroup is enabled, then all those
> > tasks would be in the same autogroup taskgroup then it should have the
> > same effect as my setup.
>
> This should be now close to my setup running `postgres` and `sysbench`
> in `[email protected]`.
Yes.
>
> > Anyway, you can try the following steps to see if you can reproduce this
> > problem on your Arm64 server:
> >
> > 1 docker pull postgres
> > 2 sudo docker run --rm --name postgres-instance -e POSTGRES_PASSWORD=mypass -e POSTGRES_USER=sbtest -d postgres -c shared_buffers=80MB -c max_connections=250
> > 3 go inside the container
> > sudo docker exec -it $the_just_started_container_id bash
> > 4 install sysbench inside container
> > apt update and apt install sysbench
> > 5 prepare
> > root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua prepare
> > 6 run
> > root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua run
>
> I would have to find time to learn how to set up docker on my machine
> ... But I use very similar values for the setup and sysbench test.
Agree. And docker just made running this workload easier but since you
already grouped all tasks in the same taskgroup, there is no need to
mess with docker.
>
> > Note that I used 224 threads where this problem is visible. I also tried
> > 96 and update_cfs_group() and update_load_avg() cost about 1% cycles then.
>
> True, I was hopping to see at least the 1% ;-)
One more question: when you do 'perf report', did you use
--sort=dso,symbol to aggregate different paths of the same target? Maybe
you have already done this, just want to confirm :-)
And not sure if you did the profile on different nodes? I normally choose
4 cpus of each node and do 'perf record -C' with them, to get an idea
of how the different nodes behave and also to reduce the record size.
Normally, when tg is allocated on node 0, then node 1's profile would
show higher cycles for update_cfs_group() and update_load_avg().
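(On the machine described later in this thread, that would be something like
'perf record -C 0-3,56-59 -- sleep 5', i.e. 4 cpus picked from each node; the
exact cpu list is of course machine specific.)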
Another thing worth mentioning about this workload is, it has a lot of
wakeups and migrations during the initial 2 minutes or so, and the large
number of migrations is the reason for the increased cost of update_cfs_group()
and update_load_avg(). On my side, with sysbench's nr_thread=224, the
wakeups and migration numbers during a 5s window are (recorded about
1 minute after the workload is started):
@migrations[1]: 1821379
@migrations[0]: 4482989
@wakeups[1]: 3036473
@wakeups[0]: 6504496
The above number is derived from below script:
kretfunc:select_task_rq_fair
{
	/* count select_task_rq_fair() calls, keyed by the NUMA node of the waking cpu */
	@wakeups[numaid] = count();

	/* chosen cpu differs from the task's current cpu: count it as a migration */
	if (args->p->thread_info.cpu != retval) {
		@migrations[numaid] = count();
	}
}

interval:s:5
{
	/* stop after a 5s window */
	exit();
}
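(The script can be saved to a file and run directly with bpftrace; the
kretfunc probe requires a kernel built with BTF, i.e. CONFIG_DEBUG_INFO_BTF=y.)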
And during this time window, node1's profile shows update_cfs_group()'s
cycle percent is 12.45% and update_load_avg() is 7.99%.
I guess your setup may have a much lower migration number?
On 2023-03-29 at 14:36:44 +0200, Dietmar Eggemann wrote:
> On 28/03/2023 14:56, Aaron Lu wrote:
> > Hi Dietmar,
> >
> > Thanks for taking a look.
> >
> > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> >> On 27/03/2023 07:39, Aaron Lu wrote:
>
> [...]
>
> > Did you test with a v6.3-rc based kernel?
> > I encountered another problem on those kernels and had to temporarily use
> > a v6.2 based kernel, maybe you have to do the same:
> > https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
>
> No, I'm also on v6.2.
>
> >> Is your postgres/sysbench running in a cgroup with cpu controller
> >> attached? Mine isn't.
> >
> > Yes, I had postgres and sysbench running in the same cgroup with cpu
> > controller enabled. docker created the cgroup directory under
> > /sys/fs/cgroup/system.slice/docker-XXX and cgroup.controllers has cpu
> > there.
>
> I'm running postgresql service directly on the machine. I boot now with
> 'cgroup_no_v1=all systemd.unified_cgroup_hierarchy=1' so I can add the
> cpu controller to:
>
> system.slice/system-postgresql.slice/[email protected]
>
> where the 96 postgres threads run and to
>
> user.slice/user-1005.slice/session-4.scope
>
> where the 96 sysbench threads run.
>
> Checked with systemd-cgls and `cat /sys/kernel/debug/sched/debug` that
> those threads are really running there.
>
> Still not seeing `update_load_avg` or `update_cfs_group` in perf report,
> only some very low values for `update_blocked_averages`.
>
> Also added CFS BW throttling to both cgroups. No change.
>
> Then I moved session-4.scope's shell into `[email protected]`
> so that `postgres` and `sysbench` threads run in the same cgroup.
>
> Didn't change much.
>
> >> Maybe I'm doing something else differently?
> >
> > Maybe, you didn't mention how you started postgres, if you start it from
> > the same session as sysbench and if autogroup is enabled, then all those
> > tasks would be in the same autogroup taskgroup then it should have the
> > same effect as my setup.
>
> This should be now close to my setup running `postgres` and `sysbench`
> in `[email protected]`.
>
> > Anyway, you can try the following steps to see if you can reproduce this
> > problem on your Arm64 server:
> >
> > 1 docker pull postgres
> > 2 sudo docker run --rm --name postgres-instance -e POSTGRES_PASSWORD=mypass -e POSTGRES_USER=sbtest -d postgres -c shared_buffers=80MB -c max_connections=250
> > 3 go inside the container
> > sudo docker exec -it $the_just_started_container_id bash
> > 4 install sysbench inside container
> > apt update and apt install sysbench
> > 5 prepare
> > root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua prepare
> > 6 run
> > root@container:/# sysbench --db-driver=pgsql --pgsql-user=sbtest --pgsql_password=mypass --pgsql-db=sbtest --pgsql-port=5432 --tables=16 --table-size=10000 --threads=224 --time=60 --report-interval=2 /usr/share/sysbench/oltp_read_only.lua run
>
> I would have to find time to learn how to set up docker on my machine
> ... But I use very similar values for the setup and sysbench test.
>
> > Note that I used 224 threads where this problem is visible. I also tried
> > 96 and update_cfs_group() and update_load_avg() cost about 1% cycles then.
>
> True, I was hopping to see at least the 1% ;-)
According to Aaron's description, the relatively high cost of update_load_avg() was
caused by cross-node access. If the task group is allocated on node0, but some tasks
in this task group are load balanced to node1, the issue could be triggered
more easily? It might also be worth trying with NUMA balancing disabled:
echo 0 > /sys/kernel/debug/sched/numa_balancing
thanks,
Chenyu
Hi Aaron,
On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
> On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> > On 28/03/2023 14:56, Aaron Lu wrote:
> > > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> > >> On 27/03/2023 07:39, Aaron Lu wrote:
> And not sure if you did the profile on different nodes? I normally chose
> 4 cpus of each node and do 'perf record -C' with them, to get an idea
> of how different node behaves and also to reduce the record size.
> Normally, when tg is allocated on node 0, then node 1's profile would
> show higher cycles for update_cfs_group() and update_load_avg().
Wouldn't the choice of CPUs have a big effect on the data, depending on
where sysbench or postgres tasks run?
> I guess your setup may have a much lower migration number?
I also tried this and sure enough didn't see as many migrations on
either of two systems. I used a container with your steps with a plain
6.2 kernel underneath, and the cpu controller is on (weight only). I
increased connections and buffer size to suit each machine, and took
Chen's suggestion to try without numa balancing.
AMD EPYC 7J13 64-Core Processor
2 sockets * 64 cores * 2 threads = 256 CPUs
sysbench: nr_threads=256
All observability data was taken at one minute in and using one tool at
a time.
@migrations[1]: 1113
@migrations[0]: 6152
@wakeups[1]: 8871744
@wakeups[0]: 9773321
# profiled the whole system for 5 seconds, reported w/ --sort=dso,symbol
0.38% update_load_avg
0.13% update_cfs_group
Using higher (nr_threads=380) and lower (nr_threads=128) load doesn't
change these numbers much.
The topology of my machine is different from yours, but it's the biggest
I have, and I'm assuming cpu count is more important than topology when
reproducing the remote accesses. I also tried on
Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
2 sockets * 32 cores * 2 thread = 128 CPUs
with nr_threads=128 and got similar results.
I'm guessing you've left all sched knobs alone? Maybe sharing those and
the kconfig would help close the gap. Migrations do increase to near
what you were seeing when I disable SIS_UTIL (with SIS_PROP already off)
on the Xeon, and I see 4-5% apiece for the functions you mention when
profiling, but turning SIS_UTIL off is an odd thing to do.
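(For reference, sched features such as SIS_UTIL and SIS_PROP can be toggled at
runtime through /sys/kernel/debug/sched/features, e.g. by writing NO_SIS_UTIL
there, assuming CONFIG_SCHED_DEBUG is enabled.)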
On Thu, Mar 30, 2023 at 01:46:02PM -0400, Daniel Jordan wrote:
> Hi Aaron,
>
> On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
> > On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> > > On 28/03/2023 14:56, Aaron Lu wrote:
> > > > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> > > >> On 27/03/2023 07:39, Aaron Lu wrote:
> > And not sure if you did the profile on different nodes? I normally chose
> > 4 cpus of each node and do 'perf record -C' with them, to get an idea
> > of how different node behaves and also to reduce the record size.
> > Normally, when tg is allocated on node 0, then node 1's profile would
> > show higher cycles for update_cfs_group() and update_load_avg().
>
> Wouldn't the choice of CPUs have a big effect on the data, depending on
> where sysbench or postgres tasks run?
Oh, probably not with NCPU threads though, since the load would be
pretty even, so I think I see where you're coming from.
> > I guess your setup may have a much lower migration number?
>
> I also tried this and sure enough didn't see as many migrations on
> either of two systems. I used a container with your steps with a plain
> 6.2 kernel underneath, and the cpu controller is on (weight only). I
> increased connections and buffer size to suit each machine, and took
> Chen's suggestion to try without numa balancing.
>
> AMD EPYC 7J13 64-Core Processor
> 2 sockets * 64 cores * 2 threads = 256 CPUs
>
> sysbench: nr_threads=256
>
> All observability data was taken at one minute in and using one tool at
> a time.
>
> @migrations[1]: 1113
> @migrations[0]: 6152
> @wakeups[1]: 8871744
> @wakeups[0]: 9773321
>
> # profiled the whole system for 5 seconds, reported w/ --sort=dso,symbol
> 0.38% update_load_avg
> 0.13% update_cfs_group
>
> Using higher (nr_threads=380) and lower (nr_threads=128) load doesn't
> change these numbers much.
>
> The topology of my machine is different from yours, but it's the biggest
> I have, and I'm assuming cpu count is more important than topology when
> reproducing the remote accesses. I also tried on
>
> Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
> 2 sockets * 32 cores * 2 thread = 128 CPUs
>
> with nr_threads=128 and got similar results.
>
> I'm guessing you've left all sched knobs alone? Maybe sharing those and
> the kconfig would help close the gap. Migrations do increase to near
> what you were seeing when I disable SIS_UTIL (with SIS_PROP already off)
> on the Xeon, and I see 4-5% apiece for the functions you mention when
> profiling, but turning SIS_UTIL off is an odd thing to do.
Hi Daniel,
Thanks for taking a look.
On Thu, Mar 30, 2023 at 03:51:57PM -0400, Daniel Jordan wrote:
> On Thu, Mar 30, 2023 at 01:46:02PM -0400, Daniel Jordan wrote:
> > Hi Aaron,
> >
> > On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
> > > On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> > > > On 28/03/2023 14:56, Aaron Lu wrote:
> > > > > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> > > > >> On 27/03/2023 07:39, Aaron Lu wrote:
> > > And not sure if you did the profile on different nodes? I normally chose
> > > 4 cpus of each node and do 'perf record -C' with them, to get an idea
> > > of how different node behaves and also to reduce the record size.
> > > Normally, when tg is allocated on node 0, then node 1's profile would
> > > show higher cycles for update_cfs_group() and update_load_avg().
> >
> > Wouldn't the choice of CPUs have a big effect on the data, depending on
> > where sysbench or postgres tasks run?
>
> Oh, probably not with NCPU threads though, since the load would be
> pretty even, so I think I see where you're coming from.
Yes, I expect the load to be pretty even within the same node so I didn't
do the full cpu record. I used to record only a single cpu on each node
to get a fast report time but settled on using 4 due to being paranoid :-)
>
> > > I guess your setup may have a much lower migration number?
> >
> > I also tried this and sure enough didn't see as many migrations on
> > either of two systems. I used a container with your steps with a plain
> > 6.2 kernel underneath, and the cpu controller is on (weight only). I
> > increased connections and buffer size to suit each machine, and took
> > Chen's suggestion to try without numa balancing.
I also tried disabling numa balancing per Chen's suggestion and I saw
slightly fewer task migrations at wakeup time for some runs, but it
didn't make things dramatically different here.
> >
> > AMD EPYC 7J13 64-Core Processor
> > 2 sockets * 64 cores * 2 threads = 256 CPUs
I have a vague memory that AMD machines have a smaller LLC and the number of
cpus belonging to the same LLC is also not large, 8-16?
I tend to think the cpu count of the LLC plays a role here since that's the
domain where an idle cpu is searched for at task wakeup time.
> >
> > sysbench: nr_threads=256
> >
> > All observability data was taken at one minute in and using one tool at
> > a time.
> >
> > @migrations[1]: 1113
> > @migrations[0]: 6152
> > @wakeups[1]: 8871744
> > @wakeups[0]: 9773321
What a nice number for migrations!
Of the 10 million wakeups, there are only several thousand migrations,
compared to 4-5 million on my side.
> >
> > # profiled the whole system for 5 seconds, reported w/ --sort=dso,symbol
> > 0.38% update_load_avg
> > 0.13% update_cfs_group
With such a small number of migrations, the above percentages are expected.
> >
> > Using higher (nr_threads=380) and lower (nr_threads=128) load doesn't
> > change these numbers much.
> >
> > The topology of my machine is different from yours, but it's the biggest
> > I have, and I'm assuming cpu count is more important than topology when
> > reproducing the remote accesses. I also tried on
> >
> > Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
> > 2 sockets * 32 cores * 2 thread = 128 CPUs
> >
> > with nr_threads=128 and got similar results.
> >
> > I'm guessing you've left all sched knobs alone? Maybe sharing those and
Yes I've left all knobs alone. The server I have access to has Ubuntu
22.04.1 installed and here are the values of these knobs:
root@a4bf01924c30:/sys/kernel/debug/sched# sysctl -a |grep sched
kernel.sched_autogroup_enabled = 1
kernel.sched_cfs_bandwidth_slice_us = 5000
kernel.sched_child_runs_first = 0
kernel.sched_deadline_period_max_us = 4194304
kernel.sched_deadline_period_min_us = 100
kernel.sched_energy_aware = 1
kernel.sched_rr_timeslice_ms = 100
kernel.sched_rt_period_us = 1000000
kernel.sched_rt_runtime_us = 950000
kernel.sched_schedstats = 0
kernel.sched_util_clamp_max = 1024
kernel.sched_util_clamp_min = 1024
kernel.sched_util_clamp_min_rt_default = 1024
root@a4bf01924c30:/sys/kernel/debug/sched# for i in `ls features *_ns *_ms preempt`; do echo "$i: `cat $i`"; done
features: GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_HRTICK_DL NO_DOUBLE_TICK NONTASK_CAPACITY TTWU_QUEUE NO_SIS_PROP SIS_UTIL NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI NO_RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS UTIL_EST UTIL_EST_FASTUP NO_LATENCY_WARN ALT_PERIOD BASE_SLICE
idle_min_granularity_ns: 750000
latency_ns: 24000000
latency_warn_ms: 100
migration_cost_ns: 500000
min_granularity_ns: 3000000
preempt: none (voluntary) full
wakeup_granularity_ns: 4000000
> > the kconfig would help close the gap. Migrations do increase to near
> > what you were seeing when I disable SIS_UTIL (with SIS_PROP already off)
> > on the Xeon, and I see 4-5% apiece for the functions you mention when
> > profiling, but turning SIS_UTIL off is an odd thing to do.
As you can see from above, I didn't turn off SIS_UTIL.
And I attached the kconfig; it's basically what the distro provided except I
had to disable some configs related to module signing or something like
that.
On 31/03/2023 06:06, Aaron Lu wrote:
> Hi Daniel,
>
> Thanks for taking a look.
>
> On Thu, Mar 30, 2023 at 03:51:57PM -0400, Daniel Jordan wrote:
>> On Thu, Mar 30, 2023 at 01:46:02PM -0400, Daniel Jordan wrote:
>>> Hi Aaron,
>>>
>>> On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
>>>> On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
>>>>> On 28/03/2023 14:56, Aaron Lu wrote:
>>>>>> On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
>>>>>>> On 27/03/2023 07:39, Aaron Lu wrote:
[...]
>>> AMD EPYC 7J13 64-Core Processor
>>> 2 sockets * 64 cores * 2 threads = 256 CPUs
>
> I have a vague memory AMD machine has a smaller LLC and cpus belonging
> to the same LLC is also not many, 8-16?
>
> I tend to think cpu number of LLC play a role here since that's the
> domain where idle cpu is searched on task wake up time.
>
>>>
>>> sysbench: nr_threads=256
>>>
>>> All observability data was taken at one minute in and using one tool at
>>> a time.
>>>
>>> @migrations[1]: 1113
>>> @migrations[0]: 6152
>>> @wakeups[1]: 8871744
>>> @wakeups[0]: 9773321
Just a thought: Could the different behaviour come from different
CPU numbering schemes (consecutive/alternate)?
(1) My Arm server:
numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
(2) Intel(R) Xeon(R) Silver 4314
$ numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62
node 1 cpus: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63
[...]
On Fri, Mar 31, 2023 at 05:48:12PM +0200, Dietmar Eggemann wrote:
> On 31/03/2023 06:06, Aaron Lu wrote:
> > Hi Daniel,
> >
> > Thanks for taking a look.
> >
> > On Thu, Mar 30, 2023 at 03:51:57PM -0400, Daniel Jordan wrote:
> >> On Thu, Mar 30, 2023 at 01:46:02PM -0400, Daniel Jordan wrote:
> >>> Hi Aaron,
> >>>
> >>> On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
> >>>> On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> >>>>> On 28/03/2023 14:56, Aaron Lu wrote:
> >>>>>> On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> >>>>>>> On 27/03/2023 07:39, Aaron Lu wrote:
>
> [...]
>
> >>> AMD EPYC 7J13 64-Core Processor
> >>> 2 sockets * 64 cores * 2 threads = 256 CPUs
> >
> > I have a vague memory AMD machine has a smaller LLC and cpus belonging
> > to the same LLC is also not many, 8-16?
> >
> > I tend to think cpu number of LLC play a role here since that's the
> > domain where idle cpu is searched on task wake up time.
> >
> >>>
> >>> sysbench: nr_threads=256
> >>>
> >>> All observability data was taken at one minute in and using one tool at
> >>> a time.
> >>>
> >>> @migrations[1]: 1113
> >>> @migrations[0]: 6152
> >>> @wakeups[1]: 8871744
> >>> @wakeups[0]: 9773321
>
> Just a thought: Could the different behaviour come from different
> CPU numbering schemes (consecutive/alternate)?
Yeah they are indeed different, I also attached mine below. But I didn't
see a relationship between migration frequency and CPU numbering schemes,
maybe I missed something?
>
> (1) My Arm server:
>
> numactl -H
> available: 4 nodes (0-3)
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
> node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
> node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
> node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
>
>
> (2) Intel(R) Xeon(R) Silver 4314
>
> $ numactl -H
> available: 2 nodes (0-1)
> node 0 cpus: 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62
> node 1 cpus: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63
>
> [...]
Machine I'm testing on:
Intel (R) Xeon (R) CPU Max 9480
$ numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
node 0 size: 257686 MB
node 0 free: 251453 MB
node 1 cpus: 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
node 1 size: 258009 MB
node 1 free: 247905 MB
node distances:
node 0 1
0: 10 26
1: 26 10
On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
> When using sysbench to benchmark Postgres in a single docker instance
> with sysbench's nr_threads set to nr_cpu, it is observed there are times
> update_cfs_group() and update_load_avg() shows noticeable overhead on
> cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
>
> 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
>
> While cpus of the other node normally sees a lower cycle percent:
>
> 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
>
> Annotate shows the cycles are mostly spent on accessing tg->load_avg
> with update_load_avg() being the write side and update_cfs_group() being
> the read side.
>
> The reason why only cpus of one node has bigger overhead is: task_group
> is allocated on demand from a slab and whichever cpu happens to do the
> allocation, the allocated tg will be located on that node and accessing
> to tg->load_avg will have a lower cost for cpus on the same node and
> a higer cost for cpus of the remote node.
>
> Tim Chen told me that PeterZ once mentioned a way to solve a similar
> problem by making a counter per node so do the same for tg->load_avg.
> After this change, the worst number I saw during a 5 minutes run from
> both nodes are:
>
> 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
>
The same issue was found when running netperf on this platform.
According to the perf profile:
11.90% 11.84% swapper [kernel.kallsyms] [k] update_cfs_group
9.79% 9.43% swapper [kernel.kallsyms] [k] update_load_avg
these two functions took quite some cycles.
1. cpufreq governor set to performance, turbo disabled, C6 disabled
2. launches 224 instances of netperf, and each instance is:
netperf -4 -H 127.0.0.1 -t UDP_RR/TCP_RR -c -C -l 100 &
3. perf record -ag sleep 4
Also the test script could be downloaded via
https://github.com/yu-chen-surf/schedtests.git
thanks,
Chenyu
On Tue, Apr 04, 2023 at 04:25:04PM +0800, Chen Yu wrote:
> On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
> > When using sysbench to benchmark Postgres in a single docker instance
> > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> >
> > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> >
> > While cpus of the other node normally sees a lower cycle percent:
> >
> > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> >
> > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > with update_load_avg() being the write side and update_cfs_group() being
> > the read side.
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> >
> > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > problem by making a counter per node so do the same for tg->load_avg.
> > After this change, the worst number I saw during a 5 minutes run from
> > both nodes are:
> >
> > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
> >
> The same issue was found when running netperf on this platform.
> According to the perf profile:
Thanks for the info!
>
> 11.90% 11.84% swapper [kernel.kallsyms] [k] update_cfs_group
> 9.79% 9.43% swapper [kernel.kallsyms] [k] update_load_avg
>
> these two functions took quite some cycles.
>
> 1. cpufreq governor set to performance, turbo disabled, C6 disabled
I didn't make any changes to the above and then tried netperf as you
described below, using UDP_RR, and the cycle percent of update_cfs_group
is even worse on my SPR system:
v6.3-rc5:
update_cfs_group()%: 27.39% on node0, 31.18% on node1
wakeups[0]: 5623199
wakeups[1]: 7919937
migrations[0]: 3871773
migrations[1]: 5606894
v6.3-rc5 + this_patch:
update_cfs_group()%: 24.12% on node0, 26.15% on node1
wakeups[0]: 13575203
wakeups[1]: 10749893
migrations[0]: 9153060
migrations[1]: 7508095
This patch helps a little bit, but not much. Will take a closer look.
> 2. launches 224 instances of netperf, and each instance is:
> netperf -4 -H 127.0.0.1 -t UDP_RR/TCP_RR -c -C -l 100 &
> 3. perf record -ag sleep 4
>
> Also the test script could be downloaded via
> https://github.com/yu-chen-surf/schedtests.git
Thanks,
Aaron
On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
[...]
> Another observation of this workload is: it has a lot of wakeup time
> task migrations and that is the reason why update_load_avg() and
> update_cfs_group() shows noticeable cost. Running this workload in N
> instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> task migrations on wake up time are greatly reduced and the overhead from
> the two above mentioned functions also dropped a lot. It's not clear to
> me why running in multiple instances can reduce task migrations on
> wakeup path yet.
Regarding this observation, I have some findings. The TLDR is: the 1-instance
setup's overall CPU util is lower than that of the N >= 2 instance setups, and
as a result, under the 1-instance setup, sis() is more likely to find idle cpus
than under the N >= 2 instance setups, and that is the reason why the 1-instance
setup has more migrations.
More details:
For the 1-instance setup with nr_thread=nr_cpu=224, during a 5s window,
there are 10 million calls of select_idle_sibling() and 6.1 million
migrations. Of these migrations, 4.6 million come from select_idle_cpu()
and 1.3 million come from recent_cpu.
mpstat of this time window:
Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
Average: all 45.15 0.00 18.59 0.00 0.00 17.29 0.00 0.00 0.00 18.98
Average: 0 38.14 0.00 17.29 0.00 0.00 14.77 0.00 0.00 0.00 29.80
Average: 1 52.07 0.00 19.88 0.00 0.00 19.78 0.00 0.00 0.00 8.28
For the 4-instance setup with nr_thread=56, during a 5s window, there are 15
million calls of select_idle_sibling() and only 30k migrations.
select_idle_cpu() is called 15 million times but only 23k of those calls passed
the sd_share->nr_idle_scan != 0 test.
mpstat of this time window:
Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
Average: all 68.54 0.00 21.54 0.00 0.00 8.35 0.00 0.00 0.00 1.58
Average: 0 70.05 0.00 20.92 0.00 0.00 8.17 0.00 0.00 0.00 0.87
Average: 1 67.03 0.00 22.16 0.00 0.00 8.53 0.00 0.00 0.00 2.29
For the 8-instance setup with nr_thread=28, during a 5s window, there are
16 million calls of select_idle_sibling() and 9.6k migrations.
select_idle_cpu() is called 16 million times but none of them passed the
sd_share->nr_idle_scan != 0 test.
mpstat of this time window:
Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
Average: all 70.29 0.00 20.99 0.00 0.00 8.28 0.00 0.00 0.00 0.43
Average: 0 71.58 0.00 19.98 0.00 0.00 8.04 0.00 0.00 0.00 0.40
Average: 1 69.00 0.00 22.01 0.00 0.00 8.52 0.00 0.00 0.00 0.47
On a side note: when sd_share->nr_idle_scan > 0 and has_idle_core is true,
then sd_share->nr_idle_scan is not actually respected. Is this intended?
It seems to say: if there is idle core, then let's try hard and ignore
SIS_UTIL to find that idle core, right?
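For reference, this is roughly how I read the relevant part of
select_idle_cpu() (heavily trimmed, so treat it as a sketch rather than the
exact code): the nr budget derived from nr_idle_scan is only consumed in the
!has_idle_core branch:

	if (sched_feat(SIS_UTIL)) {
		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
		if (sd_share) {
			/* because !--nr is the condition to stop scan */
			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
			if (nr == 1)
				return -1;
		}
	}

	for_each_cpu_wrap(cpu, cpus, target + 1) {
		if (has_idle_core) {
			/*
			 * nr is never decremented here, so the SIS_UTIL
			 * limit does not apply to the idle core scan.
			 */
			i = select_idle_core(p, cpu, cpus, &idle_cpu);
			if ((unsigned int)i < nr_cpumask_bits)
				return i;
		} else {
			if (!--nr)
				return -1;
			idle_cpu = __select_idle_cpu(cpu, p);
			if ((unsigned int)idle_cpu < nr_cpumask_bits)
				break;
		}
	}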
On 2023-04-04 at 23:15:40 +0800, Aaron Lu wrote:
> On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> [...]
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
>
> Regarding this observation, I've some finding. The TLDR is: 1 instance
> setup's overall CPU util is lower than N >= 2 instances setup and as a
> result, under 1 instance setup, sis() is more likely to find idle cpus
> than N >= 2 instances setup and that is the reason why 1 instance setup
> has more migrations.
>
> More details:
>
> For 1 instance with nr_thread=nr_cpu=224 setup, during a 5s window,
> there are 10 million calls of select_idle_sibling() and 6.1 million
> migrations. Of these migrations, 4.6 million comes from select_idle_cpu(),
> 1.3 million comes from recent_cpu.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 45.15 0.00 18.59 0.00 0.00 17.29 0.00 0.00 0.00 18.98
> Average: 0 38.14 0.00 17.29 0.00 0.00 14.77 0.00 0.00 0.00 29.80
> Average: 1 52.07 0.00 19.88 0.00 0.00 19.78 0.00 0.00 0.00 8.28
>
>
> For 4 instance with nr_thread=56 setup, during a 5s window, there are 15
> million calls of select_idle_sibling() and only 30k migrations.
> select_idle_cpu() is called 15 million times but only 23k of them passed
> the sd_share->nr_idle_scan != 0 test.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 68.54 0.00 21.54 0.00 0.00 8.35 0.00 0.00 0.00 1.58
> Average: 0 70.05 0.00 20.92 0.00 0.00 8.17 0.00 0.00 0.00 0.87
> Average: 1 67.03 0.00 22.16 0.00 0.00 8.53 0.00 0.00 0.00 2.29
>
> For 8 instance with nr_thread=28 setup, during a 5s window, there are
> 16 million calls of select_idle_sibling() and 9.6k migrations.
> select_idle_cpu() is called 16 million times but none of them passed the
> sd_share->nr_idle_scan != 0 test.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 70.29 0.00 20.99 0.00 0.00 8.28 0.00 0.00 0.00 0.43
> Average: 0 71.58 0.00 19.98 0.00 0.00 8.04 0.00 0.00 0.00 0.40
> Average: 1 69.00 0.00 22.01 0.00 0.00 8.52 0.00 0.00 0.00 0.47
>
> On a side note: when sd_share->nr_idle_scan > 0 and has_idle_core is true,
> then sd_share->nr_idle_scan is not actually respected. Is this intended?
> It seems to say: if there is idle core, then let's try hard and ignore
> SIS_UTIL to find that idle core, right?
Yes, SIS_UTIL inherits the logic of SIS_PROP, which honors has_idle_core and
scans at any cost. Abel previously proposed a patch to make this more aggressive
by not allowing SIS_UTIL to take effect even when the system is overloaded.
https://lore.kernel.org/lkml/[email protected]/
thanks,
Chenyu
On Fri, Mar 31, 2023 at 12:06:09PM +0800, Aaron Lu wrote:
> Hi Daniel,
>
> Thanks for taking a look.
>
> On Thu, Mar 30, 2023 at 03:51:57PM -0400, Daniel Jordan wrote:
> > On Thu, Mar 30, 2023 at 01:46:02PM -0400, Daniel Jordan wrote:
> > > Hi Aaron,
> > >
> > > On Wed, Mar 29, 2023 at 09:54:55PM +0800, Aaron Lu wrote:
> > > > On Wed, Mar 29, 2023 at 02:36:44PM +0200, Dietmar Eggemann wrote:
> > > > > On 28/03/2023 14:56, Aaron Lu wrote:
> > > > > > On Tue, Mar 28, 2023 at 02:09:39PM +0200, Dietmar Eggemann wrote:
> > > > > >> On 27/03/2023 07:39, Aaron Lu wrote:
> > > > And not sure if you did the profile on different nodes? I normally chose
> > > > 4 cpus of each node and do 'perf record -C' with them, to get an idea
> > > > of how different node behaves and also to reduce the record size.
> > > > Normally, when tg is allocated on node 0, then node 1's profile would
> > > > show higher cycles for update_cfs_group() and update_load_avg().
> > >
> > > Wouldn't the choice of CPUs have a big effect on the data, depending on
> > > where sysbench or postgres tasks run?
> >
> > Oh, probably not with NCPU threads though, since the load would be
> > pretty even, so I think I see where you're coming from.
>
> Yes I expect the load to be pretty even within the same node so didn't
> do the full cpu record. I used to only record a single cpu on each node
> to get a fast report time but settled on using 4 due to being paranoid :-)
Mhm :-) My 4-cpu profiles do look about the same as my all-system one.
> I have a vague memory AMD machine has a smaller LLC and cpus belonging
> to the same LLC is also not many, 8-16?
Yep, 16 cpus in every one. It's a 32M LLC.
> I tend to think cpu number of LLC play a role here since that's the
> domain where idle cpu is searched on task wake up time.
That's true, I hadn't thought of that.
> > > I'm guessing you've left all sched knobs alone? Maybe sharing those and
>
> Yes I've left all knobs alone. The server I have access to has Ubuntu
> 22.04.1 installed and here are the values of these knobs:
> root@a4bf01924c30:/sys/kernel/debug/sched# sysctl -a |grep sched
> kernel.sched_autogroup_enabled = 1
> kernel.sched_cfs_bandwidth_slice_us = 5000
> kernel.sched_child_runs_first = 0
> kernel.sched_deadline_period_max_us = 4194304
> kernel.sched_deadline_period_min_us = 100
> kernel.sched_energy_aware = 1
> kernel.sched_rr_timeslice_ms = 100
> kernel.sched_rt_period_us = 1000000
> kernel.sched_rt_runtime_us = 950000
> kernel.sched_schedstats = 0
> kernel.sched_util_clamp_max = 1024
> kernel.sched_util_clamp_min = 1024
> kernel.sched_util_clamp_min_rt_default = 1024
>
> root@a4bf01924c30:/sys/kernel/debug/sched# for i in `ls features *_ns *_ms preempt`; do echo "$i: `cat $i`"; done
> features: GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION NO_HRTICK NO_HRTICK_DL NO_DOUBLE_TICK NONTASK_CAPACITY TTWU_QUEUE NO_SIS_PROP SIS_UTIL NO_WARN_DOUBLE_CLOCK RT_PUSH_IPI NO_RT_RUNTIME_SHARE NO_LB_MIN ATTACH_AGE_LOAD WA_IDLE WA_WEIGHT WA_BIAS UTIL_EST UTIL_EST_FASTUP NO_LATENCY_WARN ALT_PERIOD BASE_SLICE
> idle_min_granularity_ns: 750000
> latency_ns: 24000000
> latency_warn_ms: 100
> migration_cost_ns: 500000
> min_granularity_ns: 3000000
> preempt: none (voluntary) full
> wakeup_granularity_ns: 4000000
Right, figures, all the same on my machines.
> And attached kconfig, it's basically what the distro provided except I
> had to disable some configs related to module sign or something like
> that.
Thanks for all the info. I got the same low perf percentages using your
kconfig as I got before (<0.50% for both functions), so maybe this just
takes a big machine with big LLCs, which sadly I haven't got.
On Tue, Apr 04, 2023 at 11:15:40PM +0800, Aaron Lu wrote:
> On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> [...]
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
>
> Regarding this observation, I've some finding. The TLDR is: 1 instance
> setup's overall CPU util is lower than N >= 2 instances setup and as a
> result, under 1 instance setup, sis() is more likely to find idle cpus
> than N >= 2 instances setup and that is the reason why 1 instance setup
> has more migrations.
>
> More details:
>
> For 1 instance with nr_thread=nr_cpu=224 setup, during a 5s window,
> there are 10 million calls of select_idle_sibling() and 6.1 million
> migrations. Of these migrations, 4.6 million comes from select_idle_cpu(),
> 1.3 million comes from recent_cpu.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 45.15 0.00 18.59 0.00 0.00 17.29 0.00 0.00 0.00 18.98
> Average: 0 38.14 0.00 17.29 0.00 0.00 14.77 0.00 0.00 0.00 29.80
> Average: 1 52.07 0.00 19.88 0.00 0.00 19.78 0.00 0.00 0.00 8.28
Aha. It takes one instance of nr_thread=(3/4)*nr_cpu to get this
overall utilization on my aforementioned Xeon, but then I see 3-4% on
both functions in the profile. I'll poke at it some more, see how bad
it hurts over more loads, might take a bit though.
> For 4 instance with nr_thread=56 setup, during a 5s window, there are 15
> million calls of select_idle_sibling() and only 30k migrations.
> select_idle_cpu() is called 15 million times but only 23k of them passed
> the sd_share->nr_idle_scan != 0 test.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 68.54 0.00 21.54 0.00 0.00 8.35 0.00 0.00 0.00 1.58
> Average: 0 70.05 0.00 20.92 0.00 0.00 8.17 0.00 0.00 0.00 0.87
> Average: 1 67.03 0.00 22.16 0.00 0.00 8.53 0.00 0.00 0.00 2.29
>
> For 8 instance with nr_thread=28 setup, during a 5s window, there are
> 16 million calls of select_idle_sibling() and 9.6k migrations.
> select_idle_cpu() is called 16 million times but none of them passed the
> sd_share->nr_idle_scan != 0 test.
> mpstat of this time window:
> Average: NODE %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle
> Average: all 70.29 0.00 20.99 0.00 0.00 8.28 0.00 0.00 0.00 0.43
> Average: 0 71.58 0.00 19.98 0.00 0.00 8.04 0.00 0.00 0.00 0.40
> Average: 1 69.00 0.00 22.01 0.00 0.00 8.52 0.00 0.00 0.00 0.47
>
> On a side note: when sd_share->nr_idle_scan > 0 and has_idle_core is true,
> then sd_share->nr_idle_scan is not actually respected. Is this intended?
> It seems to say: if there is idle core, then let's try hard and ignore
> SIS_UTIL to find that idle core, right?
On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> When using sysbench to benchmark Postgres in a single docker instance
> with sysbench's nr_threads set to nr_cpu, it is observed there are times
> update_cfs_group() and update_load_avg() shows noticeable overhead on
> cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
>
> 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
>
> While cpus of the other node normally sees a lower cycle percent:
>
> 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
>
> Annotate shows the cycles are mostly spent on accessing tg->load_avg
> with update_load_avg() being the write side and update_cfs_group() being
> the read side.
>
> The reason why only cpus of one node has bigger overhead is: task_group
> is allocated on demand from a slab and whichever cpu happens to do the
> allocation, the allocated tg will be located on that node and accessing
> to tg->load_avg will have a lower cost for cpus on the same node and
> a higer cost for cpus of the remote node.
>
> Tim Chen told me that PeterZ once mentioned a way to solve a similar
> problem by making a counter per node so do the same for tg->load_avg.
Yeah, I sent him a very similar patch (except horrible) some 5 years ago
for testing.
> After this change, the worst number I saw during a 5 minutes run from
> both nodes are:
>
> 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
Nice!
> Another observation of this workload is: it has a lot of wakeup time
> task migrations and that is the reason why update_load_avg() and
> update_cfs_group() shows noticeable cost. Running this workload in N
> instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> task migrations on wake up time are greatly reduced and the overhead from
> the two above mentioned functions also dropped a lot. It's not clear to
> me why running in multiple instances can reduce task migrations on
> wakeup path yet.
If there is *any* idle time, we're rather aggressive at moving tasks to
idle CPUs in an attempt to avoid said idle time. If you're running at
about the number of CPUs there will be a fair amount of idle time and
hence significant migrations.
When you overload, there will no longer be idle time and hence no more
migrations.
> Reported-by: Nitin Tekchandani <[email protected]>
> Signed-off-by: Aaron Lu <[email protected]>
If you want to make things more complicated you can check
num_possible_nodes()==1 on boot and then avoid the indirection, but
On Thu, Mar 30, 2023 at 01:45:57PM -0400, Daniel Jordan wrote:
> The topology of my machine is different from yours, but it's the biggest
> I have, and I'm assuming cpu count is more important than topology when
> reproducing the remote accesses. I also tried on
Core count definitely matters some, but the thing that really hurts is
the cross-node (and cross-cache, which for intel happens to be the same
set) atomics.
I suppose the thing to measure is where this cost rises most sharply on
the AMD platforms -- is that cross LLC or cross Node?
I mean, setting up the split at boot time is fairly straightforward and
we could equally well split at LLC.
On Wed, Apr 12, 2023 at 01:59:36PM +0200, Peter Zijlstra wrote:
> On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> > When using sysbench to benchmark Postgres in a single docker instance
> > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> >
> > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> >
> > While cpus of the other node normally sees a lower cycle percent:
> >
> > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> >
> > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > with update_load_avg() being the write side and update_cfs_group() being
> > the read side.
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> >
> > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > problem by making a counter per node so do the same for tg->load_avg.
>
> Yeah, I send him a very similar patch (except horrible) some 5 years ago
> for testing.
>
> > After this change, the worst number I saw during a 5 minutes run from
> > both nodes are:
> >
> > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
>
> Nice!
>
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
>
> If there is *any* idle time, we're rather agressive at moving tasks to
> idle CPUs in an attempt to avoid said idle time. If you're running at
> about the number of CPUs there will be a fair amount of idle time and
> hence significant migrations.
>
> When you overload, there will no longer be idle time and hence no more
> migrations.
>
> > Reported-by: Nitin Tekchandani <[email protected]>
> > Signed-off-by: Aaron Lu <[email protected]>
>
> If you want to make things more complicated you can check
> num_possible_nodes()==1 on boot and then avoid the indirection, but
... finishing emails is hard :-)
I think I meant to say we should check if there's measurable overhead on
single-node systems before we go overboard or somesuch.
On Wed, Apr 12, 2023 at 01:59:36PM +0200, Peter Zijlstra wrote:
> On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> > When using sysbench to benchmark Postgres in a single docker instance
> > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> >
> > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> >
> > While cpus of the other node normally sees a lower cycle percent:
> >
> > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> >
> > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > with update_load_avg() being the write side and update_cfs_group() being
> > the read side.
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> >
> > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > problem by making a counter per node so do the same for tg->load_avg.
>
> Yeah, I send him a very similar patch (except horrible) some 5 years ago
> for testing.
>
> > After this change, the worst number I saw during a 5 minutes run from
> > both nodes are:
> >
> > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
>
> Nice!
:-)
> > Another observation of this workload is: it has a lot of wakeup time
> > task migrations and that is the reason why update_load_avg() and
> > update_cfs_group() shows noticeable cost. Running this workload in N
> > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > task migrations on wake up time are greatly reduced and the overhead from
> > the two above mentioned functions also dropped a lot. It's not clear to
> > me why running in multiple instances can reduce task migrations on
> > wakeup path yet.
>
> If there is *any* idle time, we're rather agressive at moving tasks to
> idle CPUs in an attempt to avoid said idle time. If you're running at
> about the number of CPUs there will be a fair amount of idle time and
> hence significant migrations.
Yes indeed.
> When you overload, there will no longer be idle time and hence no more
> migrations.
True. My later profile showed the multi-instance case has much lower
idle time compared to 1 instance setup, 0.4%-2% vs ~20%, and thus much
fewer migrations on wakeup, thousands vs millions in a 5s window.
> > Reported-by: Nitin Tekchandani <[email protected]>
> > Signed-off-by: Aaron Lu <[email protected]>
>
> If you want to make things more complicated you can check
> num_possible_nodes()==1 on boot and then avoid the indirection, but
Ah right, will think about how to achieve this.
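Maybe something like the below (just a sketch of that direction, untested,
and the key name is made up):

/* decide once at boot whether the per-node indirection is needed at all */
DEFINE_STATIC_KEY_FALSE(tg_load_avg_per_node);

static int __init tg_load_avg_per_node_init(void)
{
	if (num_possible_nodes() > 1)
		static_branch_enable(&tg_load_avg_per_node);
	return 0;
}
early_initcall(tg_load_avg_per_node_init);

Readers and writers could then use static_branch_unlikely() to pick either
the single tg->load_avg or the per-node counters.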
Thanks for your comments.
On Wed, Apr 12, 2023 at 03:58:28PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 12, 2023 at 01:59:36PM +0200, Peter Zijlstra wrote:
> > On Mon, Mar 27, 2023 at 01:39:55PM +0800, Aaron Lu wrote:
> > > When using sysbench to benchmark Postgres in a single docker instance
> > > with sysbench's nr_threads set to nr_cpu, it is observed there are times
> > > update_cfs_group() and update_load_avg() shows noticeable overhead on
> > > cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:
> > >
> > > 10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
> > > 7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
> > >
> > > While cpus of the other node normally sees a lower cycle percent:
> > >
> > > 4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
> > > 4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
> > >
> > > Annotate shows the cycles are mostly spent on accessing tg->load_avg
> > > with update_load_avg() being the write side and update_cfs_group() being
> > > the read side.
> > >
> > > The reason why only cpus of one node has bigger overhead is: task_group
> > > is allocated on demand from a slab and whichever cpu happens to do the
> > > allocation, the allocated tg will be located on that node and accessing
> > > to tg->load_avg will have a lower cost for cpus on the same node and
> > > a higer cost for cpus of the remote node.
> > >
> > > Tim Chen told me that PeterZ once mentioned a way to solve a similar
> > > problem by making a counter per node so do the same for tg->load_avg.
> >
> > Yeah, I send him a very similar patch (except horrible) some 5 years ago
> > for testing.
> >
> > > After this change, the worst number I saw during a 5 minutes run from
> > > both nodes are:
> > >
> > > 2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
> > > 2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
> >
> > Nice!
> >
> > > Another observation of this workload is: it has a lot of wakeup time
> > > task migrations and that is the reason why update_load_avg() and
> > > update_cfs_group() shows noticeable cost. Running this workload in N
> > > instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
> > > task migrations on wake up time are greatly reduced and the overhead from
> > > the two above mentioned functions also dropped a lot. It's not clear to
> > > me why running in multiple instances can reduce task migrations on
> > > wakeup path yet.
> >
> > If there is *any* idle time, we're rather agressive at moving tasks to
> > idle CPUs in an attempt to avoid said idle time. If you're running at
> > about the number of CPUs there will be a fair amount of idle time and
> > hence significant migrations.
> >
> > When you overload, there will no longer be idle time and hence no more
> > migrations.
> >
> > > Reported-by: Nitin Tekchandani <[email protected]>
> > > Signed-off-by: Aaron Lu <[email protected]>
> >
> > If you want to make things more complicated you can check
> > num_possible_nodes()==1 on boot and then avoid the indirection, but
>
> ... finishing emails is hard :-)
>
> I think I meant to say we should check if there's measurable overhead on
> single-node systems before we go overboard or somesuch.
Got it, hopefully there is no measurable overhead :-)
On Wed, Apr 12, 2023 at 02:07:36PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 30, 2023 at 01:45:57PM -0400, Daniel Jordan wrote:
>
> > The topology of my machine is different from yours, but it's the biggest
> > I have, and I'm assuming cpu count is more important than topology when
> > reproducing the remote accesses. I also tried on
>
> Core count definitely matters some, but the thing that really hurts is
> the cross-node (and cross-cache, which for intel happens to be the same
> set) atomics.
>
> I suppose the thing to measure is where this cost rises most sharply on
> the AMD platforms -- is that cross LLC or cross Node?
>
> I mean, setting up the split at boot time is fairly straight forward and
> we could equally well split at LLC.
To check the cross LLC case, I bound all postgres and sysbench tasks to
a node. The two functions aren't free then on either AMD or Intel,
multiple LLCs or not, but the pain is a bit greater in the cross node
(unbound) case.
The read side (update_cfs_group) gets more expensive with per-node tg
load_avg on AMD, especially cross node--those are the biggest diffs.
These are more containerized sysbench runs, just the same as before.
Base is 6.2, test is 6.2 plus this RFC. Each number under base or test
is the average over ten runs of the profile percent of the function
measured for 5 seconds, 60 seconds into the run. I ran the experiment a
second time, and the numbers were fairly similar to what's below.
AMD EPYC 7J13 64-Core Processor (NPS1)
2 sockets * 64 cores * 2 threads = 256 CPUs
update_load_avg profile% update_cfs_group profile%
affinity nr_threads base test diff base test diff
unbound 96 0.7 0.6 -0.1 0.3 0.6 0.4
unbound 128 0.8 0.7 0.0 0.3 0.7 0.4
unbound 160 2.4 1.7 -0.7 1.2 2.3 1.1
unbound 192 2.3 1.7 -0.6 0.9 2.4 1.5
unbound 224 0.9 0.9 0.0 0.3 0.6 0.3
unbound 256 0.4 0.4 0.0 0.1 0.2 0.1
node0 48 0.7 0.6 -0.1 0.3 0.6 0.3
node0 64 0.7 0.7 -0.1 0.3 0.6 0.3
node0 80 1.4 1.3 -0.1 0.3 0.6 0.3
node0 96 1.5 1.4 -0.1 0.3 0.6 0.3
node0 112 0.8 0.8 0.0 0.2 0.4 0.2
node0 128 0.4 0.4 0.0 0.1 0.2 0.1
node1 48 0.7 0.6 -0.1 0.3 0.6 0.3
node1 64 0.7 0.6 -0.1 0.3 0.6 0.3
node1 80 1.4 1.2 -0.1 0.3 0.6 0.3
node1 96 1.4 1.3 -0.2 0.3 0.6 0.3
node1 112 0.8 0.7 -0.1 0.2 0.3 0.2
node1 128 0.4 0.4 0.0 0.1 0.2 0.1
Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
2 sockets * 32 cores * 2 threads = 128 CPUs
update_load_avg profile% update_cfs_group profile%
affinity nr_threads base test diff base test diff
unbound 48 0.4 0.4 0.0 0.4 0.5 0.1
unbound 64 0.5 0.5 0.0 0.5 0.6 0.1
unbound 80 2.0 1.8 -0.2 2.7 2.4 -0.3
unbound 96 3.3 2.8 -0.5 3.6 3.3 -0.3
unbound 112 2.8 2.6 -0.2 4.1 3.3 -0.8
unbound 128 0.4 0.4 0.0 0.4 0.4 0.1
node0 24 0.4 0.4 0.0 0.3 0.5 0.2
node0 32 0.5 0.5 0.0 0.3 0.4 0.2
node0 40 1.0 1.1 0.1 0.7 0.8 0.1
node0 48 1.5 1.6 0.1 0.8 0.9 0.1
node0 56 1.8 1.9 0.1 0.8 0.9 0.1
node0 64 0.4 0.4 0.0 0.2 0.4 0.1
node1 24 0.4 0.5 0.0 0.3 0.5 0.2
node1 32 0.4 0.5 0.0 0.3 0.5 0.2
node1 40 1.0 1.1 0.0 0.7 0.8 0.1
node1 48 1.6 1.6 0.1 0.8 0.9 0.1
node1 56 1.8 1.9 0.1 0.8 0.9 0.1
node1 64 0.4 0.4 0.0 0.2 0.4 0.1
On Thu, Apr 20, 2023 at 04:52:01PM -0400, Daniel Jordan wrote:
> On Wed, Apr 12, 2023 at 02:07:36PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 30, 2023 at 01:45:57PM -0400, Daniel Jordan wrote:
> >
> > > The topology of my machine is different from yours, but it's the biggest
> > > I have, and I'm assuming cpu count is more important than topology when
> > > reproducing the remote accesses. I also tried on
> >
> > Core count definitely matters some, but the thing that really hurts is
> > the cross-node (and cross-cache, which for intel happens to be the same
> > set) atomics.
> >
> > I suppose the thing to measure is where this cost rises most sharply on
> > the AMD platforms -- is that cross LLC or cross Node?
> >
> > I mean, setting up the split at boot time is fairly straight forward and
> > we could equally well split at LLC.
>
> To check the cross LLC case, I bound all postgres and sysbench tasks to
> a node. The two functions aren't free then on either AMD or Intel,
> multiple LLCs or not, but the pain is a bit greater in the cross node
> (unbound) case.
>
> The read side (update_cfs_group) gets more expensive with per-node tg
> load_avg on AMD, especially cross node--those are the biggest diffs.
>
> These are more containerized sysbench runs, just the same as before.
> Base is 6.2, test is 6.2 plus this RFC. Each number under base or test
> is the average over ten runs of the profile percent of the function
> measured for 5 seconds, 60 seconds into the run. I ran the experiment a
> second time, and the numbers were fairly similar to what's below.
>
> AMD EPYC 7J13 64-Core Processor (NPS1)
> 2 sockets * 64 cores * 2 threads = 256 CPUs
>
> update_load_avg profile% update_cfs_group profile%
> affinity nr_threads base test diff base test diff
> unbound 96 0.7 0.6 -0.1 0.3 0.6 0.4
> unbound 128 0.8 0.7 0.0 0.3 0.7 0.4
> unbound 160 2.4 1.7 -0.7 1.2 2.3 1.1
> unbound 192 2.3 1.7 -0.6 0.9 2.4 1.5
> unbound 224 0.9 0.9 0.0 0.3 0.6 0.3
> unbound 256 0.4 0.4 0.0 0.1 0.2 0.1
Is it possible to show per-node profile for the two functions? I wonder
how the per-node profile changes with and without this patch on Milan.
And for the vanilla kernel, it would be good to know on which node the struct
task_group is allocated. I used the below script to fetch this info:
kretfunc:sched_create_group
{
$root = kaddr("root_task_group");
if (args->parent == $root) {
return;
}
printf("cpu%d, node%d: tg=0x%lx, parent=%s\n", cpu, numaid,
retval, str(args->parent->css.cgroup->kn->name));
}
BTW, is the score (transactions) of the workload stable? If so, how does the
score change when the patch is applied?
> node0 48 0.7 0.6 -0.1 0.3 0.6 0.3
> node0 64 0.7 0.7 -0.1 0.3 0.6 0.3
> node0 80 1.4 1.3 -0.1 0.3 0.6 0.3
> node0 96 1.5 1.4 -0.1 0.3 0.6 0.3
> node0 112 0.8 0.8 0.0 0.2 0.4 0.2
> node0 128 0.4 0.4 0.0 0.1 0.2 0.1
> node1 48 0.7 0.6 -0.1 0.3 0.6 0.3
> node1 64 0.7 0.6 -0.1 0.3 0.6 0.3
> node1 80 1.4 1.2 -0.1 0.3 0.6 0.3
> node1 96 1.4 1.3 -0.2 0.3 0.6 0.3
> node1 112 0.8 0.7 -0.1 0.2 0.3 0.2
> node1 128 0.4 0.4 0.0 0.1 0.2 0.1
I can see why the cost of update_cfs_group() slightly increased: since
there is no cross node access to tg->load_avg here, the patched kernel
doesn't provide any benefit but only incurs some overhead due to the
indirect access to tg->load_avg. But why did update_load_avg()'s cost
drop? I expected it to be roughly the same or slightly increased after the patch.
>
> Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
> 2 sockets * 32 cores * 2 thread = 128 CPUs
>
> update_load_avg profile% update_cfs_group profile%
> affinity nr_threads base test diff base test diff
> unbound 48 0.4 0.4 0.0 0.4 0.5 0.1
> unbound 64 0.5 0.5 0.0 0.5 0.6 0.1
> unbound 80 2.0 1.8 -0.2 2.7 2.4 -0.3
> unbound 96 3.3 2.8 -0.5 3.6 3.3 -0.3
> unbound 112 2.8 2.6 -0.2 4.1 3.3 -0.8
> unbound 128 0.4 0.4 0.0 0.4 0.4 0.1
This is in line with my test on SPR, just the cost is much lower on
Icelake.
> node0 24 0.4 0.4 0.0 0.3 0.5 0.2
> node0 32 0.5 0.5 0.0 0.3 0.4 0.2
> node0 40 1.0 1.1 0.1 0.7 0.8 0.1
> node0 48 1.5 1.6 0.1 0.8 0.9 0.1
> node0 56 1.8 1.9 0.1 0.8 0.9 0.1
> node0 64 0.4 0.4 0.0 0.2 0.4 0.1
> node1 24 0.4 0.5 0.0 0.3 0.5 0.2
> node1 32 0.4 0.5 0.0 0.3 0.5 0.2
> node1 40 1.0 1.1 0.0 0.7 0.8 0.1
> node1 48 1.6 1.6 0.1 0.8 0.9 0.1
> node1 56 1.8 1.9 0.1 0.8 0.9 0.1
> node1 64 0.4 0.4 0.0 0.2 0.4 0.1
And the slight increase on both the read side and the write side seems to
suggest it is due to the indirect access introduced in this patch,
especially on the read side where a summation over all nodes' values is
performed; that's probably why the read side's increase is larger: 0.1 - 0.2
vs 0.0 - 0.1.
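(For reference, the read side with the per-node counters has to do a
summation along these lines -- a sketch based on the node_info layout in
this RFC, not necessarily the exact code:

static inline long tg_load_avg(struct task_group *tg)
{
	long load_avg = 0;
	int node;

	/* the single atomic read becomes a loop over all nodes */
	for_each_node(node)
		load_avg += atomic_long_read(&tg->node_info[node]->load_avg);

	return load_avg;
}

and that loop is what update_cfs_group() now pays for on every call.)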
Thanks for sharing these data.
On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
>
> The reason why only cpus of one node has bigger overhead is: task_group
> is allocated on demand from a slab and whichever cpu happens to do the
> allocation, the allocated tg will be located on that node and accessing
> to tg->load_avg will have a lower cost for cpus on the same node and
> a higer cost for cpus of the remote node.
[...]
> static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> {
> long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
> + int node = cpu_to_node(cfs_rq->rq->cpu);
>
> /*
> * No need to update load_avg for root_task_group as it is not used.
> @@ -3616,7 +3617,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> return;
>
> if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
> - atomic_long_add(delta, &cfs_rq->tg->load_avg);
> + atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
When entering enqueue_entity(cfs_rq, se) -> update_tg_load_avg(cfs_rq),
cfs_rq->rq->cpu is not necessarily the current cpu, so the node returned by
cpu_to_node(cfs_rq->rq->cpu) is not necessarily the node of the current
CPU. Would atomic_add still introduce cross-node overhead due to remote access
to cfs_rq->tg->node_info[node]->load_avg in this case, or did I miss something?
thanks,
Chenyu
On Sat, Apr 22, 2023 at 12:01:59PM +0800, Chen Yu wrote:
> On 2023-03-27 at 13:39:55 +0800, Aaron Lu wrote:
> >
> > The reason why only cpus of one node has bigger overhead is: task_group
> > is allocated on demand from a slab and whichever cpu happens to do the
> > allocation, the allocated tg will be located on that node and accessing
> > to tg->load_avg will have a lower cost for cpus on the same node and
> > a higer cost for cpus of the remote node.
> [...]
> > static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> > {
> > long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
> > + int node = cpu_to_node(cfs_rq->rq->cpu);
> >
> > /*
> > * No need to update load_avg for root_task_group as it is not used.
> > @@ -3616,7 +3617,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
> > return;
> >
> > if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
> > - atomic_long_add(delta, &cfs_rq->tg->load_avg);
> > + atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
> When entered enqueue_entity(cfs_rq, se) -> update_tg_load_avg(cfs_rq)
> the cfs_rq->rq->cpu is not necessary the current cpu, so the node returned by
> cpu_to_node(cfs_rq->rq->cpu) is not necessary the current node as the current
> CPU, would atomic_add still introduce cross-node overhead due to remote access
> to cfs_rq->tg->node_info[node]->load_avg in this case, or do I miss something?
That's a good point.
The chance of the cpu being different is high, but the chance of the node
being different is pretty low. The wakeup path will not do cross-LLC
activation with TTWU_QUEUE, but the load balance path does make it possible
to dequeue a task that is on a remote node, and I think that's the only path
where the two cpus (current vs cfs_rq->rq->cpu) can be on different nodes.
A quick test shows that during a 5s window of running hackbench on a VM, the
number of times the two nodes derived from current and cfs_rq->rq->cpu were
different was less than 100, while the equal case was seen 992548 times.
I'll switch to using smp_processor_id() in the next posting since it's the
right thing to do, thanks for spotting this!
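i.e. something like this on top of the RFC (untested sketch):

 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
-	int node = cpu_to_node(cfs_rq->rq->cpu);
+	int node = cpu_to_node(smp_processor_id());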
On Fri, Apr 21, 2023 at 11:05:59PM +0800, Aaron Lu wrote:
> On Thu, Apr 20, 2023 at 04:52:01PM -0400, Daniel Jordan wrote:
> > AMD EPYC 7J13 64-Core Processor (NPS1)
> > 2 sockets * 64 cores * 2 threads = 256 CPUs
> >
> > update_load_avg profile% update_cfs_group profile%
> > affinity nr_threads base test diff base test diff
> > unbound 96 0.7 0.6 -0.1 0.3 0.6 0.4
> > unbound 128 0.8 0.7 0.0 0.3 0.7 0.4
> > unbound 160 2.4 1.7 -0.7 1.2 2.3 1.1
> > unbound 192 2.3 1.7 -0.6 0.9 2.4 1.5
> > unbound 224 0.9 0.9 0.0 0.3 0.6 0.3
> > unbound 256 0.4 0.4 0.0 0.1 0.2 0.1
>
> Is it possible to show per-node profile for the two functions? I wonder
> how the per-node profile changes with and without this patch on Milan.
> And for vanilla kernel, it would be good to know on which node the struct
> task_group is allocated. I used below script to fetch this info:
> kretfunc:sched_create_group
> {
> $root = kaddr("root_task_group");
> if (args->parent == $root) {
> return;
> }
>
> printf("cpu%d, node%d: tg=0x%lx, parent=%s\n", cpu, numaid,
> retval, str(args->parent->css.cgroup->kn->name));
> }
That's helpful, nid below comes from this. The node happened to be different
between the base and test kernels on both machines, so that's one less way the
experiment is controlled, but for the unbound case, where tasks are presumably
spread fairly evenly, I'm not sure how much it matters, especially given that
the per-node profile numbers are fairly close to each other.
Data below, same parameters and times as the last mail.
> BTW, is the score(transactions) of the workload stable? If so, how the
> score change when the patch is applied?
Transactions seem to be mostly stable but unfortunately regress overall on both
machines.
FWIW, t-test compares the two sets of ten iterations apiece. The higher the
percentage, the higher the confidence that the difference is significant.
AMD EPYC 7J13 64-Core Processor (NPS1)
2 sockets * 64 cores * 2 threads = 256 CPUs
transactions per second
diff base test
----------------- ------------------ ------------------
tps tps
affinity nr_threads (%diff) (t-test) tps std% nid tps std% nid
unbound 96 -0.8% 100% 128,450 0% 1 127,433 0% 0
unbound 128 -1.0% 100% 138,471 0% 1 137,099 0% 0
unbound 160 -1.2% 100% 136,829 0% 1 135,170 0% 0
unbound 192 0.4% 95% 152,767 0% 1 153,336 0% 0
unbound 224 -0.2% 81% 179,946 0% 1 179,620 0% 0
unbound 256 -0.2% 71% 203,920 0% 1 203,583 0% 0
node0 48 0.1% 46% 69,635 0% 0 69,719 0% 0
node0 64 -0.1% 69% 75,213 0% 0 75,163 0% 0
node0 80 -0.4% 100% 72,520 0% 0 72,217 0% 0
node0 96 -0.2% 89% 81,345 0% 0 81,210 0% 0
node0 112 -0.3% 98% 96,174 0% 0 95,855 0% 0
node0 128 -0.7% 100% 111,813 0% 0 111,045 0% 0
node1 48 0.3% 78% 69,985 1% 1 70,200 1% 1
node1 64 0.6% 100% 75,770 0% 1 76,231 0% 1
node1 80 0.3% 100% 73,329 0% 1 73,567 0% 1
node1 96 0.4% 99% 82,222 0% 1 82,556 0% 1
node1 112 0.1% 62% 96,573 0% 1 96,689 0% 1
node1 128 -0.2% 69% 111,614 0% 1 111,435 0% 1
update_load_avg profile%
all_nodes node0 node1
---------------- ---------------- ----------------
affinity nr_threads base test diff base test diff base test diff
unbound 96 0.7 0.6 -0.1 0.7 0.6 -0.1 0.7 0.6 -0.1
unbound 128 0.8 0.7 -0.1 0.8 0.7 -0.1 0.8 0.7 -0.1
unbound 160 2.3 1.7 -0.7 2.5 1.7 -0.8 2.2 1.6 -0.5
unbound 192 2.2 1.6 -0.6 2.5 1.8 -0.7 2.0 1.4 -0.6
unbound 224 0.9 0.8 -0.1 1.1 0.7 -0.3 0.8 0.8 0.0
unbound 256 0.4 0.4 0.0 0.4 0.4 0.0 0.4 0.4 0.0
node0 48 0.7 0.6 -0.1
node0 64 0.8 0.7 -0.2
node0 80 2.0 1.4 -0.7
node0 96 2.3 1.4 -0.9
node0 112 1.0 0.8 -0.2
node0 128 0.5 0.4 0.0
node1 48 0.7 0.6 -0.1
node1 64 0.8 0.6 -0.1
node1 80 1.4 1.2 -0.2
node1 96 1.5 1.3 -0.2
node1 112 0.8 0.7 -0.1
node1 128 0.4 0.4 -0.1
update_cfs_group profile%
all_nodes node0 node1
---------------- ---------------- ----------------
affinity nr_threads base test diff base test diff base test diff
unbound 96 0.3 0.6 0.3 0.3 0.6 0.3 0.3 0.6 0.3
unbound 128 0.3 0.6 0.3 0.3 0.6 0.3 0.3 0.7 0.4
unbound 160 1.1 2.5 1.4 1.3 2.2 0.9 0.9 2.8 1.9
unbound 192 0.9 2.6 1.7 1.1 2.4 1.3 0.7 2.8 2.1
unbound 224 0.3 0.8 0.5 0.4 0.6 0.3 0.2 0.9 0.6
unbound 256 0.1 0.2 0.1 0.1 0.2 0.1 0.1 0.2 0.1
node0 48 0.4 0.6 0.2
node0 64 0.3 0.6 0.3
node0 80 0.7 0.6 -0.1
node0 96 0.6 0.6 0.0
node0 112 0.3 0.4 0.1
node0 128 0.1 0.2 0.1
node1 48 0.3 0.6 0.3
node1 64 0.3 0.6 0.3
node1 80 0.3 0.6 0.3
node1 96 0.3 0.6 0.3
node1 112 0.2 0.3 0.2
node1 128 0.1 0.2 0.1
Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
2 sockets * 32 cores * 2 threads = 128 CPUs
transactions per second
diff base test
----------------- ------------------ ------------------
tps tps
affinity nr_threads (%diff) (t-test) tps std% nid tps std% nid
unbound 48 -0.9% 100% 75,500 0% 1 74,834 0% 0
unbound 64 -0.4% 100% 81,687 0% 1 81,368 0% 0
unbound 80 -0.4% 100% 78,620 0% 1 78,281 0% 0
unbound 96 -0.5% 74% 78,949 1% 1 78,580 1% 0
unbound 112 -2.9% 87% 94,189 3% 1 91,458 5% 0
unbound 128 -1.4% 100% 117,557 0% 1 115,921 0% 0
node0 24 -0.7% 100% 38,601 0% 0 38,333 0% 0
node0 32 -1.2% 100% 41,539 0% 0 41,038 0% 0
node0 40 -1.6% 100% 42,325 0% 0 41,662 0% 0
node0 48 -1.3% 100% 41,956 0% 0 41,404 0% 0
node0 56 -1.3% 100% 42,115 0% 0 41,569 0% 0
node0 64 -1.0% 100% 62,431 0% 0 61,784 0% 0
node1 24 0.0% 1% 38,752 0% 1 38,752 0% 1
node1 32 0.9% 100% 42,568 0% 1 42,943 0% 1
node1 40 -0.2% 87% 43,452 0% 1 43,358 0% 1
node1 48 -0.5% 100% 43,047 0% 1 42,831 0% 1
node1 56 -0.5% 100% 43,464 0% 1 43,259 0% 1
node1 64 0.5% 100% 64,111 0% 1 64,450 0% 1
update_load_avg profile%
all_nodes node0 node1
---------------- ---------------- ----------------
affinity nr_threads base test diff base test diff base test diff
unbound 48 0.5 0.5 0.0 0.5 0.5 0.0 0.4 0.5 0.0
unbound 64 0.5 0.5 0.0 0.5 0.5 0.0 0.5 0.5 0.0
unbound 80 2.0 1.8 -0.3 2.0 1.7 -0.3 2.0 1.8 -0.2
unbound 96 3.4 2.8 -0.6 3.4 2.8 -0.6 3.4 2.9 -0.5
unbound 112 2.5 2.3 -0.1 4.5 3.8 -0.8 0.5 0.9 0.5
unbound 128 0.4 0.5 0.0 0.4 0.4 0.0 0.5 0.5 0.1
node0 24 0.4 0.5 0.0
node0 32 0.5 0.5 0.0
node0 40 1.0 1.1 0.1
node0 48 1.5 1.6 0.1
node0 56 1.8 1.9 0.1
node0 64 0.4 0.4 0.0
node1 24 0.5 0.4 0.0
node1 32 0.5 0.4 0.0
node1 40 1.0 1.1 0.0
node1 48 1.6 1.6 0.1
node1 56 1.9 1.9 0.0
node1 64 0.4 0.4 -0.1
update_cfs_group profile%
all_nodes node0 node1
---------------- ---------------- ----------------
affinity nr_threads base test diff base test diff base test diff
unbound 48 0.3 0.5 0.2 0.3 0.5 0.2 0.3 0.5 0.2
unbound 64 0.5 0.6 0.1 0.5 0.6 0.1 0.5 0.6 0.1
unbound 80 2.8 2.5 -0.3 2.6 2.4 -0.2 2.9 2.5 -0.5
unbound 96 3.7 3.3 -0.4 3.5 3.3 -0.2 3.9 3.3 -0.6
unbound 112 4.2 3.2 -1.0 4.1 3.3 -0.7 4.4 3.1 -1.2
unbound 128 0.4 0.5 0.1 0.4 0.5 0.1 0.4 0.5 0.1
node0 24 0.3 0.5 0.2
node0 32 0.3 0.4 0.1
node0 40 0.7 0.8 0.1
node0 48 0.8 0.9 0.1
node0 56 0.8 0.9 0.1
node0 64 0.2 0.4 0.1
node1 24 0.3 0.5 0.2
node1 32 0.3 0.5 0.2
node1 40 0.8 0.9 0.1
node1 48 0.8 0.9 0.1
node1 56 0.9 0.9 0.1
node1 64 0.2 0.4 0.1
There doesn't seem to be much of a pattern in the per-node breakdown.
Sometimes there's a bit more overhead on the node remote to the task_group
allocation than the node local to it, like I'd expect, and sometimes it's the
opposite. Generally pretty even.
> > node0 48 0.7 0.6 -0.1 0.3 0.6 0.3
> > node0 64 0.7 0.7 -0.1 0.3 0.6 0.3
> > node0 80 1.4 1.3 -0.1 0.3 0.6 0.3
> > node0 96 1.5 1.4 -0.1 0.3 0.6 0.3
> > node0 112 0.8 0.8 0.0 0.2 0.4 0.2
> > node0 128 0.4 0.4 0.0 0.1 0.2 0.1
> > node1 48 0.7 0.6 -0.1 0.3 0.6 0.3
> > node1 64 0.7 0.6 -0.1 0.3 0.6 0.3
> > node1 80 1.4 1.2 -0.1 0.3 0.6 0.3
> > node1 96 1.4 1.3 -0.2 0.3 0.6 0.3
> > node1 112 0.8 0.7 -0.1 0.2 0.3 0.2
> > node1 128 0.4 0.4 0.0 0.1 0.2 0.1
>
> I can see why the cost of update_cfs_group() slightly increased since
> now there is no cross node access to tg->load_avg and the patched kernel
> doesn't provide any benefit but only incur some overhead due to indirect
> access to tg->load_avg, but why update_load_avg()'s cost dropped? I
> expect it to be roughly the same after patched or slightly increased.
Yeah, that's not immediately obvious, especially when the Intel machine doesn't
do this.
Hi Daniel,
Thanks a lot for collecting these data.
I had hoped to also share some data I collected on other machines after
seeing your last email, but trying to explain why only SPR showed benefit
has slowed me down. I now have some findings on this, please see below.
On Wed, May 03, 2023 at 03:41:25PM -0400, Daniel Jordan wrote:
> On Fri, Apr 21, 2023 at 11:05:59PM +0800, Aaron Lu wrote:
> > On Thu, Apr 20, 2023 at 04:52:01PM -0400, Daniel Jordan wrote:
> > > AMD EPYC 7J13 64-Core Processor (NPS1)
> > > 2 sockets * 64 cores * 2 threads = 256 CPUs
> > >
> > > update_load_avg profile% update_cfs_group profile%
> > > affinity nr_threads base test diff base test diff
> > > unbound 96 0.7 0.6 -0.1 0.3 0.6 0.4
> > > unbound 128 0.8 0.7 0.0 0.3 0.7 0.4
> > > unbound 160 2.4 1.7 -0.7 1.2 2.3 1.1
> > > unbound 192 2.3 1.7 -0.6 0.9 2.4 1.5
> > > unbound 224 0.9 0.9 0.0 0.3 0.6 0.3
> > > unbound 256 0.4 0.4 0.0 0.1 0.2 0.1
> >
> > Is it possible to show per-node profile for the two functions? I wonder
> > how the per-node profile changes with and without this patch on Milan.
> > And for vanilla kernel, it would be good to know on which node the struct
> > task_group is allocated. I used below script to fetch this info:
> > kretfunc:sched_create_group
> > {
> > $root = kaddr("root_task_group");
> > if (args->parent == $root) {
> > return;
> > }
> >
> > printf("cpu%d, node%d: tg=0x%lx, parent=%s\n", cpu, numaid,
> > retval, str(args->parent->css.cgroup->kn->name));
> > }
>
> That's helpful, nid below comes from this. The node happened to be different
> between base and test kernels on both machines, so that's one less way the
> experiment is controlled but for the unbound case where tasks are presumably
> spread fairly evenly I'm not sure how much it matters, especially given that
> the per-node profile numbers are fairly close to each other.
>
>
> Data below, same parameters and times as the last mail.
>
> > BTW, is the score(transactions) of the workload stable? If so, how the
> > score change when the patch is applied?
>
> Transactions seem to be mostly stable but unfortunately regress overall on both
> machines.
Yeah, I noticed your result is pretty stable in that the stddev% is
mostly zero. Mine are not that stable though. And it looks like there
are some wins in the node1 case :)
> FWIW, t-test compares the two sets of ten iterations apiece. The higher the
> percentage, the higher the confidence that the difference is significant.
>
>
> AMD EPYC 7J13 64-Core Processor (NPS1)
> 2 sockets * 64 cores * 2 threads = 256 CPUs
>
> transactions per second
>
> diff base test
> ----------------- ------------------ ------------------
> tps tps
> affinity nr_threads (%diff) (t-test) tps std% nid tps std% nid
> unbound 96 -0.8% 100% 128,450 0% 1 127,433 0% 0
> unbound 128 -1.0% 100% 138,471 0% 1 137,099 0% 0
> unbound 160 -1.2% 100% 136,829 0% 1 135,170 0% 0
> unbound 192 0.4% 95% 152,767 0% 1 153,336 0% 0
> unbound 224 -0.2% 81% 179,946 0% 1 179,620 0% 0
> unbound 256 -0.2% 71% 203,920 0% 1 203,583 0% 0
> node0 48 0.1% 46% 69,635 0% 0 69,719 0% 0
> node0 64 -0.1% 69% 75,213 0% 0 75,163 0% 0
> node0 80 -0.4% 100% 72,520 0% 0 72,217 0% 0
> node0 96 -0.2% 89% 81,345 0% 0 81,210 0% 0
> node0 112 -0.3% 98% 96,174 0% 0 95,855 0% 0
> node0 128 -0.7% 100% 111,813 0% 0 111,045 0% 0
> node1 48 0.3% 78% 69,985 1% 1 70,200 1% 1
> node1 64 0.6% 100% 75,770 0% 1 76,231 0% 1
> node1 80 0.3% 100% 73,329 0% 1 73,567 0% 1
> node1 96 0.4% 99% 82,222 0% 1 82,556 0% 1
> node1 112 0.1% 62% 96,573 0% 1 96,689 0% 1
> node1 128 -0.2% 69% 111,614 0% 1 111,435 0% 1
>
> update_load_avg profile%
>
> all_nodes node0 node1
> ---------------- ---------------- ----------------
> affinity nr_threads base test diff base test diff base test diff
> unbound 96 0.7 0.6 -0.1 0.7 0.6 -0.1 0.7 0.6 -0.1
> unbound 128 0.8 0.7 -0.1 0.8 0.7 -0.1 0.8 0.7 -0.1
> unbound 160 2.3 1.7 -0.7 2.5 1.7 -0.8 2.2 1.6 -0.5
> unbound 192 2.2 1.6 -0.6 2.5 1.8 -0.7 2.0 1.4 -0.6
> unbound 224 0.9 0.8 -0.1 1.1 0.7 -0.3 0.8 0.8 0.0
> unbound 256 0.4 0.4 0.0 0.4 0.4 0.0 0.4 0.4 0.0
> node0 48 0.7 0.6 -0.1
> node0 64 0.8 0.7 -0.2
> node0 80 2.0 1.4 -0.7
> node0 96 2.3 1.4 -0.9
> node0 112 1.0 0.8 -0.2
> node0 128 0.5 0.4 0.0
> node1 48 0.7 0.6 -0.1
> node1 64 0.8 0.6 -0.1
> node1 80 1.4 1.2 -0.2
> node1 96 1.5 1.3 -0.2
> node1 112 0.8 0.7 -0.1
> node1 128 0.4 0.4 -0.1
>
> update_cfs_group profile%
>
> all_nodes node0 node1
> ---------------- ---------------- ----------------
> affinity nr_threads base test diff base test diff base test diff
> unbound 96 0.3 0.6 0.3 0.3 0.6 0.3 0.3 0.6 0.3
> unbound 128 0.3 0.6 0.3 0.3 0.6 0.3 0.3 0.7 0.4
> unbound 160 1.1 2.5 1.4 1.3 2.2 0.9 0.9 2.8 1.9
> unbound 192 0.9 2.6 1.7 1.1 2.4 1.3 0.7 2.8 2.1
> unbound 224 0.3 0.8 0.5 0.4 0.6 0.3 0.2 0.9 0.6
> unbound 256 0.1 0.2 0.1 0.1 0.2 0.1 0.1 0.2 0.1
> node0 48 0.4 0.6 0.2
> node0 64 0.3 0.6 0.3
> node0 80 0.7 0.6 -0.1
> node0 96 0.6 0.6 0.0
> node0 112 0.3 0.4 0.1
> node0 128 0.1 0.2 0.1
> node1 48 0.3 0.6 0.3
> node1 64 0.3 0.6 0.3
> node1 80 0.3 0.6 0.3
> node1 96 0.3 0.6 0.3
> node1 112 0.2 0.3 0.2
> node1 128 0.1 0.2 0.1
>
update_load_avg()'s cost dropped while update_cfs_group()'s cost
increased. I think this is reasonable since the write side only has to
deal with local data now while the read side has to iterate the per-node
tg->load_avg on all nodes.
>
> Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz
> 2 sockets * 32 cores * 2 thread = 128 CPUs
>
> transactions per second
>
> diff base test
> ----------------- ------------------ ------------------
> tps tps
> affinity nr_threads (%diff) (t-test) tps std% nid tps std% nid
> unbound 48 -0.9% 100% 75,500 0% 1 74,834 0% 0
> unbound 64 -0.4% 100% 81,687 0% 1 81,368 0% 0
> unbound 80 -0.4% 100% 78,620 0% 1 78,281 0% 0
> unbound 96 -0.5% 74% 78,949 1% 1 78,580 1% 0
> unbound 112 -2.9% 87% 94,189 3% 1 91,458 5% 0
> unbound 128 -1.4% 100% 117,557 0% 1 115,921 0% 0
> node0 24 -0.7% 100% 38,601 0% 0 38,333 0% 0
> node0 32 -1.2% 100% 41,539 0% 0 41,038 0% 0
> node0 40 -1.6% 100% 42,325 0% 0 41,662 0% 0
> node0 48 -1.3% 100% 41,956 0% 0 41,404 0% 0
> node0 56 -1.3% 100% 42,115 0% 0 41,569 0% 0
> node0 64 -1.0% 100% 62,431 0% 0 61,784 0% 0
> node1 24 0.0% 1% 38,752 0% 1 38,752 0% 1
> node1 32 0.9% 100% 42,568 0% 1 42,943 0% 1
> node1 40 -0.2% 87% 43,452 0% 1 43,358 0% 1
> node1 48 -0.5% 100% 43,047 0% 1 42,831 0% 1
> node1 56 -0.5% 100% 43,464 0% 1 43,259 0% 1
> node1 64 0.5% 100% 64,111 0% 1 64,450 0% 1
This looks like mostly a loss for Icelake.
I also tested on the same Icelake 8358 and my result is not entirely
the same as yours:
nr_thread=128
score(tps) update_cfs_group% update_load_avg%
6.2.0 97418±0.17% 0.50% - 0.74% 0.69% - 0.93%
this_patch 97029±0.32% 0.68% - 0.89% 0.70% - 0.89%
For the above nr_thread=128 unbound case, the score (tps) is in the noise
range, instead of the 1.4% loss in your run. Profile wise, the write
side's cost slightly dropped while the read side's cost slightly
increased. Overall, no big change for nr_thread=128 on this Icelake.
I think this is also expected since in the nr_thread=128 case there are
very few migrations on wake up, because cpu utilization is almost 100%, so
this patch shouldn't make an obvious difference.
nr_thread=96
score(tps) update_cfs_group% update_load_avg%
6.2.0 59183±0.21% 2.81% - 3.57% 3.48% - 3.76%
this_patch 58397±0.35% 2.70% - 3.01% 2.82% - 3.24%
For this case, there are enough task migrations on wakeup for this patch
to make a difference: the tps dropped about 1.3%, worse than your run.
Profile wise, both the write side and the read side dropped, but these
drops do not translate to performance gains. Judging from the profile, this
patch is doing something good; it's just that the tps suggests otherwise.
On another 2S, 224 cpu Sapphire Rapids:
nr_thread=224
score update_cfs_group% update_load_avg%
6.2.0 93504±4.79% 11.63% - 15.12% 7.00% - 10.31%
this_patch 103040±0.46% 7.08% - 9.08% 4.82% - 6.73%
The above is where this patch helps the most; both profile and score
show improvement. My finding about why only SPR shows benefit is that, I
think, it has something to do with SPR's "Ingress Queue overflow": when
many cpus access the same cache line and that overflow happens,
all the accessing cpus have their memory operations slowed down.
This is described in section 3.11 of Intel's optimization reference
manual.
To confirm the above explanation, I did the nr_thread=96 run on SPR. To
make sure task migration still happens in this case, some cpus are
offlined and only 128 cpus are left. With fewer threads, the chance of
ingress queue overflow is much lower:
nr_thread=96 with cpu offlined to 128c left (32cores/64cpus on each socket)
score update_cfs_group% update_load_avg%
6.2.0 74878±0.58% 3.47% - 4.90% 3.59% - 4.42%
this_patch 75671±0.24% 2.66% - 3.55% 2.71% - 3.44%
Profile wise, the two functions dropped to near the Icelake level, still
higher than Icelake but much better than the nr_thread=224 case. When
comparing the baseline with this patch, it follows what I saw on Icelake:
the two functions' cost dropped but that did not translate to a
performance increase (the average is slightly higher but it's in the noise range).
Based on my current understanding, the summary is:
- Running this workload with nr_thread=224 on SPR, the ingress queue
will overflow and that will slow things down. This patch helps
performance mainly because it transforms the "many cpus accessing the
same cacheline" scenario to "many cpus accessing two cachelines", which
reduces the likelihood of ingress queue overflow and thus
helps performance;
- On Icelake with nr_threads high, but not so high as to cause
100% cpu utilization, the two functions' cost will drop a little but
performance did not improve (it actually regressed a little);
- On SPR when there is no ingress queue overflow, it's similar to
Icelake: the two functions' cost will drop but performance did not
improve.
On Thu, May 04, 2023 at 06:27:46PM +0800, Aaron Lu wrote:
> Base on my current understanding, the summary is:
> - Running this workload with nr_thread=224 on SPR, the ingress queue
> will overflow and that will slow things down. This patch helps
> performance mainly because it transform the "many cpus accessing the
> same cacheline" scenario to "many cpus accessing two cachelines" and
> that can reduce the likelyhood of ingress queue overflow and thus,
> helps performance;
> - On Icelake with high nr_threads but not too high that would cause
> 100% cpu utilization, the two functions' cost will drop a little but
> performance did not improve(it actually regressed a little);
> - On SPR when there is no ingress queue overflow, it's similar to
> Icelake: the two functions' cost will drop but performance did not
> improve.
More results when running hackbench and netperf on Sapphire Rapids as
well as on 2 sockets Icelake and 2 sockets Cascade Lake.
The summary is:
- on SPR, hackbench time reduced ~8% and netperf(UDP_RR/nr_thread=100%)
performance increased ~50%;
- on Icelake, performance regressed about 1%-2% for postgres_sysbench
and hackbench, netperf has no performance change;
- on Cascade Lake, netperf/UDP_RR/nr_thread=50% sees performance drop
~3%; others have no performance change.
Together with the results kindly collected by Daniel, it looks like this
patch helps SPR the most, while for other machines it is either flat or
regresses 1%-3% for some workloads. With these results, I'm thinking of an
alternative solution to reduce the cost of accessing tg->load_avg.
There are two main reasons to access tg->load_avg. One is driven by
pelt decay, which has a fixed frequency and is not a concern; the other
is enqueue_entity()/dequeue_entity() triggered by task migration. The
number of migrations can be unbounded, so the accesses to tg->load_avg
can be huge due to this. This frequent task migration is the problem for
tg->load_avg. One thing I noticed is that, on task migration, the load is
carried from the old per-cpu cfs_rq to the new per-cpu cfs_rq. While
each cfs_rq's load_avg and tg_load_avg_contrib should change accordingly
to reflect this, so that its corresponding sched entity can get a correct
weight, the task group's load_avg should stay unchanged. So instead of
removing a delta from tg->load_avg for the src cfs_rq and then adding the
same delta back for the target cfs_rq, the two updates to tg->load_avg
could be avoided entirely. With this change, the updates to tg->load_avg
would be greatly reduced, the problem should be solved, and it is likely
to be a win for most machines/workloads. Not sure if I understand this
correctly? I'm going to pursue a solution based on this, feel free to
let me know if you see anything wrong here, thanks.
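As a very rough illustration of the idea (not actual code; the "migrating"
flag is made up here and the real series will surely look different), the
migration paths could simply skip the tg-wide update:

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, bool migrating)
{
	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

	/* No need to update load_avg for root_task_group as it is not used. */
	if (cfs_rq->tg == &root_task_group)
		return;

	/*
	 * On migration the load only moves between two cfs_rqs of the same
	 * task group, so the tg-wide sum is unchanged; skip the atomic and
	 * let a later, pelt-driven update sync tg_load_avg_contrib.
	 */
	if (migrating)
		return;

	if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
		atomic_long_add(delta, &cfs_rq->tg->load_avg);
		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
	}
}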
Below are the test result details of the current patch.
=======================================================================
Details for SPR(2 sockets, 96cores, 192cpus):
- postgres_sysbench score increased 6.5%;
- hackbench(threads, pipe) time reduced to 41s from 45s(less is better);
- netperf(UDP_RR,nr_thread=100%=nr_cpu) throughput increased from 10105
to 15121.
postgres_sysbench:
nr_thread=192
score update_cfs_group% update_load_avg%
6.2.0 92440±2.62% 8.11% - 13.48% 7.07% - 9.54%
this_patch 98425±0.62% 5.73% - 7.56% 4.47% - 5.96%
note: performance increased 6.5% and the two functions' cost also
dropped.
nr_thread=96 with cpu offlined to 128c (2 sockets/64cores)
score update_cfs_group% update_load_avg%
6.2.0 75726±0.12% 3.56% - 4.49% 3.58% - 4.42%
this_patch 76736±0.17% 2.95% - 3.32% 2.80% - 3.29%
note: this test is mainly to see whether the performance increase is due to
ingress queue overflow or not, and the result suggests the performance
increase on SPR is mainly due to ingress queue overflow.
hackbench(threads, pipe, groups=10, fds=20, 400 tasks):
time update_cfs_group% update_load_avg%
6.2.0 45.51±0.36% 12.68% - 20.22% 7.73% - 11.01%
this_patch 41.41±0.43% 7.73% - 13.15% 4.31% - 6.91%
note: there is a clear split between the profiles on node 0 and node 1 -
e.g. on v6.2.0, the cost of update_cfs_group()% is about 13% on node0 and 20% on node 1;
on the patched kernel, the cost of update_cfs_group()% is about 8% on node0 and 12% on node 1;
update_load_avg() is similar.
netperf(UDP_RR, nr_thread=100%=192):
throughput update_cfs_group% update_load_avg%
6.2.0 10105±2.91% 26.43% - 27.90% 17.51% - 18.31%
this_patch 15121±3.25% 25.12% - 26.50% 12.47% - 16.02%
note: performance increased a lot, although the two functions' cost didn't
drop much.
=======================================================================
Details for Icelake (2sockets, 64cores, 128cpus)
- postgres_sysbench:
nr_thread=128 does not show any performance change;
nr_thread=96 performance regressed 1.3% with the patch, though the two
update functions' cost dropped a bit;
- hackbench(pipe/threads):
no obvious performance change with the patch; the two update functions'
cost dropped ~2%;
- netperf(UDP_RR/nr_thread=100%=nr_cpu):
results are in the noise range; results are very unstable on the vanilla
kernel; the two functions' cost dropped somewhat with the patch.
postgres_sysbench:
nr_thread=128
              score           update_cfs_group%   update_load_avg%
6.2.0         97418±0.17%     0.50% - 0.74%       0.69% - 0.93%
this_patch    97029±0.32%     0.68% - 0.89%       0.70% - 0.89%
note: score in noise
nr_thread=96
              score           update_cfs_group%   update_load_avg%
6.2.0         59183±0.21%     2.81% - 3.57%       3.48% - 3.76%
this_patch    58397±0.35%     2.70% - 3.01%       2.82% - 3.24%
note: score is 1.3% worse with the patch.
update_XXX()% dropped but that does not translate to a performance
increase.
hackbench(pipe, threads):
              time            update_cfs_group%   update_load_avg%
6.2.0         41.80±0.65      5.90% - 7.36%       4.37% - 5.28%
this_patch    40.48±1.85      3.36% - 4.34%       2.89% - 3.35%
note: update_XXX()% dropped but that does not translate to a
performance increase.
netperf(UDP_RR, nr_thread=100%=128):
              throughput      update_cfs_group%   update_load_avg%
6.2.0         31146±26%       11% - 33%*          2.30% - 17.7%*
this_patch    24900±2%        14% - 18%           8.67% - 12.03%
note: performance is in the noise;
update_cfs_group()% on vanilla can show a big difference between the two
nodes, and also between different runs;
update_load_avg()%: for some runs, one node shows a very low cost like
2.x% while the other node has 10+%. This is probably because that node's
cpu utilization is approaching 100%, which inhibits task migrations; for
other runs, both nodes have 10+%.
=======================================================================
Details for Cascade Lake(2 sockets, 48cores, 96cpus):
- netperf (TCP_STREAM/UDP_RR):
- Most tests have no performance change;
- UDP_RR/nr_thread=50% sees performance drop about 3% on patched kernel;
- UDP_RR/nr_thread=100%: results are unstable for both kernels.
- hackbench(pipe/threads):
- performance is in the noise range with the patch.
netperf/UDP_RR/nr_thread=100%=96
              Throughput      update_cfs_group%   update_load_avg%
v6.2.0        41593±8%        10.94%±20%          10.23%±27%
this_patch    38603±8         9.53%               8.66%
note: performance is in the noise range; profile-wise, the two
functions' cost becomes stable with the patch.
netperf/UDP_RR/nr_thread=50%=48
              Throughput      update_cfs_group%   update_load_avg%
v6.2.0        70489           0.59±8%             1.60
this_patch    68457 (-2.9%)   1.39                1.62
note: performance dropped ~3%; update_cfs_group()'s cost rises with the
patch.
netperf/TCP_STREAM/nr_thread=100%=96
              Throughput      update_cfs_group%   update_load_avg%
v6.2.0        12011           0.57%               2.45%
this_patch    11743           1.44%               2.30%
note: performance in noise range.
netperf/TCP_STREAM/nr_thread=50%=48
              Throughput      update_cfs_group%   update_load_avg%
v6.2.0        16409±12%       0.20±2%             0.54±2%
this_patch    19295           0.47±4%             0.54±2%
note: result unstable for v6.2.0, performance in noise range.
hackbench/threads/pipe:
              Throughput      update_cfs_group%   update_load_avg%
v6.2.0        306321±12%      2.80±58%            3.07±38%
this_patch    322967±10%      3.60±36%            3.56±30%
On 2023-05-16 at 15:50:11 +0800, Aaron Lu wrote:
> On Thu, May 04, 2023 at 06:27:46PM +0800, Aaron Lu wrote:
> > Based on my current understanding, the summary is:
> > - Running this workload with nr_thread=224 on SPR, the ingress queue
> > will overflow and that will slow things down. This patch helps
> > performance mainly because it transforms the "many cpus accessing the
> > same cacheline" scenario into "many cpus accessing two cachelines",
> > which can reduce the likelihood of ingress queue overflow and thus
> > helps performance;
> > - On Icelake with nr_threads high, but not so high that it causes 100%
> > cpu utilization, the two functions' cost will drop a little but
> > performance did not improve (it actually regressed a little);
> > - On SPR when there is no ingress queue overflow, it's similar to
> > Icelake: the two functions' cost will drop but performance did not
> > improve.
>
> More results when running hackbench and netperf on Sapphire Rapids as
> well as on 2 sockets Icelake and 2 sockets Cascade Lake.
>
> The summary is:
> - on SPR, hackbench time reduced ~8% and netperf(UDP_RR/nr_thread=100%)
> performance increased ~50%;
> - on Icelake, performance regressed about 1%-2% for postgres_sysbench
> and hackbench, netperf has no performance change;
> - on Cascade Lake, netperf/UDP_RR/nr_thread=50% sees performance drop
> ~3%; others have no performance change.
>
> Together with results kindly collected by Daniel, it looks like this
> patch helps most on SPR, while on other machines it is either flat or
> regresses 1%-3% for some workloads. With these results, I'm thinking of
> an alternative solution to reduce the cost of accessing tg->load_avg.
>
> There are two main reasons to access tg->load_avg. One is driven by
> PELT decay, which happens at a fixed frequency and is not a concern; the
> other is enqueue_entity()/dequeue_entity() triggered by task migration.
> The number of migrations can be unbounded, so the accesses to
> tg->load_avg from this path can be huge. This frequent task migration is
> the problem for tg->load_avg. One thing I noticed is: on task migration,
> the load is carried from the old per-cpu cfs_rq to the new per-cpu
> cfs_rq. While the cfs_rq's load_avg and tg_load_avg_contrib should
> change accordingly to reflect this, so that its corresponding sched
> entity can get a correct weight, the task group's load_avg should stay
> unchanged. So instead of removing a delta from tg->load_avg on behalf of
> the src cfs_rq and then adding the same delta back on behalf of the
> target cfs_rq, the two updates to tg->load_avg could be avoided
> entirely. With this change, updates to tg->load_avg would be greatly
> reduced, the problem should be solved, and it is likely to be a win for
> most machines/workloads. Not sure if I understand this correctly? I'm
> going to pursue a solution based on this; feel free to let me know if
> you see anything wrong here, thanks.
Sounds good, but maybe I understand it incorrectly: if the task has been
dequeued for a long time and not enqueued yet, since we do not update
tg->load_avg, will it be out of date? Or do you mean the task migration
happens in a frequent sleep-wakeup sequence?
thanks,
Chenyu
On Tue, May 16, 2023 at 04:57:52PM +0800, Chen Yu wrote:
> On 2023-05-16 at 15:50:11 +0800, Aaron Lu wrote:
> > On Thu, May 04, 2023 at 06:27:46PM +0800, Aaron Lu wrote:
> > > Based on my current understanding, the summary is:
> > > - Running this workload with nr_thread=224 on SPR, the ingress queue
> > > will overflow and that will slow things down. This patch helps
> > > performance mainly because it transforms the "many cpus accessing the
> > > same cacheline" scenario into "many cpus accessing two cachelines",
> > > which can reduce the likelihood of ingress queue overflow and thus
> > > helps performance;
> > > - On Icelake with nr_threads high, but not so high that it causes 100%
> > > cpu utilization, the two functions' cost will drop a little but
> > > performance did not improve (it actually regressed a little);
> > > - On SPR when there is no ingress queue overflow, it's similar to
> > > Icelake: the two functions' cost will drop but performance did not
> > > improve.
> >
> > More results when running hackbench and netperf on Sapphire Rapids as
> > well as on 2 sockets Icelake and 2 sockets Cascade Lake.
> >
> > The summary is:
> > - on SPR, hackbench time reduced ~8% and netperf(UDP_RR/nr_thread=100%)
> > performance increased ~50%;
> > - on Icelake, performance regressed about 1%-2% for postgres_sysbench
> > and hackbench, netperf has no performance change;
> > - on Cascade Lake, netperf/UDP_RR/nr_thread=50% sees performance drop
> > ~3%; others have no performance change.
> >
> > Together with results kindly collected by Daniel, it looks like this
> > patch helps most on SPR, while on other machines it is either flat or
> > regresses 1%-3% for some workloads. With these results, I'm thinking of
> > an alternative solution to reduce the cost of accessing tg->load_avg.
> >
> > There are two main reasons to access tg->load_avg. One is driven by
> > PELT decay, which happens at a fixed frequency and is not a concern; the
> > other is enqueue_entity()/dequeue_entity() triggered by task migration.
> > The number of migrations can be unbounded, so the accesses to
> > tg->load_avg from this path can be huge. This frequent task migration is
> > the problem for tg->load_avg. One thing I noticed is: on task migration,
> > the load is carried from the old per-cpu cfs_rq to the new per-cpu
> > cfs_rq. While the cfs_rq's load_avg and tg_load_avg_contrib should
> > change accordingly to reflect this, so that its corresponding sched
> > entity can get a correct weight, the task group's load_avg should stay
> > unchanged. So instead of removing a delta from tg->load_avg on behalf of
> > the src cfs_rq and then adding the same delta back on behalf of the
> > target cfs_rq, the two updates to tg->load_avg could be avoided
> > entirely. With this change, updates to tg->load_avg would be greatly
> > reduced, the problem should be solved, and it is likely to be a win for
> > most machines/workloads. Not sure if I understand this correctly? I'm
> > going to pursue a solution based on this; feel free to let me know if
> > you see anything wrong here, thanks.
> Sounds good, but maybe I understand it incorrectly: if the task has been
> dequeued for a long time and not enqueued yet, since we do not update
> tg->load_avg, will it be out of date? Or do you mean the task migration
> happens in a frequent sleep-wakeup sequence?
When a task is dequeued because it's blocked, its load will not be
subtracted from its cfs_rq. That part of the load on the cfs_rq will
decay and tg->load_avg will be updated when needed. Because decay
happens at a fixed frequency, that's not a concern.
When the task is finally woken and assigned a new cpu, its load has to
be removed from its original cfs_rq and added to its new cfs_rq, and
that may trigger two updates to tg->load_avg, depending on how large the
task's load is and the cfs_rq's current load contribution to the tg,
etc. That is where I'm looking for some optimization: the migration
affects the corresponding cfs_rqs' load_avg but it shouldn't affect
tg->load_avg, so there is no need to subtract the task's load_avg from
tg->load_avg on behalf of the original cfs_rq and then add it back on
behalf of the new cfs_rq. But I suppose there are some details to sort
out.
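For reference, the two updates are conditional because of the ratio
check in update_tg_load_avg(); on an unpatched v6.2-era kernel it
roughly has the following shape (reproduced from memory and simplified,
so treat the details as approximate):

/* Approximate shape of update_tg_load_avg() on an unpatched kernel. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
        long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;

        /* root_task_group's load_avg is not used. */
        if (cfs_rq->tg == &root_task_group)
                return;

        /* Only publish when the change is big enough vs. the last contrib. */
        if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }
}

So whether the detach on the old cpu and the attach on the new cpu
actually write tg->load_avg depends on how big the carried load is
relative to each cfs_rq's tg_load_avg_contrib.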
Thanks,
Aaron