The imbalance limitations are applied inconsistently at fork time
and at runtime. At fork, a new task can remain local until there are
too many running tasks, even if the degree of imbalance is larger than
NUMA_IMBALANCE_MIN, which differs from the runtime behaviour. Secondly,
the imbalance figure used during load balancing is different to the one
used at NUMA placement: load balancing uses the number of tasks that
must move to restore balance, whereas NUMA balancing uses the total
imbalance. In combination, it is possible for a parallel workload that
uses a small number of CPUs, without applying scheduler policies, to
have highly variable run-to-run performance.
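
As an aside, a minimal userspace sketch (not part of the patch, with
invented figures) of the policy this change unifies: the helper mirrors
adjust_numa_imbalance() from the diff below, and the final right shift
models calculate_imbalance() converting the adjusted figure into the
number of tasks to move during load balancing.

#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2

/* Same policy as the kernel helper: tolerate a small imbalance while
 * the destination is lightly loaded, otherwise report it in full. */
static long adjust_numa_imbalance(int imbalance, int dst_running,
				  int imb_numa_nr)
{
	if (dst_running > imb_numa_nr)
		return imbalance;

	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	/* Invented figures: 4 idle-CPU difference, 3 tasks running on the
	 * destination plus the one being placed, domain threshold of 16. */
	long imbalance = adjust_numa_imbalance(4, 3 + 1, 16);

	/* Load balancing then moves half of whatever imbalance remains. */
	printf("tasks to move: %ld\n", imbalance >> 1);
	return 0;
}
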
[[email protected]: Fix build breakage for arc-allyesconfig]
Signed-off-by: Mel Gorman <[email protected]>
---
kernel/sched/fair.c | 81 +++++++++++++++++++++++++--------------------
1 file changed, 45 insertions(+), 36 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03b1ad79d47d..0b3646be88b3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1043,6 +1043,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1536,8 +1563,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -9098,16 +9123,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
return true;
}
-/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
- return running <= imb_numa_nr;
-}
-
/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
@@ -9224,6 +9239,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
@@ -9237,7 +9253,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
@@ -9245,9 +9261,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* allowed. If there is a real need of migration,
* periodic load balance will take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ sd->imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -9334,24 +9355,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
}
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr)
-{
- if (!allow_numa_imbalance(dst_running, imb_numa_nr))
- return imbalance;
-
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
-
- return imbalance;
-}
-
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
@@ -9436,7 +9439,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9444,15 +9447,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- local->sum_nr_running + 1, env->sd->imb_numa_nr);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
--
2.34.1
Greetings,
FYI, we noticed a -2.9% regression of unixbench.score due to commit:
commit: 5278ba412faff8402e318ad20ab762cc9ba7a801 ("[PATCH 3/4] sched/numa: Apply imbalance limitations consistently")
url: https://github.com/intel-lab-lkp/linux/commits/Mel-Gorman/Mitigate-inconsistent-NUMA-imbalance-behaviour/20220520-183837
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git 991d8d8142cad94f9c5c05db25e67fa83d6f772a
patch link: https://lore.kernel.org/lkml/[email protected]
in testcase: unixbench
on test machine: 128 threads 2 sockets Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz with 256G memory
with following parameters:
runtime: 300s
nr_task: 1
test: shell1
cpufreq_governor: performance
ucode: 0xd000331
test-description: UnixBench is the original BYTE UNIX benchmark suite, which aims to test the performance of Unix-like systems.
test-url: https://github.com/kdlucas/byte-unixbench
In addition, the commit also has a significant impact on the following tests:
+------------------+---------------------------------------------------------------------------------+
| testcase: change | unixbench: unixbench.score -11.1% regression |
| test machine | 128 threads 2 sockets Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz with 256G memory |
| test parameters | cpufreq_governor=performance |
| | nr_task=1 |
| | runtime=300s |
| | test=shell8 |
| | ucode=0xd000331 |
+------------------+---------------------------------------------------------------------------------+
If you fix the issue, kindly add the following tag:
Reported-by: kernel test robot <[email protected]>
Details are as below:
-------------------------------------------------------------------------------------------------->
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if you come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
=========================================================================================
compiler/cpufreq_governor/kconfig/nr_task/rootfs/runtime/tbox_group/test/testcase/ucode:
gcc-11/performance/x86_64-rhel-8.3/1/debian-10.4-x86_64-20200603.cgz/300s/lkp-icl-2sp2/shell1/unixbench/0xd000331
commit:
626db23ac9 ("sched/numa: Do not swap tasks between nodes when spare capacity is available")
5278ba412f ("sched/numa: Apply imbalance limitations consistently")
626db23ac968c13b 5278ba412faff8402e318ad20ab
---------------- ---------------------------
%stddev %change %stddev
\ | \
2705 -2.9% 2625 unixbench.score
71105 ? 5% -13.4% 61547 ? 4% unixbench.time.involuntary_context_switches
287.33 ? 10% +312.2% 1184 ? 6% unixbench.time.major_page_faults
1.232e+08 -2.9% 1.196e+08 unixbench.time.minor_page_faults
143.00 +1.4% 145.00 unixbench.time.percent_of_cpu_this_job_got
383.13 +7.7% 412.62 unixbench.time.system_time
521.90 -3.4% 504.36 unixbench.time.user_time
3400459 -3.6% 3278744 unixbench.time.voluntary_context_switches
7226950 -2.9% 7015844 unixbench.workload
30.49 +1.5% 30.96 turbostat.RAMWatt
16050 -6.6% 14986 vmstat.system.cs
1293860 ? 6% +4158.8% 55103032 ? 60% numa-numastat.node0.local_node
1344815 ? 5% +3999.6% 55131871 ? 60% numa-numastat.node0.numa_hit
89313120 -63.9% 32257305 ?103% numa-numastat.node1.local_node
89365596 -63.8% 32337261 ?102% numa-numastat.node1.numa_hit
66387 +30.1% 86365 ? 6% meminfo.Active
66201 +30.2% 86181 ? 7% meminfo.Active(anon)
12721152 ? 10% -27.2% 9261056 ? 7% meminfo.DirectMap2M
48790 +10.7% 53987 ? 2% meminfo.Mapped
92524 +25.1% 115764 ? 5% meminfo.Shmem
18726 ? 8% -15.7% 15785 ? 8% sched_debug.cfs_rq:/.min_vruntime.stddev
87.63 ? 5% -11.0% 77.99 ? 4% sched_debug.cfs_rq:/.runnable_avg.avg
618.08 ? 2% +21.9% 753.36 ? 11% sched_debug.cfs_rq:/.runnable_avg.max
10444 ? 28% -376.2% -28845 sched_debug.cfs_rq:/.spread0.avg
60208 ? 4% -77.6% 13496 ? 74% sched_debug.cfs_rq:/.spread0.max
-8539 +479.7% -49504 sched_debug.cfs_rq:/.spread0.min
18727 ? 8% -15.7% 15785 ? 8% sched_debug.cfs_rq:/.spread0.stddev
87.61 ? 5% -11.0% 77.96 ? 4% sched_debug.cfs_rq:/.util_avg.avg
617.98 ? 2% +21.9% 753.33 ? 11% sched_debug.cfs_rq:/.util_avg.max
0.00 ? 12% -24.5% 0.00 ? 23% sched_debug.cpu.next_balance.stddev
161071 ? 4% -22.7% 124508 ? 10% sched_debug.cpu.nr_switches.max
44072 ? 6% -25.6% 32790 ? 3% sched_debug.cpu.nr_switches.stddev
184614 ? 6% -58.9% 75886 ? 28% numa-meminfo.node0.AnonHugePages
227436 ? 5% -50.8% 111893 ? 18% numa-meminfo.node0.AnonPages
250375 ? 6% -39.2% 152234 ? 18% numa-meminfo.node0.AnonPages.max
239538 ? 4% -50.1% 119562 ? 18% numa-meminfo.node0.Inactive
239385 ? 4% -50.1% 119488 ? 18% numa-meminfo.node0.Inactive(anon)
17347 ? 20% -33.1% 11605 ? 35% numa-meminfo.node0.Shmem
61415 ? 2% +35.5% 83195 ? 9% numa-meminfo.node1.Active
61415 ? 2% +35.3% 83107 ? 9% numa-meminfo.node1.Active(anon)
15111 ? 77% +724.0% 124518 ? 17% numa-meminfo.node1.AnonHugePages
48170 ? 26% +252.5% 169822 ? 12% numa-meminfo.node1.AnonPages
977352 +19.6% 1168644 ? 5% numa-meminfo.node1.AnonPages.max
62321 ? 14% +205.7% 190528 ? 11% numa-meminfo.node1.Inactive
62321 ? 14% +205.6% 190446 ? 11% numa-meminfo.node1.Inactive(anon)
75390 ? 4% +38.5% 104414 ? 8% numa-meminfo.node1.Shmem
56856 ? 5% -50.8% 27966 ? 18% numa-vmstat.node0.nr_anon_pages
59844 ? 4% -50.1% 29863 ? 18% numa-vmstat.node0.nr_inactive_anon
4336 ? 20% -33.3% 2894 ? 35% numa-vmstat.node0.nr_shmem
59844 ? 4% -50.1% 29862 ? 18% numa-vmstat.node0.nr_zone_inactive_anon
1344749 ? 5% +3999.7% 55131256 ? 60% numa-vmstat.node0.numa_hit
1293794 ? 6% +4159.0% 55102416 ? 60% numa-vmstat.node0.numa_local
15374 ? 2% +35.3% 20797 ? 9% numa-vmstat.node1.nr_active_anon
11720 ? 27% +258.8% 42056 ? 12% numa-vmstat.node1.nr_anon_pages
15192 ? 14% +210.3% 47137 ? 11% numa-vmstat.node1.nr_inactive_anon
18800 ? 4% +38.6% 26056 ? 8% numa-vmstat.node1.nr_shmem
15374 ? 2% +35.3% 20797 ? 9% numa-vmstat.node1.nr_zone_active_anon
15192 ? 14% +210.3% 47137 ? 11% numa-vmstat.node1.nr_zone_inactive_anon
89364368 -63.8% 32336320 ?102% numa-vmstat.node1.numa_hit
89311892 -63.9% 32256364 ?103% numa-vmstat.node1.numa_local
16551 +30.1% 21527 ? 7% proc-vmstat.nr_active_anon
68924 +2.2% 70416 proc-vmstat.nr_anon_pages
75477 +2.7% 77526 proc-vmstat.nr_inactive_anon
12470 +10.3% 13749 ? 2% proc-vmstat.nr_mapped
23215 +25.1% 29034 ? 5% proc-vmstat.nr_shmem
16551 +30.1% 21527 ? 7% proc-vmstat.nr_zone_active_anon
75477 +2.7% 77526 proc-vmstat.nr_zone_inactive_anon
242786 +41.2% 342858 ? 16% proc-vmstat.numa_hint_faults
241277 +41.0% 340196 ? 16% proc-vmstat.numa_hint_faults_local
90713416 -3.6% 87455312 proc-vmstat.numa_hit
90609984 -3.6% 87346517 proc-vmstat.numa_local
3339 ? 58% +412.8% 17123 ? 58% proc-vmstat.numa_pages_migrated
343034 ? 3% +49.8% 513708 ? 14% proc-vmstat.numa_pte_updates
86449 +86.6% 161358 ? 2% proc-vmstat.pgactivate
90707968 -3.6% 87448830 proc-vmstat.pgalloc_normal
1.257e+08 -2.7% 1.223e+08 proc-vmstat.pgfault
90721230 -3.6% 87454560 proc-vmstat.pgfree
3339 ? 58% +412.8% 17123 ? 58% proc-vmstat.pgmigrate_success
6816403 -2.2% 6668991 proc-vmstat.pgreuse
3800 -3.2% 3677 proc-vmstat.thp_fault_alloc
1597074 -2.9% 1550242 proc-vmstat.unevictable_pgs_culled
0.84 ? 7% -0.2 0.69 ? 10% perf-profile.calltrace.cycles-pp.ret_from_fork
0.81 ? 7% -0.1 0.66 ? 10% perf-profile.calltrace.cycles-pp.kthread.ret_from_fork
0.54 ? 7% +0.1 0.66 ? 12% perf-profile.calltrace.cycles-pp.next_uptodate_page.filemap_map_pages.do_read_fault.do_fault.__handle_mm_fault
0.79 ? 7% +0.1 0.92 ? 7% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.__open64_nocancel.setlocale
0.78 ? 7% +0.1 0.91 ? 7% perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.__open64_nocancel.setlocale
0.76 ? 8% +0.1 0.88 ? 7% perf-profile.calltrace.cycles-pp.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe.__open64_nocancel
0.76 ? 8% +0.1 0.90 ? 8% perf-profile.calltrace.cycles-pp.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe.__open64_nocancel.setlocale
0.83 ? 7% +0.1 0.96 ? 7% perf-profile.calltrace.cycles-pp.__open64_nocancel.setlocale
0.81 ? 7% -0.1 0.66 ? 10% perf-profile.children.cycles-pp.kthread
1.10 ? 6% -0.1 0.97 ? 8% perf-profile.children.cycles-pp.ret_from_fork
0.31 ? 10% -0.1 0.25 ? 7% perf-profile.children.cycles-pp.newidle_balance
0.04 ? 45% +0.0 0.08 ? 20% perf-profile.children.cycles-pp.folio_add_lru
0.08 ? 20% +0.0 0.12 ? 12% perf-profile.children.cycles-pp.touch_atime
0.05 ? 47% +0.0 0.08 ? 23% perf-profile.children.cycles-pp.apparmor_file_free_security
0.04 ? 71% +0.0 0.07 ? 11% perf-profile.children.cycles-pp.perf_output_copy
0.29 ? 3% +0.0 0.33 ? 8% perf-profile.children.cycles-pp.page_counter_charge
0.17 ? 9% +0.0 0.21 ? 9% perf-profile.children.cycles-pp.__anon_vma_prepare
0.14 ? 8% +0.0 0.18 ? 16% perf-profile.children.cycles-pp.apparmor_file_alloc_security
0.46 ? 4% +0.0 0.51 ? 7% perf-profile.children.cycles-pp.vfs_read
0.03 ?100% +0.1 0.08 ? 19% perf-profile.children.cycles-pp.propagate_protected_usage
0.20 ? 15% +0.1 0.25 ? 4% perf-profile.children.cycles-pp.copy_string_kernel
0.00 +0.1 0.06 ? 13% perf-profile.children.cycles-pp.__mark_inode_dirty
0.27 ? 5% +0.1 0.33 ? 6% perf-profile.children.cycles-pp.vma_interval_tree_remove
0.20 ? 6% +0.1 0.26 ? 5% perf-profile.children.cycles-pp.up_write
0.00 +0.1 0.06 ? 6% perf-profile.children.cycles-pp.rmqueue_bulk
0.36 ? 7% +0.1 0.43 ? 8% perf-profile.children.cycles-pp.get_page_from_freelist
0.34 ? 7% +0.1 0.41 ? 3% perf-profile.children.cycles-pp.unlink_file_vma
0.31 ? 8% +0.1 0.40 ? 10% perf-profile.children.cycles-pp.__slab_free
0.34 ? 10% +0.1 0.44 ? 7% perf-profile.children.cycles-pp.ksys_write
0.72 ? 3% +0.1 0.82 ? 5% perf-profile.children.cycles-pp.__split_vma
0.33 ? 10% +0.1 0.44 ? 8% perf-profile.children.cycles-pp.vfs_write
0.32 ? 10% +0.1 0.42 ? 9% perf-profile.children.cycles-pp.new_sync_write
1.05 ? 4% +0.2 1.29 ? 8% perf-profile.children.cycles-pp.next_uptodate_page
1.41 ? 4% -0.3 1.10 ? 23% perf-profile.self.cycles-pp.menu_select
0.09 ? 10% -0.0 0.06 ? 11% perf-profile.self.cycles-pp.vm_normal_page
0.02 ?141% +0.1 0.07 ? 21% perf-profile.self.cycles-pp.propagate_protected_usage
0.26 ? 5% +0.1 0.32 ? 6% perf-profile.self.cycles-pp.vma_interval_tree_remove
0.20 ? 8% +0.1 0.26 ? 3% perf-profile.self.cycles-pp.up_write
0.52 ? 6% +0.1 0.59 ? 8% perf-profile.self.cycles-pp.vma_interval_tree_insert
0.30 ? 10% +0.1 0.38 ? 10% perf-profile.self.cycles-pp.__slab_free
1.00 ? 4% +0.2 1.24 ? 8% perf-profile.self.cycles-pp.next_uptodate_page
9.50 +3.3% 9.82 perf-stat.i.MPKI
1.467e+09 -3.1% 1.421e+09 perf-stat.i.branch-instructions
25259322 -2.8% 24564546 perf-stat.i.branch-misses
4.64 +3.8 8.40 perf-stat.i.cache-miss-rate%
3128761 +84.4% 5768597 perf-stat.i.cache-misses
16064 -6.6% 15000 perf-stat.i.context-switches
1.28 ? 2% +4.9% 1.35 ? 2% perf-stat.i.cpi
147.24 +456.0% 818.73 perf-stat.i.cpu-migrations
3222 ? 3% -37.4% 2017 ? 2% perf-stat.i.cycles-between-cache-misses
0.03 +0.0 0.04 perf-stat.i.dTLB-load-miss-rate%
646123 ? 2% +10.8% 716013 perf-stat.i.dTLB-load-misses
1.866e+09 -2.7% 1.816e+09 perf-stat.i.dTLB-loads
988430 -1.7% 971311 perf-stat.i.dTLB-store-misses
1.078e+09 -2.4% 1.052e+09 perf-stat.i.dTLB-stores
7.095e+09 -3.1% 6.873e+09 perf-stat.i.instructions
0.89 ? 6% +162.7% 2.35 ? 5% perf-stat.i.major-faults
34.46 -2.8% 33.52 perf-stat.i.metric.M/sec
195370 -2.7% 190058 perf-stat.i.minor-faults
85.14 +3.7 88.89 perf-stat.i.node-load-miss-rate%
504104 ? 2% +121.9% 1118698 perf-stat.i.node-load-misses
104059 ? 3% +44.8% 150652 ? 4% perf-stat.i.node-loads
5.36 ? 18% +19.3 24.63 ? 4% perf-stat.i.node-store-miss-rate%
45098 ? 16% +733.8% 376029 ? 4% perf-stat.i.node-store-misses
937285 +26.6% 1186448 ? 2% perf-stat.i.node-stores
195371 -2.7% 190061 perf-stat.i.page-faults
9.64 +2.9% 9.91 perf-stat.overall.MPKI
4.59 +3.9 8.47 perf-stat.overall.cache-miss-rate%
1.20 ? 2% +4.6% 1.25 ? 2% perf-stat.overall.cpi
2709 ? 2% -44.9% 1493 ? 2% perf-stat.overall.cycles-between-cache-misses
0.03 ? 2% +0.0 0.04 perf-stat.overall.dTLB-load-miss-rate%
82.64 +5.4 88.05 perf-stat.overall.node-load-miss-rate%
4.60 ? 14% +19.4 24.05 ? 4% perf-stat.overall.node-store-miss-rate%
1.465e+09 -3.1% 1.42e+09 perf-stat.ps.branch-instructions
25223900 -2.7% 24532257 perf-stat.ps.branch-misses
3133964 +84.0% 5765238 perf-stat.ps.cache-misses
16037 -6.6% 14976 perf-stat.ps.context-switches
146.96 +455.6% 816.47 perf-stat.ps.cpu-migrations
645888 ? 2% +10.8% 715648 perf-stat.ps.dTLB-load-misses
1.864e+09 -2.7% 1.814e+09 perf-stat.ps.dTLB-loads
987054 -1.7% 970089 perf-stat.ps.dTLB-store-misses
1.077e+09 -2.4% 1.051e+09 perf-stat.ps.dTLB-stores
7.086e+09 -3.1% 6.866e+09 perf-stat.ps.instructions
0.89 ? 6% +162.4% 2.34 ? 5% perf-stat.ps.major-faults
195074 -2.7% 189793 perf-stat.ps.minor-faults
503993 +121.6% 1117027 perf-stat.ps.node-load-misses
105826 ? 3% +43.3% 151624 ? 4% perf-stat.ps.node-loads
45328 ? 16% +728.3% 375435 ? 4% perf-stat.ps.node-store-misses
937188 +26.5% 1185959 ? 2% perf-stat.ps.node-stores
195075 -2.7% 189796 perf-stat.ps.page-faults
4.472e+12 -3.1% 4.335e+12 perf-stat.total.instructions
***************************************************************************************************
lkp-icl-2sp2: 128 threads 2 sockets Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz with 256G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/nr_task/rootfs/runtime/tbox_group/test/testcase/ucode:
gcc-11/performance/x86_64-rhel-8.3/1/debian-10.4-x86_64-20200603.cgz/300s/lkp-icl-2sp2/shell8/unixbench/0xd000331
commit:
626db23ac9 ("sched/numa: Do not swap tasks between nodes when spare capacity is available")
5278ba412f ("sched/numa: Apply imbalance limitations consistently")
626db23ac968c13b 5278ba412faff8402e318ad20ab
---------------- ---------------------------
%stddev %change %stddev
\ | \
9459 -11.1% 8410 unixbench.score
49295 ? 2% -13.0% 42902 unixbench.time.involuntary_context_switches
1616 ? 3% +147.0% 3993 ? 4% unixbench.time.major_page_faults
45966048 -11.0% 40896241 unixbench.time.minor_page_faults
173.46 +18.9% 206.17 unixbench.time.system_time
190.29 -16.7% 158.42 unixbench.time.user_time
1307674 -12.3% 1146302 unixbench.time.voluntary_context_switches
358509 -11.3% 317922 unixbench.workload
31.04 +6.0% 32.92 turbostat.RAMWatt
53649 -10.2% 48196 vmstat.system.cs
0.14 ? 3% +0.0 0.16 mpstat.cpu.all.soft%
1.63 -0.2 1.45 mpstat.cpu.all.usr%
0.34 ?223% +2.3 2.67 ? 83% perf-profile.calltrace.cycles-pp.do_execveat_common.__x64_sys_execve.do_syscall_64.entry_SYSCALL_64_after_hwframe
0.34 ?223% +2.4 2.71 ? 81% perf-profile.calltrace.cycles-pp.__x64_sys_execve.do_syscall_64.entry_SYSCALL_64_after_hwframe
12953258 ? 7% -29.1% 9177429 ? 7% meminfo.DirectMap2M
22474 +13.6% 25533 ? 2% meminfo.KernelStack
7838 +156.0% 20065 ? 20% meminfo.PageTables
62596 ? 16% -39.1% 38109 ? 36% numa-vmstat.node0.nr_inactive_anon
12227 ? 6% +11.5% 13631 ? 4% numa-vmstat.node0.nr_kernel_stack
1163 ? 18% +134.4% 2726 ? 18% numa-vmstat.node0.nr_page_table_pages
62596 ? 16% -39.1% 38109 ? 36% numa-vmstat.node0.nr_zone_inactive_anon
14241 ? 67% +184.6% 40525 ? 36% numa-vmstat.node1.nr_anon_pages
14899 ? 69% +182.7% 42126 ? 33% numa-vmstat.node1.nr_inactive_anon
10252 ? 7% +16.7% 11959 ? 6% numa-vmstat.node1.nr_kernel_stack
809.17 ? 25% +202.7% 2449 ? 21% numa-vmstat.node1.nr_page_table_pages
14899 ? 69% +182.7% 42126 ? 33% numa-vmstat.node1.nr_zone_inactive_anon
244098 ? 15% -39.8% 146934 ? 40% numa-meminfo.node0.AnonPages.max
250449 ? 16% -39.3% 152036 ? 37% numa-meminfo.node0.Inactive
250301 ? 16% -39.3% 151965 ? 37% numa-meminfo.node0.Inactive(anon)
4630 ? 18% +127.3% 10522 ? 23% numa-meminfo.node0.PageTables
56826 ? 67% +184.7% 161757 ? 35% numa-meminfo.node1.AnonPages
64784 ? 57% +166.9% 172884 ? 33% numa-meminfo.node1.AnonPages.max
59459 ? 70% +183.1% 168330 ? 33% numa-meminfo.node1.Inactive
59459 ? 70% +182.8% 168177 ? 33% numa-meminfo.node1.Inactive(anon)
10234 ? 7% +16.8% 11949 ? 6% numa-meminfo.node1.KernelStack
3189 ? 24% +203.1% 9665 ? 22% numa-meminfo.node1.PageTables
73176 +3.2% 75515 proc-vmstat.nr_anon_pages
77439 +3.4% 80074 proc-vmstat.nr_inactive_anon
22484 +13.6% 25550 ? 2% proc-vmstat.nr_kernel_stack
1964 +159.7% 5101 ? 19% proc-vmstat.nr_page_table_pages
77439 +3.4% 80074 proc-vmstat.nr_zone_inactive_anon
33321568 -11.1% 29636150 proc-vmstat.numa_hit
33204884 -11.1% 29522839 proc-vmstat.numa_local
2826 ? 7% +450.7% 15565 proc-vmstat.pgactivate
33314916 -11.1% 29632022 proc-vmstat.pgalloc_normal
46272301 -10.9% 41208227 proc-vmstat.pgfault
33127507 -11.1% 29444660 proc-vmstat.pgfree
2594250 -11.2% 2304883 proc-vmstat.pgreuse
1499 -10.6% 1340 proc-vmstat.thp_fault_alloc
635894 -11.1% 565409 proc-vmstat.unevictable_pgs_culled
10.69 +4.3% 11.15 perf-stat.i.MPKI
4.972e+09 -10.5% 4.448e+09 perf-stat.i.branch-instructions
1.78 +0.0 1.81 perf-stat.i.branch-miss-rate%
87366205 -9.0% 79542506 perf-stat.i.branch-misses
4.39 +10.2 14.62 perf-stat.i.cache-miss-rate%
10832039 +238.1% 36622848 perf-stat.i.cache-misses
2.643e+08 -6.7% 2.466e+08 perf-stat.i.cache-references
55032 -10.2% 49392 perf-stat.i.context-switches
0.84 +11.6% 0.94 perf-stat.i.cpi
1083 ? 4% +90.1% 2058 perf-stat.i.cpu-migrations
1976 -61.1% 769.37 ? 3% perf-stat.i.cycles-between-cache-misses
0.04 ? 4% +0.0 0.04 perf-stat.i.dTLB-load-miss-rate%
6.279e+09 -10.2% 5.638e+09 perf-stat.i.dTLB-loads
3629603 -10.3% 3257478 perf-stat.i.dTLB-store-misses
3.662e+09 -10.2% 3.289e+09 perf-stat.i.dTLB-stores
2.408e+10 -10.5% 2.154e+10 perf-stat.i.instructions
1.19 -10.6% 1.06 perf-stat.i.ipc
25.60 ? 3% +145.3% 62.80 ? 4% perf-stat.i.major-faults
102.69 +76.3% 181.04 perf-stat.i.metric.K/sec
118.55 -10.3% 106.39 perf-stat.i.metric.M/sec
710240 -11.0% 632294 perf-stat.i.minor-faults
67.54 +24.9 92.47 perf-stat.i.node-load-miss-rate%
823765 ? 2% +788.1% 7315469 perf-stat.i.node-load-misses
399301 ? 3% +31.5% 525162 perf-stat.i.node-loads
5.32 ? 6% +34.6 39.92 perf-stat.i.node-store-miss-rate%
193239 ? 6% +1687.7% 3454580 perf-stat.i.node-store-misses
4318690 +18.6% 5123428 perf-stat.i.node-stores
710266 -11.0% 632357 perf-stat.i.page-faults
10.98 +4.2% 11.44 perf-stat.overall.MPKI
1.76 +0.0 1.79 perf-stat.overall.branch-miss-rate%
4.10 +10.8 14.85 perf-stat.overall.cache-miss-rate%
0.83 +12.2% 0.94 perf-stat.overall.cpi
1853 -70.3% 550.23 perf-stat.overall.cycles-between-cache-misses
0.04 ? 4% +0.0 0.05 perf-stat.overall.dTLB-load-miss-rate%
1.20 -10.9% 1.07 perf-stat.overall.ipc
67.35 +25.9 93.30 perf-stat.overall.node-load-miss-rate%
4.28 ? 6% +36.0 40.27 perf-stat.overall.node-store-miss-rate%
4.894e+09 -10.5% 4.379e+09 perf-stat.ps.branch-instructions
86000604 -9.0% 78300554 perf-stat.ps.branch-misses
10662516 +238.1% 36050907 perf-stat.ps.cache-misses
2.602e+08 -6.7% 2.427e+08 perf-stat.ps.cache-references
54171 -10.2% 48620 perf-stat.ps.context-switches
1066 ? 4% +90.1% 2026 perf-stat.ps.cpu-migrations
6.181e+09 -10.2% 5.55e+09 perf-stat.ps.dTLB-loads
3572776 -10.2% 3206610 perf-stat.ps.dTLB-store-misses
3.604e+09 -10.2% 3.237e+09 perf-stat.ps.dTLB-stores
2.37e+10 -10.5% 2.121e+10 perf-stat.ps.instructions
25.20 ? 3% +145.3% 61.82 ? 4% perf-stat.ps.major-faults
699120 -11.0% 622421 perf-stat.ps.minor-faults
810873 ? 2% +788.1% 7201233 perf-stat.ps.node-load-misses
393054 ? 3% +31.5% 516951 perf-stat.ps.node-loads
190213 ? 6% +1687.8% 3400631 perf-stat.ps.node-store-misses
4251087 +18.6% 5043399 perf-stat.ps.node-stores
699145 -11.0% 622482 perf-stat.ps.page-faults
1.523e+12 -10.5% 1.364e+12 perf-stat.total.instructions
Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
--
0-DAY CI Kernel Test Service
https://01.org/lkp
The following commit has been merged into the sched/core branch of tip:
Commit-ID: cb29a5c19d2d68afc641fb1949e1a1c565b582ea
Gitweb: https://git.kernel.org/tip/cb29a5c19d2d68afc641fb1949e1a1c565b582ea
Author: Mel Gorman <[email protected]>
AuthorDate: Fri, 20 May 2022 11:35:18 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 13 Jun 2022 10:29:59 +02:00
sched/numa: Apply imbalance limitations consistently
The imbalance limitations are applied inconsistently at fork time
and at runtime. At fork, a new task can remain local until there are
too many running tasks, even if the degree of imbalance is larger than
NUMA_IMBALANCE_MIN, which differs from the runtime behaviour. Secondly,
the imbalance figure used during load balancing is different to the one
used at NUMA placement: load balancing uses the number of tasks that
must move to restore balance, whereas NUMA balancing uses the total
imbalance. In combination, it is possible for a parallel workload that
uses a small number of CPUs, without applying scheduler policies, to
have highly variable run-to-run performance.
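
For the fork/wakeup path specifically, a small standalone sketch (again
not part of the patch, with invented figures) of the decision made in
find_idlest_group(): the imbalance is the idle-CPU difference between
the local and idlest groups, and the new task stays local only when
adjust_numa_imbalance() forgives that imbalance.

#include <stdio.h>
#include <stdlib.h>

#define NUMA_IMBALANCE_MIN 2

/* Same policy as adjust_numa_imbalance() in the diff below. */
static long adjust_numa_imbalance(int imbalance, int dst_running,
				  int imb_numa_nr)
{
	if (dst_running > imb_numa_nr)
		return imbalance;

	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}

int main(void)
{
	int local_idle = 14, idlest_idle = 13;	 /* idle CPUs per group */
	int local_running = 2, imb_numa_nr = 16; /* invented figures */
	int imbalance = abs(local_idle - idlest_idle);

	/* The "+ 1" accounts for the task about to be placed. */
	if (!adjust_numa_imbalance(imbalance, local_running + 1, imb_numa_nr))
		puts("keep the new task on the local node");
	else
		puts("fall through to the idlest group");

	return 0;
}
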
[[email protected]: Fix build breakage for arc-allyesconfig]
Signed-off-by: Mel Gorman <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: K Prateek Nayak <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 81 ++++++++++++++++++++++++--------------------
1 file changed, 45 insertions(+), 36 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23da36c..166f5f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1548,8 +1575,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -9068,16 +9093,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
- return running <= imb_numa_nr;
-}
-
-/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
@@ -9193,6 +9208,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
@@ -9206,7 +9222,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
@@ -9214,9 +9230,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
* allowed. If there is a real need of migration,
* periodic load balance will take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ sd->imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -9303,24 +9324,6 @@ next_group:
}
}
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr)
-{
- if (!allow_numa_imbalance(dst_running, imb_numa_nr))
- return imbalance;
-
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
-
- return imbalance;
-}
-
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
@@ -9405,7 +9408,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9413,15 +9416,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- local->sum_nr_running + 1, env->sd->imb_numa_nr);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}