2020-01-15 04:16:32

by Alex Kogan

Subject: [PATCH v9 5/5] locking/qspinlock: Introduce the shuffle reduction optimization into CNA

This performance optimization reduces the probability that threads will be
shuffled between the main and secondary queues when the secondary queue
is empty. It is helpful when the lock is only lightly contended.

Signed-off-by: Alex Kogan <[email protected]>
Reviewed-by: Steve Sistare <[email protected]>
Reviewed-by: Waiman Long <[email protected]>
---
kernel/locking/qspinlock_cna.h | 46 ++++++++++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index a2b65f87e6f8..f0b0c15dcf9d 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -4,6 +4,7 @@
#endif

#include <linux/topology.h>
+#include <linux/random.h>

/*
* Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
@@ -57,6 +58,7 @@ struct cna_node {
enum {
LOCAL_WAITER_FOUND = 2, /* 0 and 1 are reserved for @locked */
FLUSH_SECONDARY_QUEUE = 3,
+ PASS_LOCK_IMMEDIATELY = 4,
MIN_ENCODED_TAIL
};

@@ -70,6 +72,34 @@ enum {
*/
unsigned int intra_node_handoff_threshold __ro_after_init = 1 << 16;

+/*
+ * Controls the probability of enabling the scan of the main queue when
+ * the secondary queue is empty. The chosen value reduces the amount of
+ * unnecessary shuffling of threads between the two waiting queues when
+ * contention is low, while still responding quickly and enabling
+ * shuffling when contention is high.
+ */
+#define SHUFFLE_REDUCTION_PROB_ARG (7)
+
+/* Per-CPU pseudo-random number seed */
+static DEFINE_PER_CPU(u32, seed);
+
+/*
+ * Return false with probability 1 / 2^@num_bits.
+ * Intuitively, the larger @num_bits is, the less likely false is to be returned.
+ * @num_bits must be a number between 0 and 31.
+ */
+static bool probably(unsigned int num_bits)
+{
+ u32 s;
+
+ s = this_cpu_read(seed);
+ s = next_pseudo_random32(s);
+ this_cpu_write(seed, s);
+
+ return s & ((1 << num_bits) - 1);
+}
+
static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
@@ -250,8 +280,11 @@ __always_inline u32 cna_pre_scan(struct qspinlock *lock,
struct cna_node *cn = (struct cna_node *)node;

cn->pre_scan_result =
- cn->intra_count == intra_node_handoff_threshold ?
- FLUSH_SECONDARY_QUEUE : cna_scan_main_queue(node, node);
+ (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) ?
+ PASS_LOCK_IMMEDIATELY :
+ cn->intra_count == intra_node_handoff_threshold ?
+ FLUSH_SECONDARY_QUEUE :
+ cna_scan_main_queue(node, node);

return 0;
}
@@ -265,6 +298,14 @@ static inline void cna_pass_lock(struct mcs_spinlock *node,

u32 scan = cn->pre_scan_result;

+ /*
+ * Performance optimization: check whether we can skip triaging
+ * through the other possible values in @scan (helps under light
+ * lock contention).
+ */
+ if (scan == PASS_LOCK_IMMEDIATELY)
+ goto pass_lock;
+
/*
* check if a successor from the same numa node has not been found in
* pre-scan, and if so, try to find it in post-scan starting from the
@@ -293,6 +334,7 @@ static inline void cna_pass_lock(struct mcs_spinlock *node,
tail_2nd->next = next;
}

+pass_lock:
arch_mcs_pass_lock(&next_holder->locked, val);
}

--
2.21.0 (Apple Git-122.2)
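
For intuition, below is a minimal user-space sketch of the filter introduced
above. It ports probably() to plain C, with an ordinary static variable
standing in for the per-CPU seed, and it assumes next_pseudo_random32() is
the Numerical Recipes LCG (seed * 1664525 + 1013904223); check
include/linux/random.h in your tree before relying on that. With
SHUFFLE_REDUCTION_PROB_ARG = 7, probably() returns true with probability
1 - 1/2^7 = 127/128, i.e. about 99.2%, so when the secondary queue is empty
(node->locked <= 1) cna_pre_scan() takes the PASS_LOCK_IMMEDIATELY fast path
almost always, and only rarely pays for a main-queue scan that can re-enable
shuffling as contention grows.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t seed = 12345;	/* stand-in for the kernel's per-CPU seed */

/* Assumed to match the kernel's next_pseudo_random32() LCG constants. */
static uint32_t next_pseudo_random32(uint32_t s)
{
	return s * 1664525u + 1013904223u;
}

/* Return false with probability 1 / 2^num_bits; num_bits must be in [0, 31]. */
static bool probably(unsigned int num_bits)
{
	seed = next_pseudo_random32(seed);
	return seed & ((1u << num_bits) - 1);
}

int main(void)
{
	enum { SHUFFLE_REDUCTION_PROB_ARG = 7 };
	const long trials = 1L << 20;
	long fast_path = 0;

	for (long i = 0; i < trials; i++) {
		/*
		 * In cna_pre_scan() this test is reached only when
		 * node->locked <= 1, i.e. the secondary queue is empty;
		 * a true result selects PASS_LOCK_IMMEDIATELY.
		 */
		if (probably(SHUFFLE_REDUCTION_PROB_ARG))
			fast_path++;
	}

	printf("fast-path rate: %.4f (expected %.4f)\n",
	       (double)fast_path / trials, 127.0 / 128.0);
	return 0;
}

Note that the low num_bits bits of a modulus-2^32 LCG cycle with period
2^num_bits, so the measured rate lands exactly on 127/128 here; the lock
slow path only needs a cheap, roughly uniform source rather than a
high-quality one, which is why a per-CPU LCG is sufficient.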


2020-03-02 01:16:58

by Chen, Rong A

Subject: [locking/qspinlock] 7b6da71157: unixbench.score 8.4% improvement

Greetings,

FYI, we noticed an 8.4% improvement of unixbench.score due to commit:


commit: 7b6da7115786ee28ad82638a5dcb2ec1ffda0e96 ("[PATCH v9 5/5] locking/qspinlock: Introduce the shuffle reduction optimization into CNA")
url: https://github.com/0day-ci/linux/commits/Alex-Kogan/Add-NUMA-awareness-to-qspinlock/20200116-161727


in testcase: unixbench
on test machine: 192 threads Intel(R) Xeon(R) Platinum 9242 CPU @ 2.30GHz with 192G memory
with following parameters:

runtime: 300s
nr_task: 30%
test: context1
cpufreq_governor: performance
ucode: 0x500002c

test-description: UnixBench is the original BYTE UNIX benchmark suite; it aims to test the performance of Unix-like systems.
test-url: https://github.com/kdlucas/byte-unixbench

In addition to that, the commit also has a significant impact on the following tests (see the will-it-scale sections further below):



Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp run job.yaml

=========================================================================================
compiler/cpufreq_governor/kconfig/nr_task/rootfs/runtime/tbox_group/test/testcase/ucode:
gcc-7/performance/x86_64-rhel-7.6/30%/debian-x86_64-20191114.cgz/300s/lkp-csl-2ap3/context1/unixbench/0x500002c

commit:
dfce1eb694 ("locking/qspinlock: Introduce starvation avoidance into CNA")
7b6da71157 ("locking/qspinlock: Introduce the shuffle reduction optimization into CNA")

dfce1eb694321530 7b6da7115786ee28ad82638a5dc
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
1:4 -25% :4 kmsg.ipmi_si_dmi-ipmi-si.#:IRQ_index#not_found
:4 25% 1:4 dmesg.WARNING:at#for_ip_swapgs_restore_regs_and_return_to_usermode/0x
:4 50% 2:4 dmesg.WARNING:stack_recursion
%stddev %change %stddev
\ | \
2659 +8.4% 2883 unixbench.score
4016 +1.9% 4092 unixbench.time.percent_of_cpu_this_job_got
15666 +1.6% 15923 unixbench.time.system_time
109.72 ± 2% +10.1% 120.86 ± 2% unixbench.time.user_time
3.053e+08 +9.3% 3.336e+08 unixbench.time.voluntary_context_switches
4.175e+08 +8.2% 4.515e+08 unixbench.workload
111361 ± 3% -7.8% 102620 softirqs.CPU89.SCHED
3085234 +9.7% 3384659 vmstat.system.cs
35.88 ± 8% -12.8% 31.28 ± 4% boot-time.boot
28.29 ± 8% -15.4% 23.92 boot-time.dhcp
5943 ± 9% -14.6% 5073 ± 4% boot-time.idle
5259 ± 78% -80.3% 1035 ± 36% numa-meminfo.node2.Inactive
5132 ± 80% -79.8% 1035 ± 36% numa-meminfo.node2.Inactive(anon)
6664 ± 66% -71.7% 1883 ± 54% numa-meminfo.node2.Shmem
1282 ± 80% -79.8% 258.50 ± 36% numa-vmstat.node2.nr_inactive_anon
1665 ± 66% -71.8% 470.50 ± 54% numa-vmstat.node2.nr_shmem
1282 ± 80% -79.8% 258.50 ± 36% numa-vmstat.node2.nr_zone_inactive_anon
2.05e+08 ± 2% +28.2% 2.628e+08 ± 2% turbostat.C1
5.52 ± 5% +2.5 7.99 turbostat.C1%
16.59 ± 41% +12.8 29.43 turbostat.C1E%
17973386 ± 22% -98.8% 211437 ± 84% turbostat.C6
15.45 ± 46% -15.2 0.26 ± 96% turbostat.C6%
8.15e+09 ± 5% +44.4% 1.177e+10 cpuidle.C1.time
4.037e+08 ± 4% +27.2% 5.134e+08 cpuidle.C1.usage
2.454e+10 ± 40% +77.0% 4.343e+10 cpuidle.C1E.time
2.308e+10 ± 46% -97.7% 5.325e+08 ± 66% cpuidle.C6.time
35252918 ± 22% -98.8% 436570 ± 71% cpuidle.C6.usage
1.829e+08 ± 10% -46.2% 98462878 cpuidle.POLL.time
14828999 ± 20% -42.2% 8563858 ± 4% cpuidle.POLL.usage
309808 ± 14% -22.3% 240665 ± 13% sched_debug.cfs_rq:/.load.max
33894 ± 12% -28.1% 24377 ± 10% sched_debug.cfs_rq:/.load.stddev
0.45 ± 7% +13.5% 0.51 ± 2% sched_debug.cfs_rq:/.nr_running.avg
0.33 ± 69% -55.3% 0.15 ± 14% sched_debug.cfs_rq:/.nr_spread_over.avg
309558 ± 14% -22.7% 239243 ± 12% sched_debug.cfs_rq:/.runnable_weight.max
33905 ± 12% -28.5% 24244 ± 10% sched_debug.cfs_rq:/.runnable_weight.stddev
419.84 ± 8% +14.8% 482.01 sched_debug.cfs_rq:/.util_est_enqueued.avg
521161 ± 3% -14.9% 443373 ± 5% sched_debug.cpu.avg_idle.avg
1410729 ± 15% -27.3% 1025358 sched_debug.cpu.avg_idle.max
740422 ± 17% -30.7% 513046 sched_debug.cpu.max_idle_balance_cost.max
35777 ± 50% -91.3% 3128 ± 62% sched_debug.cpu.max_idle_balance_cost.stddev
3760337 ± 3% +11.4% 4188170 ± 2% sched_debug.cpu.nr_switches.max
3756863 ± 3% +11.4% 4184620 ± 2% sched_debug.cpu.sched_count.max
1877767 ± 3% +11.4% 2091800 ± 2% sched_debug.cpu.sched_goidle.max
1879095 ± 3% +11.4% 2093403 ± 2% sched_debug.cpu.ttwu_count.max
0.79 ± 8% -0.2 0.55 perf-profile.calltrace.cycles-pp.__cna_queued_spin_lock_slowpath._raw_spin_lock.scheduler_tick.update_process_times.tick_sched_handle
0.80 ± 8% -0.2 0.57 perf-profile.calltrace.cycles-pp._raw_spin_lock.scheduler_tick.update_process_times.tick_sched_handle.tick_sched_timer
0.84 ± 8% -0.2 0.62 perf-profile.calltrace.cycles-pp.scheduler_tick.update_process_times.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues
1.09 ± 4% -0.2 0.90 ± 4% perf-profile.calltrace.cycles-pp.__hrtimer_run_queues.hrtimer_interrupt.smp_apic_timer_interrupt.apic_timer_interrupt.cpuidle_enter_state
0.92 ± 6% -0.2 0.72 ± 4% perf-profile.calltrace.cycles-pp.update_process_times.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt
0.92 ± 5% -0.2 0.73 ± 4% perf-profile.calltrace.cycles-pp.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt.smp_apic_timer_interrupt
0.96 ± 7% -0.2 0.79 ± 5% perf-profile.calltrace.cycles-pp.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt.smp_apic_timer_interrupt.apic_timer_interrupt
0.43 ± 4% -0.4 0.05 perf-profile.children.cycles-pp.cna_scan_main_queue
1.02 ± 8% -0.2 0.81 perf-profile.children.cycles-pp._raw_spin_lock
0.91 ± 7% -0.2 0.70 perf-profile.children.cycles-pp.scheduler_tick
1.22 ± 3% -0.2 1.03 ± 3% perf-profile.children.cycles-pp.__hrtimer_run_queues
1.01 ± 5% -0.2 0.82 ± 3% perf-profile.children.cycles-pp.tick_sched_handle
1.00 ± 4% -0.2 0.82 ± 4% perf-profile.children.cycles-pp.update_process_times
0.47 ± 19% -0.2 0.30 ± 25% perf-profile.children.cycles-pp.poll_idle
1.06 ± 6% -0.2 0.90 ± 4% perf-profile.children.cycles-pp.tick_sched_timer
0.08 +0.0 0.09 perf-profile.children.cycles-pp.cpus_share_cache
0.14 ± 5% +0.0 0.15 ± 3% perf-profile.children.cycles-pp.sched_clock
0.06 ± 11% +0.0 0.08 ± 6% perf-profile.children.cycles-pp.fsnotify
0.14 ± 3% +0.0 0.15 ± 2% perf-profile.children.cycles-pp.update_cfs_group
0.14 ± 6% +0.0 0.15 ± 3% perf-profile.children.cycles-pp.switch_mm_irqs_off
0.11 ± 6% +0.0 0.13 ± 6% perf-profile.children.cycles-pp.nr_iowait_cpu
0.14 ± 3% +0.0 0.16 ± 5% perf-profile.children.cycles-pp.select_idle_sibling
0.11 ± 7% +0.0 0.13 ± 3% perf-profile.children.cycles-pp.update_ts_time_stats
0.18 ± 7% +0.0 0.21 ± 4% perf-profile.children.cycles-pp.tick_nohz_idle_exit
0.21 ± 7% +0.0 0.24 perf-profile.children.cycles-pp.mutex_unlock
0.24 ± 5% +0.0 0.27 perf-profile.children.cycles-pp.update_rq_clock
0.26 ± 6% +0.0 0.29 ± 3% perf-profile.children.cycles-pp.select_task_rq_fair
0.35 ± 5% +0.0 0.39 ± 3% perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore
0.07 ± 17% +0.0 0.11 ± 14% perf-profile.children.cycles-pp._raw_spin_trylock
0.38 ± 2% +0.0 0.41 ± 3% perf-profile.children.cycles-pp.mutex_lock
0.16 ± 6% +0.1 0.22 ± 3% perf-profile.children.cycles-pp.reweight_entity
0.19 ± 30% +0.1 0.28 ± 10% perf-profile.children.cycles-pp.clockevents_program_event
0.42 ± 21% +0.1 0.55 ± 6% perf-profile.children.cycles-pp.ktime_get
0.42 ± 5% -0.4 0.05 perf-profile.self.cycles-pp.cna_scan_main_queue
0.41 ± 21% -0.2 0.23 ± 31% perf-profile.self.cycles-pp.poll_idle
0.12 ± 3% +0.0 0.14 ± 3% perf-profile.self.cycles-pp.__wake_up_common
0.06 ± 11% +0.0 0.08 ± 6% perf-profile.self.cycles-pp.__unwind_start
0.06 ± 14% +0.0 0.08 ± 6% perf-profile.self.cycles-pp.fsnotify
0.09 ± 5% +0.0 0.11 ± 4% perf-profile.self.cycles-pp.__account_scheduler_latency
0.21 ± 8% +0.0 0.23 ± 3% perf-profile.self.cycles-pp.mutex_unlock
0.25 ± 5% +0.0 0.28 ± 2% perf-profile.self.cycles-pp.stack_trace_save_tsk
0.07 ± 17% +0.0 0.11 ± 14% perf-profile.self.cycles-pp._raw_spin_trylock
0.23 ± 4% +0.0 0.27 ± 3% perf-profile.self.cycles-pp.enqueue_entity
0.01 ±173% +0.0 0.06 ± 9% perf-profile.self.cycles-pp.tick_nohz_next_event
0.16 ± 7% +0.1 0.21 ± 4% perf-profile.self.cycles-pp.reweight_entity
0.32 ± 25% +0.1 0.44 ± 6% perf-profile.self.cycles-pp.ktime_get
0.61 ± 4% +0.2 0.78 perf-profile.self.cycles-pp._raw_spin_lock_irqsave
7.467e+09 +4.5% 7.8e+09 perf-stat.i.branch-instructions
1.52 ± 52% -0.5 1.04 perf-stat.i.branch-miss-rate%
18876088 ± 4% +37.6% 25967363 perf-stat.i.cache-misses
5.782e+08 +9.7% 6.34e+08 perf-stat.i.cache-references
3117860 +9.3% 3407692 perf-stat.i.context-switches
4.95 ± 5% -9.5% 4.48 perf-stat.i.cpi
950.94 ± 11% -14.8% 809.79 ± 3% perf-stat.i.cpu-migrations
8478 ± 5% -34.8% 5528 perf-stat.i.cycles-between-cache-misses
0.10 ± 66% -0.1 0.00 ± 9% perf-stat.i.dTLB-load-miss-rate%
797793 ± 30% -68.3% 252660 ± 4% perf-stat.i.dTLB-load-misses
8.708e+09 +4.7% 9.116e+09 perf-stat.i.dTLB-loads
0.03 ± 64% -0.0 0.00 ± 2% perf-stat.i.dTLB-store-miss-rate%
124658 ± 30% -74.0% 32385 ± 4% perf-stat.i.dTLB-store-misses
3.774e+09 +9.1% 4.118e+09 perf-stat.i.dTLB-stores
64.91 ± 4% -2.3 62.61 perf-stat.i.iTLB-load-miss-rate%
43785961 ± 2% +11.7% 48914532 perf-stat.i.iTLB-load-misses
23149859 ± 4% +8.6% 25133067 perf-stat.i.iTLB-loads
3.291e+10 +4.8% 3.448e+10 perf-stat.i.instructions
738.35 ± 2% -5.5% 697.64 perf-stat.i.instructions-per-iTLB-miss
0.21 ± 2% +8.0% 0.22 perf-stat.i.ipc
84.65 +10.4 95.02 perf-stat.i.node-load-miss-rate%
4397398 +8.0% 4748512 perf-stat.i.node-load-misses
847789 -73.3% 226176 perf-stat.i.node-loads
92.33 +5.7 98.01 perf-stat.i.node-store-miss-rate%
2046078 +98.1% 4054203 perf-stat.i.node-store-misses
99069 -63.5% 36118 perf-stat.i.node-stores
17.58 ± 2% +4.6% 18.39 perf-stat.overall.MPKI
0.97 ± 2% -0.1 0.92 perf-stat.overall.branch-miss-rate%
3.26 ± 5% +0.8 4.10 perf-stat.overall.cache-miss-rate%
4.75 -5.2% 4.50 perf-stat.overall.cpi
8290 ± 3% -27.9% 5974 perf-stat.overall.cycles-between-cache-misses
0.01 ± 32% -0.0 0.00 ± 4% perf-stat.overall.dTLB-load-miss-rate%
0.00 ± 33% -0.0 0.00 ± 5% perf-stat.overall.dTLB-store-miss-rate%
751.69 -6.2% 704.91 perf-stat.overall.instructions-per-iTLB-miss
0.21 +5.5% 0.22 perf-stat.overall.ipc
83.84 +11.6 95.45 perf-stat.overall.node-load-miss-rate%
95.38 +3.7 99.12 perf-stat.overall.node-store-miss-rate%
30797 -3.4% 29764 perf-stat.overall.path-length
7.45e+09 +4.4% 7.782e+09 perf-stat.ps.branch-instructions
18821819 ± 4% +37.6% 25906176 perf-stat.ps.cache-misses
5.769e+08 +9.6% 6.325e+08 perf-stat.ps.cache-references
3111288 +9.3% 3399507 perf-stat.ps.context-switches
948.06 ± 11% -14.8% 807.87 ± 3% perf-stat.ps.cpu-migrations
796174 ± 31% -68.3% 252476 ± 4% perf-stat.ps.dTLB-load-misses
8.688e+09 +4.7% 9.095e+09 perf-stat.ps.dTLB-loads
124340 ± 31% -74.0% 32349 ± 4% perf-stat.ps.dTLB-store-misses
3.766e+09 +9.1% 4.108e+09 perf-stat.ps.dTLB-stores
43689908 ± 2% +11.7% 48794646 perf-stat.ps.iTLB-load-misses
23103357 ± 4% +8.5% 25074278 perf-stat.ps.iTLB-loads
3.283e+10 +4.8% 3.44e+10 perf-stat.ps.instructions
4385696 +8.0% 4738522 perf-stat.ps.node-load-misses
845448 -73.3% 225752 perf-stat.ps.node-loads
2040515 +98.3% 4045447 perf-stat.ps.node-store-misses
98835 -63.5% 36074 perf-stat.ps.node-stores
1.286e+13 +4.5% 1.344e+13 perf-stat.total.instructions
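
A note on the perf-stat.overall.path-length row above: in LKP reports it
appears to be the total retired instruction count divided by the workload
count, and the numbers here are consistent with that to within the rounding
of the scientific notation:

  path-length = perf-stat.total.instructions / unixbench.workload
    old: 1.286e13 / 4.175e8 = ~30802  (reported: 30797)
    new: 1.344e13 / 4.515e8 = ~29767  (reported: 29764)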
2564 ± 21% +36.9% 3510 ± 12% interrupts.CPU0.TLB:TLB_shootdowns
20.25 ± 44% +1430.9% 310.00 ± 79% interrupts.CPU10.TLB:TLB_shootdowns
104.50 ±124% +201.0% 314.50 ± 53% interrupts.CPU105.TLB:TLB_shootdowns
18.75 ± 35% +418.7% 97.25 ±110% interrupts.CPU108.TLB:TLB_shootdowns
18.75 ± 29% +909.3% 189.25 ± 65% interrupts.CPU110.TLB:TLB_shootdowns
18.75 ± 29% +698.7% 149.75 ± 81% interrupts.CPU113.TLB:TLB_shootdowns
89357 ± 6% +15.9% 103535 ± 5% interrupts.CPU115.RES:Rescheduling_interrupts
88209 ± 9% +20.6% 106340 ± 5% interrupts.CPU116.RES:Rescheduling_interrupts
83630 ± 11% +20.8% 101003 ± 2% interrupts.CPU117.RES:Rescheduling_interrupts
21.00 ± 36% +702.4% 168.50 ± 69% interrupts.CPU117.TLB:TLB_shootdowns
110630 ± 8% +10.8% 122586 ± 3% interrupts.CPU120.RES:Rescheduling_interrupts
106258 ± 9% +14.3% 121460 ± 3% interrupts.CPU122.RES:Rescheduling_interrupts
41.75 ± 55% +476.6% 240.75 ± 96% interrupts.CPU123.TLB:TLB_shootdowns
56.75 ±132% +519.8% 351.75 ± 29% interrupts.CPU124.TLB:TLB_shootdowns
88.75 ±147% +116.3% 192.00 ± 77% interrupts.CPU128.TLB:TLB_shootdowns
19.00 ± 27% +1509.2% 305.75 ± 98% interrupts.CPU129.TLB:TLB_shootdowns
112.25 ±106% +278.0% 424.25 ± 51% interrupts.CPU13.TLB:TLB_shootdowns
18.50 ± 18% +839.2% 173.75 ± 73% interrupts.CPU132.TLB:TLB_shootdowns
92.25 ±142% +499.7% 553.25 ± 98% interrupts.CPU134.TLB:TLB_shootdowns
88357 ± 7% +9.9% 97128 ± 5% interrupts.CPU138.RES:Rescheduling_interrupts
89772 ± 9% +18.5% 106341 ± 4% interrupts.CPU139.RES:Rescheduling_interrupts
2451 ± 32% +66.6% 4084 ± 8% interrupts.CPU140.NMI:Non-maskable_interrupts
2451 ± 32% +66.6% 4084 ± 8% interrupts.CPU140.PMI:Performance_monitoring_interrupts
88.25 ±143% +210.8% 274.25 ± 88% interrupts.CPU140.TLB:TLB_shootdowns
107210 ± 8% +12.3% 120362 ± 4% interrupts.CPU144.RES:Rescheduling_interrupts
96772 ± 7% +11.1% 107553 ± 5% interrupts.CPU145.RES:Rescheduling_interrupts
102245 ± 2% +10.5% 112975 ± 2% interrupts.CPU146.RES:Rescheduling_interrupts
21.00 ± 28% +298.8% 83.75 ± 29% interrupts.CPU146.TLB:TLB_shootdowns
2359 ± 16% +39.6% 3294 ± 18% interrupts.CPU152.NMI:Non-maskable_interrupts
2359 ± 16% +39.6% 3294 ± 18% interrupts.CPU152.PMI:Performance_monitoring_interrupts
23.00 ± 39% +1562.0% 382.25 ± 86% interrupts.CPU159.TLB:TLB_shootdowns
19.00 ± 34% +635.5% 139.75 ± 80% interrupts.CPU160.TLB:TLB_shootdowns
20.50 ± 32% +506.1% 124.25 ± 92% interrupts.CPU161.TLB:TLB_shootdowns
85527 ± 7% +16.3% 99481 interrupts.CPU162.RES:Rescheduling_interrupts
24.00 ± 39% +415.6% 123.75 ±107% interrupts.CPU165.TLB:TLB_shootdowns
83285 ± 17% +22.7% 102187 ± 9% interrupts.CPU166.RES:Rescheduling_interrupts
124.75 ± 99% +173.5% 341.25 ± 66% interrupts.CPU166.TLB:TLB_shootdowns
54.75 ± 80% +543.8% 352.50 ± 52% interrupts.CPU168.TLB:TLB_shootdowns
36.25 ± 64% +510.3% 221.25 ± 66% interrupts.CPU17.TLB:TLB_shootdowns
66.75 ±112% +205.6% 204.00 ± 71% interrupts.CPU170.TLB:TLB_shootdowns
91.00 ±127% +446.4% 497.25 ± 29% interrupts.CPU171.TLB:TLB_shootdowns
108108 ± 7% +10.1% 119063 ± 5% interrupts.CPU173.RES:Rescheduling_interrupts
103866 ± 5% +19.7% 124283 ± 7% interrupts.CPU175.RES:Rescheduling_interrupts
86.50 ±148% +430.9% 459.25 ± 26% interrupts.CPU176.TLB:TLB_shootdowns
100777 ± 10% +13.6% 114467 ± 5% interrupts.CPU180.RES:Rescheduling_interrupts
87385 ± 11% +24.5% 108773 ± 8% interrupts.CPU182.RES:Rescheduling_interrupts
89523 ± 10% +22.9% 110062 ± 7% interrupts.CPU183.RES:Rescheduling_interrupts
18.75 ± 36% +774.7% 164.00 ± 85% interrupts.CPU183.TLB:TLB_shootdowns
89589 ± 12% +25.1% 112078 ± 11% interrupts.CPU184.RES:Rescheduling_interrupts
89109 ± 11% +29.7% 115571 ± 6% interrupts.CPU185.RES:Rescheduling_interrupts
84744 ± 11% +19.9% 101596 ± 9% interrupts.CPU186.RES:Rescheduling_interrupts
85476 ± 15% +19.3% 101991 ± 5% interrupts.CPU189.RES:Rescheduling_interrupts
100.75 ±133% +306.5% 409.50 ± 53% interrupts.CPU19.TLB:TLB_shootdowns
40.00 ± 29% +303.1% 161.25 ± 51% interrupts.CPU190.TLB:TLB_shootdowns
91.75 ±137% +186.6% 263.00 ± 45% interrupts.CPU20.TLB:TLB_shootdowns
3580 ± 16% -18.4% 2920 ± 5% interrupts.CPU22.NMI:Non-maskable_interrupts
3580 ± 16% -18.4% 2920 ± 5% interrupts.CPU22.PMI:Performance_monitoring_interrupts
132.75 ± 95% +236.5% 446.75 ± 24% interrupts.CPU23.TLB:TLB_shootdowns
120.75 ±102% +207.2% 371.00 ± 42% interrupts.CPU29.TLB:TLB_shootdowns
28.25 ± 27% +1029.2% 319.00 ±116% interrupts.CPU30.TLB:TLB_shootdowns
3011 ± 18% -27.0% 2199 ± 31% interrupts.CPU31.NMI:Non-maskable_interrupts
3011 ± 18% -27.0% 2199 ± 31% interrupts.CPU31.PMI:Performance_monitoring_interrupts
49.50 ± 74% +511.1% 302.50 ± 82% interrupts.CPU32.TLB:TLB_shootdowns
23.00 ± 33% +825.0% 212.75 ±100% interrupts.CPU33.TLB:TLB_shootdowns
3077 ± 14% +24.8% 3840 ± 5% interrupts.CPU35.NMI:Non-maskable_interrupts
3077 ± 14% +24.8% 3840 ± 5% interrupts.CPU35.PMI:Performance_monitoring_interrupts
54.25 ± 88% +285.7% 209.25 ± 68% interrupts.CPU36.TLB:TLB_shootdowns
45.75 ± 92% +648.1% 342.25 ± 28% interrupts.CPU38.TLB:TLB_shootdowns
40.50 ± 69% +529.6% 255.00 ± 44% interrupts.CPU41.TLB:TLB_shootdowns
2264 ± 15% +58.7% 3593 ± 10% interrupts.CPU46.NMI:Non-maskable_interrupts
2264 ± 15% +58.7% 3593 ± 10% interrupts.CPU46.PMI:Performance_monitoring_interrupts
274.50 ± 71% +108.7% 572.75 ± 19% interrupts.CPU51.TLB:TLB_shootdowns
86524 ± 4% +7.8% 93252 ± 4% interrupts.CPU57.RES:Rescheduling_interrupts
37.75 ± 30% +1033.8% 428.00 ± 53% interrupts.CPU60.TLB:TLB_shootdowns
80.50 ±117% +602.2% 565.25 ± 77% interrupts.CPU66.TLB:TLB_shootdowns
98.00 ±127% +221.4% 315.00 ± 76% interrupts.CPU69.TLB:TLB_shootdowns
281.75 ± 29% -69.7% 85.25 ± 5% interrupts.CPU7.TLB:TLB_shootdowns
3275 ± 6% -22.6% 2535 ± 26% interrupts.CPU72.NMI:Non-maskable_interrupts
3275 ± 6% -22.6% 2535 ± 26% interrupts.CPU72.PMI:Performance_monitoring_interrupts
51.00 ±115% +383.3% 246.50 ± 44% interrupts.CPU88.TLB:TLB_shootdowns
23.50 ± 71% +368.1% 110.00 ± 32% interrupts.CPU90.TLB:TLB_shootdowns
68.25 ± 38% +231.5% 226.25 ± 58% interrupts.CPU91.TLB:TLB_shootdowns
59.25 ±102% +230.4% 195.75 ± 75% interrupts.CPU92.TLB:TLB_shootdowns
48.00 ± 79% +530.2% 302.50 ± 44% interrupts.CPU95.TLB:TLB_shootdowns
108205 ± 9% +13.5% 122833 ± 4% interrupts.CPU97.RES:Rescheduling_interrupts
55463 ± 22% +42.5% 79045 ± 14% interrupts.TLB:TLB_shootdowns



unixbench.time.voluntary_context_switches

3.5e+08 +-----------------------------------------------------------------+
| O O OO |
3.4e+08 |-+ O OO O O O O |
| O O O OO O O O O |
3.3e+08 |-+ O O |
| |
3.2e+08 |-+ |
| +.+.+.+ |
3.1e+08 |.+.+.++. + : .+ |
| +.+.+.+.++.+.+.+.+.++ : .+ +. |
3e+08 |-+ : ++.+.+.+ +.|
| : + |
2.9e+08 |-+ ++. .+ |
| +.+.+ |
2.8e+08 +-----------------------------------------------------------------+


unixbench.score

3100 +--------------------------------------------------------------------+
| |
3000 |-O O O O |
| O O |
2900 |-+ O OO O O O O O O O OO O O |
| O O |
2800 |-+ |
| +. |
2700 |-+ .+.++.+ + + |
|.+.+. .+.+. +. .+.+. .+.+. .+ : +.+ + .|
2600 |-+ + + + +.+ + : .+. : + |
| : .+.+ + |
2500 |-+ +.+. .+ |
| +.+.+ |
2400 +--------------------------------------------------------------------+


[*] bisect-good sample
[O] bisect-bad sample

***************************************************************************************************
lkp-skl-fpga01: 104 threads Skylake with 192G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase/ucode:
gcc-7/performance/x86_64-rhel-7.6/process/100%/debian-x86_64-20191114.cgz/lkp-skl-fpga01/mmap1/will-it-scale/0x2000065

commit:
dfce1eb694 ("locking/qspinlock: Introduce starvation avoidance into CNA")
7b6da71157 ("locking/qspinlock: Introduce the shuffle reduction optimization into CNA")

dfce1eb694321530 7b6da7115786ee28ad82638a5dc
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
1:2 -50% :2 dmesg.WARNING:at#for_ip_interrupt_entry/0x



***************************************************************************************************
lkp-skl-fpga01: 104 threads Skylake with 192G memory
=========================================================================================
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase/ucode:
gcc-7/performance/x86_64-rhel-7.6/process/100%/debian-x86_64-20191114.cgz/lkp-skl-fpga01/mmap2/will-it-scale/0x2000065

commit:
dfce1eb694 ("locking/qspinlock: Introduce starvation avoidance into CNA")
7b6da71157 ("locking/qspinlock: Introduce the shuffle reduction optimization into CNA")

dfce1eb694321530 7b6da7115786ee28ad82638a5dc
---------------- ---------------------------
fail:runs %reproduction fail:runs
| | |
1:2 -50% :2 dmesg.WARNING:at#for_ip_interrupt_entry/0x





Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


Thanks,
Rong Chen


Attachments:
config-5.4.0-04240-g7b6da7115786e (204.63 kB)
job-script (7.63 kB)
job.yaml (5.24 kB)
reproduce (304.00 B)