2021-04-07 21:42:19

by Alexey Gladkov

Subject: [PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts

The rlimit counter is tied to the uid in each user_namespace. This
allows rlimit values to be specified in a userns even if they are
already globally exceeded by the user. However, the limits of the
ancestor user_namespaces still cannot be exceeded.
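
For illustration, the hierarchical check added by this series works
roughly like this (a simplified, hedged sketch of the
inc_rlimit_ucounts_and_test() helper the series introduces; field names
follow the patch, the exact code differs):

	static bool sigpending_inc_and_test(struct ucounts *ucounts, long max)
	{
		struct ucounts *iter;
		bool over = false;

		/* Bump the count at every level up to init_user_ns. */
		for (iter = ucounts; iter; iter = iter->ns->ucounts) {
			long new = atomic_long_add_return(1,
					&iter->ucount[UCOUNT_RLIMIT_SIGPENDING]);
			if (new > max)
				over = true;
			/* Each ancestor is checked against its own limit. */
			max = READ_ONCE(iter->ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING]);
		}
		return over;
	}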

v10:
* Fix memory leak on get_ucounts failure.

Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/proc/array.c | 2 +-
include/linux/sched/user.h | 1 -
include/linux/signal_types.h | 4 ++-
include/linux/user_namespace.h | 1 +
kernel/fork.c | 1 +
kernel/signal.c | 58 ++++++++++++++++------------------
kernel/ucount.c | 1 +
kernel/user.c | 1 -
kernel/user_namespace.c | 1 +
9 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb87e4d89cd8..74b0ea4b7e38 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = atomic_read(&__task_cred(p)->user->sigpending);
+ qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 8a34446681aa..8ba9cec4fb99 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -12,7 +12,6 @@
*/
struct user_struct {
refcount_t __count; /* reference count */
- atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
#endif
diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h
index 68e06c75c5b2..34cb28b8f16c 100644
--- a/include/linux/signal_types.h
+++ b/include/linux/signal_types.h
@@ -13,6 +13,8 @@ typedef struct kernel_siginfo {
__SIGINFO;
} kernel_siginfo_t;

+struct ucounts;
+
/*
* Real Time signals may be queued.
*/
@@ -21,7 +23,7 @@ struct sigqueue {
struct list_head list;
int flags;
kernel_siginfo_t info;
- struct user_struct *user;
+ struct ucounts *ucounts;
};

/* flags values. */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d0fea0306394..6e8736c7aa29 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -52,6 +52,7 @@ enum ucount_type {
#endif
UCOUNT_RLIMIT_NPROC,
UCOUNT_RLIMIT_MSGQUEUE,
+ UCOUNT_RLIMIT_SIGPENDING,
UCOUNT_COUNTS,
};

diff --git a/kernel/fork.c b/kernel/fork.c
index 85c6094f5a48..741f896c156e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -824,6 +824,7 @@ void __init fork_init(void)

init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC);
init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE);
+ init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING);

#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
diff --git a/kernel/signal.c b/kernel/signal.c
index f2a1b898da29..4e80386acec7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -413,49 +413,45 @@ void task_join_group_stop(struct task_struct *task)
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
{
- struct sigqueue *q = NULL;
- struct user_struct *user;
- int sigpending;
+ struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);

- /*
- * Protect access to @t credentials. This can go away when all
- * callers hold rcu read lock.
- *
- * NOTE! A pending signal will hold on to the user refcount,
- * and we get/put the refcount only when the sigpending count
- * changes from/to zero.
- */
- rcu_read_lock();
- user = __task_cred(t)->user;
- sigpending = atomic_inc_return(&user->sigpending);
- if (sigpending == 1)
- get_uid(user);
- rcu_read_unlock();
+ if (likely(q != NULL)) {
+ bool overlimit;

- if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
- q = kmem_cache_alloc(sigqueue_cachep, flags);
- } else {
- print_dropped_signal(sig);
- }
-
- if (unlikely(q == NULL)) {
- if (atomic_dec_and_test(&user->sigpending))
- free_uid(user);
- } else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
- q->user = user;
+
+ /*
+ * Protect access to @t credentials. This can go away when all
+ * callers hold rcu read lock.
+ */
+ rcu_read_lock();
+ q->ucounts = get_ucounts(task_ucounts(t));
+ if (q->ucounts) {
+ overlimit = inc_rlimit_ucounts_and_test(q->ucounts, UCOUNT_RLIMIT_SIGPENDING,
+ 1, task_rlimit(t, RLIMIT_SIGPENDING));
+
+ if (override_rlimit || likely(!overlimit)) {
+ rcu_read_unlock();
+ return q;
+ }
+ }
+ rcu_read_unlock();
+ kmem_cache_free(sigqueue_cachep, q);
}

- return q;
+ print_dropped_signal(sig);
+ return NULL;
}

static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (atomic_dec_and_test(&q->user->sigpending))
- free_uid(q->user);
+ if (q->ucounts) {
+ dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
+ put_ucounts(q->ucounts);
+ }
kmem_cache_free(sigqueue_cachep, q);
}

diff --git a/kernel/ucount.c b/kernel/ucount.c
index 3f1682b690e6..5c1381ff388a 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -80,6 +80,7 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_inotify_instances"),
UCOUNT_ENTRY("max_inotify_watches"),
#endif
+ { },
{ },
{ },
{ }
diff --git a/kernel/user.c b/kernel/user.c
index 7f5ff498207a..6737327f83be 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
.__count = REFCOUNT_INIT(1),
- .sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index cc90d5203acf..df1bed32dd48 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -123,6 +123,7 @@ int create_user_ns(struct cred *new)
}
ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC);
ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE);
+ ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING);
ns->ucounts = ucounts;

/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
--
2.29.3


2021-04-08 08:35:00

by kernel test robot

Subject: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression



Greetings,

FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit:


commit: 08ed4efad684e20eab719aae9753a0c260ee1a91 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
url: https://github.com/0day-ci/linux/commits/Alexey-Gladkov/Count-rlimits-in-each-user-namespace/20210408-011035
base: https://git.kernel.org/cgit/linux/kernel/git/shuah/linux-kselftest.git next

in testcase: stress-ng
on test machine: 48 threads Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz with 112G memory
with following parameters:

nr_threads: 100%
disk: 1HDD
testtime: 60s
class: interrupt
test: sigsegv
cpufreq_governor: performance
ucode: 0x42e




If you fix the issue, kindly add the following tag
Reported-by: kernel test robot <[email protected]>


Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml
bin/lkp run compatible-job.yaml

=========================================================================================
class/compiler/cpufreq_governor/disk/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime/ucode:
interrupt/gcc-9/performance/1HDD/x86_64-rhel-8.3/100%/debian-10.4-x86_64-20200603.cgz/lkp-ivb-2ep1/sigsegv/stress-ng/60s/0x42e

commit:
8c0a56ae79 ("Reimplement RLIMIT_MSGQUEUE on top of ucounts")
08ed4efad6 ("Reimplement RLIMIT_SIGPENDING on top of ucounts")

8c0a56ae79357988 08ed4efad684e20eab719aae975
---------------- ---------------------------
%stddev %change %stddev
\ | \
4.708e+08 -41.9% 2.734e+08 stress-ng.sigsegv.ops
7846743 -41.9% 4556545 stress-ng.sigsegv.ops_per_sec
1558 +28.2% 1998 stress-ng.time.system_time
1280 -34.3% 840.98 stress-ng.time.user_time
168046 ? 9% -33.0% 112621 ? 46% cpuidle.C1E.usage
85539 ? 7% +10.6% 94580 ? 5% numa-meminfo.node0.Slab
9344 +1.9% 9518 proc-vmstat.nr_mapped
51.31 +27.2% 65.25 iostat.cpu.system
41.49 -34.2% 27.29 iostat.cpu.user
224.39 ? 7% -17.2% 185.78 ? 8% sched_debug.cfs_rq:/.runnable_avg.stddev
194.64 ? 7% -15.6% 164.25 ? 11% sched_debug.cfs_rq:/.util_avg.stddev
50.83 +27.5% 64.83 vmstat.cpu.sy
41.00 -34.6% 26.83 vmstat.cpu.us
0.01 ? 34% -0.0 0.00 ? 43% mpstat.cpu.all.iowait%
46.55 +12.7 59.24 mpstat.cpu.all.sys%
38.56 -13.3 25.27 mpstat.cpu.all.usr%
544.67 ? 95% -73.0% 147.00 ?102% interrupts.36:PCI-MSI.2621442-edge.eth0-TxRx-1
46207 -3.8% 44436 interrupts.CAL:Function_call_interrupts
5481 ? 34% +23.5% 6771 ? 27% interrupts.CPU12.NMI:Non-maskable_interrupts
5481 ? 34% +23.5% 6771 ? 27% interrupts.CPU12.PMI:Performance_monitoring_interrupts
5518 ? 35% +22.9% 6783 ? 27% interrupts.CPU23.NMI:Non-maskable_interrupts
5518 ? 35% +22.9% 6783 ? 27% interrupts.CPU23.PMI:Performance_monitoring_interrupts
544.67 ? 95% -73.0% 147.00 ?102% interrupts.CPU27.36:PCI-MSI.2621442-edge.eth0-TxRx-1
8945 ? 11% -19.3% 7221 ? 15% softirqs.CPU1.RCU
7440 ? 18% -24.0% 5657 ? 8% softirqs.CPU15.RCU
9143 ? 11% -28.3% 6556 ? 7% softirqs.CPU17.RCU
8842 ? 15% -31.5% 6054 ? 9% softirqs.CPU18.RCU
8278 ? 17% -24.5% 6251 ? 7% softirqs.CPU19.RCU
10071 ? 16% -33.9% 6653 ? 7% softirqs.CPU2.RCU
8077 ? 12% -22.4% 6271 ? 9% softirqs.CPU21.RCU
8065 ? 14% -22.8% 6229 ? 10% softirqs.CPU29.RCU
8389 ? 11% -17.9% 6885 ? 12% softirqs.CPU36.RCU
8782 ? 15% -26.7% 6439 ? 16% softirqs.CPU39.RCU
8523 ? 13% -19.3% 6879 ? 9% softirqs.CPU4.RCU
8864 ? 15% -24.8% 6664 ? 9% softirqs.CPU40.RCU
9099 ? 13% -26.5% 6686 ? 10% softirqs.CPU41.RCU
8909 ? 22% -25.6% 6632 ? 10% softirqs.CPU43.RCU
8204 ? 14% -24.4% 6205 ? 9% softirqs.CPU44.RCU
8419 ? 14% -19.3% 6793 ? 8% softirqs.CPU5.RCU
8593 ? 12% -20.3% 6845 ? 8% softirqs.CPU6.RCU
8297 ? 11% -26.3% 6114 ? 12% softirqs.CPU7.RCU
394778 ? 13% -20.8% 312750 ? 7% softirqs.RCU
0.76 ? 5% +189.5% 2.20 perf-stat.i.MPKI
8.024e+09 -39.9% 4.822e+09 perf-stat.i.branch-instructions
0.88 ? 2% +0.1 0.95 ? 5% perf-stat.i.branch-miss-rate%
54200970 -31.8% 36988412 ? 6% perf-stat.i.branch-misses
27.02 ? 2% +8.7 35.70 perf-stat.i.cache-miss-rate%
5961544 +190.4% 17314361 perf-stat.i.cache-misses
22107466 +119.2% 48457656 perf-stat.i.cache-references
3.39 +66.6% 5.65 perf-stat.i.cpi
23809 -67.4% 7762 perf-stat.i.cycles-between-cache-misses
1.604e+08 ? 5% -40.1% 96053796 ? 10% perf-stat.i.dTLB-load-misses
1.15e+10 -40.4% 6.852e+09 perf-stat.i.dTLB-loads
1.384e+08 -41.4% 81063645 perf-stat.i.dTLB-store-misses
1.063e+10 -41.0% 6.27e+09 perf-stat.i.dTLB-stores
3.926e+10 -40.1% 2.352e+10 perf-stat.i.instructions
1519 -30.9% 1049 ? 14% perf-stat.i.instructions-per-iTLB-miss
0.31 -36.0% 0.20 perf-stat.i.ipc
631.97 -40.3% 377.29 perf-stat.i.metric.M/sec
44.56 +1.7 46.21 perf-stat.i.node-load-miss-rate%
163292 ? 3% +4582.0% 7645410 perf-stat.i.node-load-misses
227388 ? 2% +3708.8% 8660824 perf-stat.i.node-loads
48.55 -1.3 47.24 perf-stat.i.node-store-miss-rate%
5127160 +49.6% 7668899 perf-stat.i.node-store-misses
5273474 +57.0% 8281652 perf-stat.i.node-stores
7317392 -41.9% 4247879 perf-stat.i.page-faults
0.56 +265.7% 2.06 perf-stat.overall.MPKI
0.68 +0.1 0.77 ? 7% perf-stat.overall.branch-miss-rate%
26.98 ? 2% +8.8 35.73 perf-stat.overall.cache-miss-rate%
3.47 +66.8% 5.79 perf-stat.overall.cpi
22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
1342 -37.7% 836.47 ? 19% perf-stat.overall.instructions-per-iTLB-miss
0.29 -40.0% 0.17 perf-stat.overall.ipc
41.78 +5.1 46.89 perf-stat.overall.node-load-miss-rate%
49.29 -1.2 48.08 perf-stat.overall.node-store-miss-rate%
7.893e+09 -39.9% 4.744e+09 perf-stat.ps.branch-instructions
53337110 -31.7% 36421215 ? 6% perf-stat.ps.branch-misses
5867618 +190.4% 17037777 perf-stat.ps.cache-misses
21758871 +119.2% 47685960 perf-stat.ps.cache-references
1.578e+08 ? 5% -40.1% 94495593 ? 10% perf-stat.ps.dTLB-load-misses
1.131e+10 -40.4% 6.741e+09 perf-stat.ps.dTLB-loads
1.361e+08 -41.4% 79747745 perf-stat.ps.dTLB-store-misses
1.046e+10 -41.0% 6.169e+09 perf-stat.ps.dTLB-stores
3.862e+10 -40.1% 2.314e+10 perf-stat.ps.instructions
160758 ? 3% +4578.7% 7521403 perf-stat.ps.node-load-misses
224053 ? 2% +3702.9% 8520470 perf-stat.ps.node-loads
5043253 +49.6% 7544873 perf-stat.ps.node-store-misses
5187483 +57.1% 8148130 perf-stat.ps.node-stores
7197269 -41.9% 4178914 perf-stat.ps.page-faults
2.443e+12 -40.0% 1.465e+12 perf-stat.total.instructions
34.97 -12.5 22.49 perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe
19.95 -6.3 13.66 perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe
13.13 -5.7 7.46 perf-profile.calltrace.cycles-pp.syscall_return_via_sysret
12.64 -5.3 7.36 perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe
8.09 -3.7 4.43 perf-profile.calltrace.cycles-pp.__entry_text_start
5.60 -2.9 2.74 perf-profile.calltrace.cycles-pp.__setup_rt_frame.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
6.25 -2.7 3.52 ? 2% perf-profile.calltrace.cycles-pp.__x64_sys_rt_sigaction.do_syscall_64.entry_SYSCALL_64_after_hwframe
5.19 -2.5 2.68 ? 2% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_safe_stack
3.00 -1.6 1.41 ? 2% perf-profile.calltrace.cycles-pp.copy_fpstate_to_sigframe.__setup_rt_frame.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode
3.71 -1.5 2.26 ? 2% perf-profile.calltrace.cycles-pp.__x64_sys_rt_sigprocmask.do_syscall_64.entry_SYSCALL_64_after_hwframe
2.22 ? 2% -1.0 1.20 ? 3% perf-profile.calltrace.cycles-pp._copy_from_user.__x64_sys_rt_sigaction.do_syscall_64.entry_SYSCALL_64_after_hwframe
2.28 -0.9 1.35 perf-profile.calltrace.cycles-pp.__irqentry_text_end
2.10 ? 2% -0.9 1.19 perf-profile.calltrace.cycles-pp.do_sigaction.__x64_sys_rt_sigaction.do_syscall_64.entry_SYSCALL_64_after_hwframe
1.34 ? 2% -0.6 0.69 ? 2% perf-profile.calltrace.cycles-pp.copy_siginfo_to_user.__setup_rt_frame.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode
1.19 ? 2% -0.5 0.71 ? 6% perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.syscall_exit_to_user_mode.entry_SYSCALL_64_after_hwframe
0.92 ? 2% -0.5 0.45 ? 45% perf-profile.calltrace.cycles-pp.__might_fault._copy_from_user.__x64_sys_rt_sigaction.do_syscall_64.entry_SYSCALL_64_after_hwframe
1.35 ? 5% -0.3 1.05 ? 4% perf-profile.calltrace.cycles-pp.__x86_indirect_thunk_rax.do_syscall_64.entry_SYSCALL_64_after_hwframe
1.09 ? 2% -0.3 0.83 ? 4% perf-profile.calltrace.cycles-pp.signal_setup_done.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
1.07 -0.2 0.83 ? 2% perf-profile.calltrace.cycles-pp.fpu__clear.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
0.94 ? 2% -0.2 0.75 ? 5% perf-profile.calltrace.cycles-pp.__set_current_blocked.signal_setup_done.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode
0.97 ? 2% -0.1 0.86 ? 3% perf-profile.calltrace.cycles-pp.sigprocmask.__x64_sys_rt_sigprocmask.do_syscall_64.entry_SYSCALL_64_after_hwframe
0.86 -0.1 0.79 ? 3% perf-profile.calltrace.cycles-pp.__set_current_blocked.sigprocmask.__x64_sys_rt_sigprocmask.do_syscall_64.entry_SYSCALL_64_after_hwframe
0.61 ? 2% +0.3 0.96 perf-profile.calltrace.cycles-pp.signal_wake_up_state.__send_signal.force_sig_info_to_task.force_sig_fault.__bad_area_nosemaphore
0.00 +0.6 0.55 ? 3% perf-profile.calltrace.cycles-pp.__sigqueue_alloc.__send_signal.force_sig_info_to_task.force_sig.exc_general_protection
0.59 +0.6 1.15 ? 2% perf-profile.calltrace.cycles-pp.asm_exc_general_protection
0.58 ? 2% +0.6 1.14 ? 2% perf-profile.calltrace.cycles-pp.exc_general_protection.asm_exc_general_protection
0.00 +0.6 0.59 ? 3% perf-profile.calltrace.cycles-pp.__send_signal.force_sig_info_to_task.force_sig.exc_general_protection.asm_exc_general_protection
0.00 +0.6 0.61 ? 3% perf-profile.calltrace.cycles-pp.force_sig_info_to_task.force_sig.exc_general_protection.asm_exc_general_protection
0.00 +0.6 0.61 ? 3% perf-profile.calltrace.cycles-pp.force_sig.exc_general_protection.asm_exc_general_protection
0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
18.71 +6.1 24.85 perf-profile.calltrace.cycles-pp.irqentry_exit_to_user_mode.asm_exc_page_fault
14.80 +6.9 21.71 perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
14.33 +7.1 21.42 perf-profile.calltrace.cycles-pp.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
0.00 +8.8 8.81 perf-profile.calltrace.cycles-pp.get_ucounts.__sigqueue_alloc.__send_signal.force_sig_info_to_task.force_sig_fault
6.07 +10.7 16.73 perf-profile.calltrace.cycles-pp.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode.asm_exc_page_fault
0.00 +10.8 10.82 perf-profile.calltrace.cycles-pp.dec_rlimit_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
3.82 +11.5 15.29 perf-profile.calltrace.cycles-pp.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare.irqentry_exit_to_user_mode
0.00 +13.8 13.76 perf-profile.calltrace.cycles-pp.inc_rlimit_ucounts.inc_rlimit_ucounts_and_test.__sigqueue_alloc.__send_signal.force_sig_info_to_task
0.00 +18.0 18.03 perf-profile.calltrace.cycles-pp.inc_rlimit_ucounts_and_test.__sigqueue_alloc.__send_signal.force_sig_info_to_task.force_sig_fault
10.89 +21.3 32.16 perf-profile.calltrace.cycles-pp.exc_page_fault.asm_exc_page_fault
10.54 +21.4 31.96 perf-profile.calltrace.cycles-pp.do_user_addr_fault.exc_page_fault.asm_exc_page_fault
7.98 +22.2 30.18 perf-profile.calltrace.cycles-pp.__bad_area_nosemaphore.do_user_addr_fault.exc_page_fault.asm_exc_page_fault
7.54 +22.3 29.88 perf-profile.calltrace.cycles-pp.force_sig_fault.__bad_area_nosemaphore.do_user_addr_fault.exc_page_fault.asm_exc_page_fault
7.45 +22.4 29.83 perf-profile.calltrace.cycles-pp.force_sig_info_to_task.force_sig_fault.__bad_area_nosemaphore.do_user_addr_fault.exc_page_fault
6.62 +22.5 29.09 perf-profile.calltrace.cycles-pp.__send_signal.force_sig_info_to_task.force_sig_fault.__bad_area_nosemaphore.do_user_addr_fault
4.50 +22.9 27.38 perf-profile.calltrace.cycles-pp.__sigqueue_alloc.__send_signal.force_sig_info_to_task.force_sig_fault.__bad_area_nosemaphore
30.83 +26.8 57.65 perf-profile.calltrace.cycles-pp.asm_exc_page_fault
35.13 -12.5 22.61 perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
20.04 -6.3 13.72 perf-profile.children.cycles-pp.syscall_exit_to_user_mode
14.40 -6.2 8.17 perf-profile.children.cycles-pp.syscall_return_via_sysret
13.40 -5.4 7.96 perf-profile.children.cycles-pp.do_syscall_64
10.28 -4.7 5.56 perf-profile.children.cycles-pp.__entry_text_start
5.75 -2.9 2.82 perf-profile.children.cycles-pp.__setup_rt_frame
6.43 -2.8 3.63 ? 2% perf-profile.children.cycles-pp.__x64_sys_rt_sigaction
3.16 -1.7 1.49 ? 2% perf-profile.children.cycles-pp.copy_fpstate_to_sigframe
3.25 ? 2% -1.5 1.74 ? 3% perf-profile.children.cycles-pp._copy_from_user
3.75 -1.5 2.28 ? 2% perf-profile.children.cycles-pp.__x64_sys_rt_sigprocmask
3.02 -1.5 1.55 ? 2% perf-profile.children.cycles-pp.entry_SYSCALL_64_safe_stack
2.18 -1.3 0.88 ? 2% perf-profile.children.cycles-pp.copy_user_generic_unrolled
2.49 -1.1 1.41 ? 2% perf-profile.children.cycles-pp.__might_fault
2.16 ? 2% -0.9 1.22 perf-profile.children.cycles-pp.do_sigaction
2.29 -0.9 1.35 perf-profile.children.cycles-pp.__irqentry_text_end
2.18 -0.9 1.25 perf-profile.children.cycles-pp.native_irq_return_iret
1.48 -0.8 0.70 ? 2% perf-profile.children.cycles-pp._copy_to_user
1.41 ? 2% -0.7 0.73 ? 2% perf-profile.children.cycles-pp.copy_siginfo_to_user
1.22 -0.6 0.63 ? 2% perf-profile.children.cycles-pp.___might_sleep
0.93 ? 2% -0.5 0.47 ? 3% perf-profile.children.cycles-pp.__might_sleep
1.64 -0.4 1.24 ? 2% perf-profile.children.cycles-pp._raw_spin_lock_irq
0.94 -0.4 0.55 ? 3% perf-profile.children.cycles-pp.__clear_user
0.81 ? 4% -0.3 0.48 ? 4% perf-profile.children.cycles-pp.syscall_enter_from_user_mode
0.56 -0.3 0.26 ? 6% perf-profile.children.cycles-pp.sync_regs
0.73 ? 2% -0.3 0.44 ? 5% perf-profile.children.cycles-pp.__perf_sw_event
1.84 -0.3 1.56 ? 4% perf-profile.children.cycles-pp.__set_current_blocked
1.12 ? 2% -0.3 0.85 ? 4% perf-profile.children.cycles-pp.signal_setup_done
1.10 -0.2 0.85 perf-profile.children.cycles-pp.fpu__clear
0.42 ? 4% -0.2 0.18 ? 6% perf-profile.children.cycles-pp.complete_signal
0.45 -0.2 0.23 ? 3% perf-profile.children.cycles-pp.fixup_vdso_exception
0.40 ? 4% -0.2 0.19 ? 8% perf-profile.children.cycles-pp.kmem_cache_free
0.37 ? 4% -0.2 0.19 ? 9% perf-profile.children.cycles-pp.prepare_signal
0.72 ? 5% -0.2 0.55 ? 4% perf-profile.children.cycles-pp.__x86_indirect_thunk_rax
1.20 -0.2 1.03 ? 4% perf-profile.children.cycles-pp.recalc_sigpending
0.35 ? 2% -0.1 0.20 ? 5% perf-profile.children.cycles-pp.rcu_nocb_flush_deferred_wakeup
0.39 ? 4% -0.1 0.25 ? 11% perf-profile.children.cycles-pp.___perf_sw_event
0.29 ? 3% -0.1 0.15 ? 3% perf-profile.children.cycles-pp.copy_user_enhanced_fast_string
1.00 ? 2% -0.1 0.87 ? 2% perf-profile.children.cycles-pp.sigprocmask
0.32 ? 4% -0.1 0.20 ? 5% perf-profile.children.cycles-pp.syscall_exit_to_user_mode_prepare
0.29 ? 2% -0.1 0.17 ? 4% perf-profile.children.cycles-pp.__cond_resched
0.24 ? 6% -0.1 0.13 ? 5% perf-profile.children.cycles-pp.find_vma
0.19 ? 4% -0.1 0.08 ? 8% perf-profile.children.cycles-pp.task_curr
0.30 ? 2% -0.1 0.20 ? 4% perf-profile.children.cycles-pp.__x86_retpoline_rax
0.12 ? 4% -0.1 0.03 ? 99% perf-profile.children.cycles-pp.kick_process
0.18 ? 3% -0.1 0.09 ? 10% perf-profile.children.cycles-pp.__get_user_nocheck_4
0.18 ? 2% -0.1 0.10 ? 7% perf-profile.children.cycles-pp.send_signal
0.18 ? 6% -0.1 0.10 ? 10% perf-profile.children.cycles-pp.perf_swevent_get_recursion_context
0.16 ? 5% -0.1 0.08 ? 7% perf-profile.children.cycles-pp.__local_bh_enable_ip
0.16 ? 9% -0.1 0.09 ? 4% perf-profile.children.cycles-pp.vmacache_find
0.18 ? 3% -0.1 0.11 ? 8% perf-profile.children.cycles-pp.rcu_all_qs
0.14 ? 7% -0.1 0.08 ? 8% perf-profile.children.cycles-pp.sigaction_compat_abi
0.11 ? 6% -0.1 0.06 ? 6% perf-profile.children.cycles-pp.fpu__alloc_mathframe
0.10 ? 5% -0.0 0.06 ? 11% perf-profile.children.cycles-pp.clear_user
0.18 ? 3% -0.0 0.16 ? 2% perf-profile.children.cycles-pp.down_read_trylock
0.14 ? 12% +0.1 0.21 ? 8% perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore
0.30 ? 2% +0.1 0.41 ? 6% perf-profile.children.cycles-pp.__bad_area
0.24 ? 3% +0.1 0.38 ? 7% perf-profile.children.cycles-pp.up_read
0.29 ? 3% +0.2 0.45 ? 3% perf-profile.children.cycles-pp.fpregs_mark_activate
0.63 ? 2% +0.4 0.98 perf-profile.children.cycles-pp.signal_wake_up_state
0.17 ? 4% +0.4 0.61 ? 3% perf-profile.children.cycles-pp.force_sig
0.59 +0.6 1.15 ? 2% perf-profile.children.cycles-pp.asm_exc_general_protection
0.58 +0.6 1.14 ? 2% perf-profile.children.cycles-pp.exc_general_protection
0.00 +4.5 4.55 perf-profile.children.cycles-pp.put_ucounts
19.45 +6.2 25.61 perf-profile.children.cycles-pp.irqentry_exit_to_user_mode
16.41 +6.5 22.92 perf-profile.children.cycles-pp.exit_to_user_mode_prepare
14.66 +7.2 21.87 perf-profile.children.cycles-pp.arch_do_signal_or_restart
0.00 +9.0 9.00 perf-profile.children.cycles-pp.get_ucounts
6.25 +10.8 17.09 perf-profile.children.cycles-pp.get_signal
0.00 +11.0 11.04 perf-profile.children.cycles-pp.dec_rlimit_ucounts
3.89 +11.7 15.63 perf-profile.children.cycles-pp.__sigqueue_free
0.00 +14.0 14.03 perf-profile.children.cycles-pp.inc_rlimit_ucounts
0.00 +18.4 18.39 perf-profile.children.cycles-pp.inc_rlimit_ucounts_and_test
10.93 +21.3 32.19 perf-profile.children.cycles-pp.exc_page_fault
10.61 +21.4 32.00 perf-profile.children.cycles-pp.do_user_addr_fault
8.01 +22.2 30.19 perf-profile.children.cycles-pp.__bad_area_nosemaphore
7.55 +22.3 29.88 perf-profile.children.cycles-pp.force_sig_fault
7.64 +22.8 30.46 perf-profile.children.cycles-pp.force_sig_info_to_task
6.83 +22.9 29.71 perf-profile.children.cycles-pp.__send_signal
4.62 +23.3 27.95 perf-profile.children.cycles-pp.__sigqueue_alloc
30.88 +26.8 57.68 perf-profile.children.cycles-pp.asm_exc_page_fault
14.37 -6.2 8.15 perf-profile.self.cycles-pp.syscall_return_via_sysret
18.54 -5.7 12.81 perf-profile.self.cycles-pp.syscall_exit_to_user_mode
10.28 -4.7 5.56 perf-profile.self.cycles-pp.__entry_text_start
4.24 -4.1 0.19 ? 10% perf-profile.self.cycles-pp.__sigqueue_alloc
3.77 -3.7 0.07 ? 14% perf-profile.self.cycles-pp.__sigqueue_free
2.11 -1.3 0.84 ? 2% perf-profile.self.cycles-pp.copy_user_generic_unrolled
2.29 -0.9 1.35 perf-profile.self.cycles-pp.__irqentry_text_end
2.17 -0.9 1.24 perf-profile.self.cycles-pp.native_irq_return_iret
4.29 -0.8 3.44 ? 2% perf-profile.self.cycles-pp.irqentry_exit_to_user_mode
2.20 ? 2% -0.8 1.40 ? 2% perf-profile.self.cycles-pp.do_syscall_64
1.92 -0.8 1.14 ? 4% perf-profile.self.cycles-pp.__x64_sys_rt_sigaction
1.75 -0.8 0.97 perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe
1.63 ? 2% -0.8 0.85 ? 3% perf-profile.self.cycles-pp.copy_fpstate_to_sigframe
1.19 -0.6 0.62 ? 2% perf-profile.self.cycles-pp.___might_sleep
1.41 -0.6 0.85 ? 3% perf-profile.self.cycles-pp.exit_to_user_mode_prepare
1.20 ? 3% -0.6 0.65 perf-profile.self.cycles-pp.do_sigaction
1.08 ? 3% -0.5 0.54 ? 4% perf-profile.self.cycles-pp.__setup_rt_frame
0.84 ? 3% -0.5 0.33 ? 5% perf-profile.self.cycles-pp.asm_exc_page_fault
1.05 ? 2% -0.5 0.56 ? 3% perf-profile.self.cycles-pp.__x64_sys_rt_sigprocmask
0.84 ? 2% -0.4 0.42 ? 3% perf-profile.self.cycles-pp.__might_sleep
0.81 -0.4 0.40 ? 2% perf-profile.self.cycles-pp.fpu__clear
0.82 -0.4 0.42 ? 3% perf-profile.self.cycles-pp.entry_SYSCALL_64_safe_stack
1.59 -0.4 1.22 ? 3% perf-profile.self.cycles-pp._raw_spin_lock_irq
0.78 ? 3% -0.3 0.44 ? 5% perf-profile.self.cycles-pp.get_signal
0.78 ? 2% -0.3 0.45 ? 3% perf-profile.self.cycles-pp._copy_from_user
0.69 ? 4% -0.3 0.36 ? 6% perf-profile.self.cycles-pp.__send_signal
0.59 ? 2% -0.3 0.28 ? 3% perf-profile.self.cycles-pp.__clear_user
0.72 -0.3 0.42 ? 3% perf-profile.self.cycles-pp.do_user_addr_fault
0.70 ? 3% -0.3 0.43 ? 4% perf-profile.self.cycles-pp.syscall_enter_from_user_mode
0.51 -0.3 0.24 ? 7% perf-profile.self.cycles-pp.sync_regs
0.77 ? 3% -0.2 0.52 ? 2% perf-profile.self.cycles-pp.__might_fault
0.40 ? 4% -0.2 0.19 ? 9% perf-profile.self.cycles-pp.kmem_cache_free
0.44 ? 2% -0.2 0.23 ? 3% perf-profile.self.cycles-pp.fixup_vdso_exception
0.34 ? 3% -0.2 0.16 ? 5% perf-profile.self.cycles-pp._copy_to_user
0.35 ? 4% -0.2 0.18 ? 4% perf-profile.self.cycles-pp.__set_current_blocked
0.35 ? 4% -0.2 0.19 ? 9% perf-profile.self.cycles-pp.prepare_signal
0.40 ? 2% -0.2 0.23 ? 4% perf-profile.self.cycles-pp.arch_do_signal_or_restart
0.30 ? 3% -0.1 0.17 ? 5% perf-profile.self.cycles-pp.rcu_nocb_flush_deferred_wakeup
0.22 ? 4% -0.1 0.10 ? 7% perf-profile.self.cycles-pp.complete_signal
0.26 ? 3% -0.1 0.14 ? 2% perf-profile.self.cycles-pp.__bad_area_nosemaphore
0.26 ? 3% -0.1 0.15 ? 4% perf-profile.self.cycles-pp.exc_page_fault
0.18 ? 6% -0.1 0.07 ? 12% perf-profile.self.cycles-pp.task_curr
0.31 ? 8% -0.1 0.20 ? 14% perf-profile.self.cycles-pp.___perf_sw_event
0.29 ? 3% -0.1 0.18 ? 5% perf-profile.self.cycles-pp.syscall_exit_to_user_mode_prepare
0.20 ? 3% -0.1 0.10 ? 4% perf-profile.self.cycles-pp.force_sig_info_to_task
0.26 ? 3% -0.1 0.18 ? 4% perf-profile.self.cycles-pp.__x86_retpoline_rax
0.17 ? 5% -0.1 0.09 ? 9% perf-profile.self.cycles-pp.__perf_sw_event
0.98 -0.1 0.90 perf-profile.self.cycles-pp.recalc_sigpending
0.16 ? 5% -0.1 0.09 ? 8% perf-profile.self.cycles-pp.sigprocmask
0.16 ? 2% -0.1 0.08 ? 5% perf-profile.self.cycles-pp.__get_user_nocheck_4
0.17 ? 5% -0.1 0.10 ? 8% perf-profile.self.cycles-pp.perf_swevent_get_recursion_context
0.14 ? 6% -0.1 0.07 ? 8% perf-profile.self.cycles-pp.__local_bh_enable_ip
0.15 ? 3% -0.1 0.09 ? 5% perf-profile.self.cycles-pp.signal_setup_done
0.13 ? 7% -0.1 0.07 ? 5% perf-profile.self.cycles-pp.copy_user_enhanced_fast_string
0.14 ? 11% -0.1 0.08 ? 4% perf-profile.self.cycles-pp.vmacache_find
0.16 ? 5% -0.1 0.10 ? 9% perf-profile.self.cycles-pp.rcu_all_qs
0.12 ? 6% -0.1 0.07 ? 8% perf-profile.self.cycles-pp.__set_task_blocked
0.11 ? 7% -0.1 0.06 ? 6% perf-profile.self.cycles-pp.send_signal
0.10 ? 4% -0.0 0.06 ? 9% perf-profile.self.cycles-pp.__cond_resched
0.10 ? 5% -0.0 0.05 perf-profile.self.cycles-pp.fpu__alloc_mathframe
0.11 ? 6% -0.0 0.07 ? 11% perf-profile.self.cycles-pp.sigaction_compat_abi
0.11 ? 6% -0.0 0.07 ? 11% perf-profile.self.cycles-pp.copy_siginfo_to_user
0.09 ? 8% -0.0 0.06 ? 13% perf-profile.self.cycles-pp._raw_spin_unlock_irqrestore
0.23 ? 4% +0.1 0.37 ? 7% perf-profile.self.cycles-pp.up_read
0.28 ? 3% +0.2 0.45 ? 3% perf-profile.self.cycles-pp.fpregs_mark_activate
0.41 ? 3% +0.4 0.77 ? 2% perf-profile.self.cycles-pp.signal_wake_up_state
0.00 +4.4 4.36 perf-profile.self.cycles-pp.inc_rlimit_ucounts_and_test
0.00 +4.5 4.54 perf-profile.self.cycles-pp.put_ucounts
0.00 +9.0 8.99 perf-profile.self.cycles-pp.get_ucounts
0.00 +11.0 11.01 perf-profile.self.cycles-pp.dec_rlimit_ucounts
0.00 +14.0 14.03 perf-profile.self.cycles-pp.inc_rlimit_ucounts



stress-ng.time.user_time

1350 +--------------------------------------------------------------------+
1300 |.+ .+ .+ +.++. |
| ++.++.+.++.++.+ + +.+.++.++.+.++.+.++.++.+.++. +.+.+ +.++.++.|
1250 |-+ + |
1200 |-+ |
1150 |-+ |
1100 |-+ |
| |
1050 |-+ |
1000 |-+ |
950 |-+ |
900 |-+ |
| |
850 |-OO OO O OO OO O OO OO O OO OO O OO O OO |
800 +--------------------------------------------------------------------+


stress-ng.time.system_time

2050 +--------------------------------------------------------------------+
2000 |-OO OO O OO OO O O OO O OO O O OO O OO |
| O O |
1950 |-+ |
1900 |-+ |
1850 |-+ |
1800 |-+ |
| |
1750 |-+ |
1700 |-+ |
1650 |-+ |
1600 |-+ |
|.++.++.+. +. +.+.++.+ .+.++.+.++.++.+.++.++.+.+ +.+ .|
1550 |-+ + ++.+.++.+ + +.++.+.+ + |
1500 +--------------------------------------------------------------------+


stress-ng.sigsegv.ops

5e+08 +-----------------------------------------------------------------+
|.++.++.++.++.++.++.++. |
| +.++.++.++.++.++.++.+.++.++.++.++.++.++.++.|
4.5e+08 |-+ |
| |
| |
4e+08 |-+ |
| |
3.5e+08 |-+ |
| |
| |
3e+08 |-+ |
| OO OO OO O |
| O OO OO OO O OO OO OO OO OO |
2.5e+08 +-----------------------------------------------------------------+


stress-ng.sigsegv.ops_per_sec

8.5e+06 +-----------------------------------------------------------------+
| +. |
8e+06 |.++.++.++.+ ++.++.++.+.++.++.++.++.++.++.+. +. +.++.++.++.++.++.|
7.5e+06 |-+ + + |
| |
7e+06 |-+ |
6.5e+06 |-+ |
| |
6e+06 |-+ |
5.5e+06 |-+ |
| |
5e+06 |-+ |
4.5e+06 |-OO OO OO O O O O O OO OO OO |
| O O O OO OO O |
4e+06 +-----------------------------------------------------------------+


[*] bisect-good sample
[O] bisect-bad sample



Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


---
0DAY/LKP+ Test Infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation

Thanks,
Oliver Sang


Attachments:
(No filename) (36.65 kB)
config-5.12.0-rc2-00030-g08ed4efad684 (175.52 kB)
job-script (8.22 kB)
job.yaml (5.71 kB)
reproduce (421.00 B)

2021-04-08 16:24:40

by Linus Torvalds

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
>
> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")

Ouch.

I *think* this test may be testing "send so many signals that it
triggers the signal queue overflow case".

And I *think* that the performance degradation may be due to lots of
unnecessary allocations, because it looks like that commit changes
__sigqueue_alloc() to do

struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);

*before* checking the signal limit, and then if the signal limit was
exceeded, it will just be free'd instead.

The old code would check the signal count against RLIMIT_SIGPENDING
*first*, and if there were more pending signals then it wouldn't do
anything at all (including not incrementing that expensive atomic
count).

Also, the old code was very careful to only do the "get_user()" for
the *first* signal it added to the queue, and do the "put_user()" for
when removing the last signal. Exactly because those atomics are very
expensive.

The new code just does a lot of these atomics unconditionally.
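
Something along these lines (an untested sketch of a fragment of
__sigqueue_alloc(), using only the helpers the patch itself adds;
NULL-ucounts and error handling trimmed) would keep the overflow path
from ever touching the allocator:

	rcu_read_lock();
	ucounts = get_ucounts(task_ucounts(t));
	overlimit = ucounts && inc_rlimit_ucounts_and_test(ucounts,
				UCOUNT_RLIMIT_SIGPENDING, 1,
				task_rlimit(t, RLIMIT_SIGPENDING));
	rcu_read_unlock();
	if (!override_rlimit && overlimit) {
		/* over the limit: undo the count, allocate nothing */
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
		put_ucounts(ucounts);
		print_dropped_signal(sig);
		return NULL;
	}
	q = kmem_cache_alloc(sigqueue_cachep, flags);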

I dunno. The profile data in there is a bit hard to read, but there's
a lot more cache misses, and a *lot* of node crossers:

> 5961544 +190.4% 17314361 perf-stat.i.cache-misses
> 22107466 +119.2% 48457656 perf-stat.i.cache-references
> 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
> 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads

and (probably as a result) average instruction costs have gone up enormously:

> 3.47 +66.8% 5.79 perf-stat.overall.cpi
> 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses

and it does seem to be at least partly about "put_ucounts()":

> 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare

and a lot of "get_ucounts()".

But it may also be that the new "get sigpending" is just *so* much
more expensive than it used to be.

Linus

2021-04-08 16:45:23

by Alexey Gladkov

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

On Thu, Apr 08, 2021 at 09:22:40AM -0700, Linus Torvalds wrote:
> On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
> >
> > FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> > 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
>
> Ouch.
>
> I *think* this test may be testing "send so many signals that it
> triggers the signal queue overflow case".
>
> And I *think* that the performance degradation may be due to lots of
> unnecessary allocations, because it looks like that commit changes
> __sigqueue_alloc() to do
>
> struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);
>
> *before* checking the signal limit, and then if the signal limit was
> exceeded, it will just be free'd instead.
>
> The old code would check the signal count against RLIMIT_SIGPENDING
> *first*, and if there were more pending signals then it wouldn't do
> anything at all (including not incrementing that expensive atomic
> count).
>
> Also, the old code was very careful to only do the "get_user()" for
> the *first* signal it added to the queue, and do the "put_user()" for
> when removing the last signal. Exactly because those atomics are very
> expensive.
>
> The new code just does a lot of these atomics unconditionally.

Yes and right now I'm trying to rewrite this patch.

> I dunno. The profile data in there is a bit hard to read, but there's
> a lot more cache misses, and a *lot* of node crossers:
>
> > 5961544 +190.4% 17314361 perf-stat.i.cache-misses
> > 22107466 +119.2% 48457656 perf-stat.i.cache-references
> > 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
> > 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads
>
> and (probably as a result) average instruction costs have gone up enormously:
>
> > 3.47 +66.8% 5.79 perf-stat.overall.cpi
> > 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
>
> and it does seem to be at least partly about "put_ucounts()":
>
> > 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
>
> and a lot of "get_ucounts()".
>
> But it may also be that the new "get sigpending" is just *so* much
> more expensive than it used to be.

Thanks for decrypting this! I spent some time trying to understand this
report and still wasn't sure I understood it.

--
Rgrds, legion

2021-04-08 18:45:39

by Eric W. Biederman

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

Linus Torvalds <[email protected]> writes:

> On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
>>
>> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
>> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
>
> Ouch.

We were cautiously optimistic when no test problems showed up from
the last posting that there was nothing to look at here.

Unfortunately it looks like the bots just missed the last posting.

So it seems we are finally pretty much at correct code in need
of performance tuning.

> I *think* this test may be testing "send so many signals that it
> triggers the signal queue overflow case".
>
> And I *think* that the performance degradation may be due to lots of
> unnecessary allocations, because it looks like that commit changes
> __sigqueue_alloc() to do
>
> struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);
>
> *before* checking the signal limit, and then if the signal limit was
> exceeded, it will just be free'd instead.
>
> The old code would check the signal count against RLIMIT_SIGPENDING
> *first*, and if there were more pending signals then it wouldn't do
> anything at all (including not incrementing that expensive atomic
> count).

This is an interesting test in a lot of ways as it is testing the
synchronous signal delivery path caused by an exception. The test
is either executing *ptr = 0 (where ptr points to a read-only page)
or it executes an x86 instruction that is excessively long.

I have found the code but I haven't figured out how it is being
called yet. The core loop is just:
	for (;;) {
		sigaction(SIGSEGV, &action, NULL);
		sigaction(SIGILL, &action, NULL);
		sigaction(SIGBUS, &action, NULL);

		ret = sigsetjmp(jmp_env, 1);
		if (done())
			break;
		if (ret) {
			/* verify signal */
		} else {
			*ptr = 0;
		}
	}

Code like that fundamentally can not be multi-threaded. So the only way
the sigpending limit is being hit is if there are more processes running
that code simultaneously than the size of the limit.

Further it looks like stress-ng pushes RLIMIT_SIGPENDING as high as it
will go before the test starts.


> Also, the old code was very careful to only do the "get_user()" for
> the *first* signal it added to the queue, and do the "put_user()" for
> when removing the last signal. Exactly because those atomics are very
> expensive.
>
> The new code just does a lot of these atomics unconditionally.

Yes. That seems a likely culprit.

> I dunno. The profile data in there is a bit hard to read, but there's
> a lot more cache misses, and a *lot* of node crossers:
>
>> 5961544 +190.4% 17314361 perf-stat.i.cache-misses
>> 22107466 +119.2% 48457656 perf-stat.i.cache-references
>> 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
>> 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads
>
> and (probably as a result) average instruction costs have gone up enormously:
>
>> 3.47 +66.8% 5.79 perf-stat.overall.cpi
>> 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
>
> and it does seem to be at least partly about "put_ucounts()":
>
>> 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
>
> and a lot of "get_ucounts()".
>
> But it may also be that the new "get sigpending" is just *so* much
> more expensive than it used to be.

That too is possible.

That node-load-misses number does look like something is bouncing back
and forth between the nodes a lot more. So I suspect stress-ng is
running multiple copies of the sigsegv test in different processes at
once.



That really suggests cache line ping pong from get_ucounts and
incrementing sigpending.

It surprises me that obtaining the cache lines exclusively is
the dominant cost on this code path, but obtaining two cache lines
exclusively instead of one cache line exclusively is consistent
with causing the exception delivery to take nearly twice as long.

For the optimization we only care about the leaf count so with a little
care we can restore the optimization. So that is probably the thing
to do here. The fewer changes to worry about, the less likely we are
to find surprises.
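
Concretely, something like this untested sketch, where the
*_return_leaf() helpers are hypothetical variants of
inc_rlimit_ucounts()/dec_rlimit_ucounts() that report only the new
count at the leaf ucounts:

	/* Pin the ucounts only on the 0 -> 1 transition of the leaf
	 * pending count, and unpin only on 1 -> 0, the way the old
	 * code confined get_uid()/free_uid(). */
	if (inc_rlimit_ucounts_return_leaf(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1) == 1)
		get_ucounts(ucounts);
	...
	if (dec_rlimit_ucounts_return_leaf(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1) == 0)
		put_ucounts(ucounts);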



That said for this specific case there is a lot of potential room for
improvement. As this is a per-thread signal the code could update sigpending
in commit_creds and never worry about needing to pin the struct
user_struct or struct ucounts. As this is a synchronous signal we could
skip the sigpending increment, skip the signal queue entirely, and
deliver the signal to user-space immediately. The removal of all cache
ping pongs might make it worth it.

There is also Thomas Gleixner's recent optimization to cache one
sigqueue entry per task to give more predictable behavior. That
would remove the cost of the allocation.
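
The shape of that idea, roughly (field and helper names here are
illustrative, not the actual patch):

	/* Allocation side: reuse one cached struct sigqueue per task. */
	q = t->sigqueue_cache;			/* hypothetical field */
	if (q)
		t->sigqueue_cache = NULL;
	else
		q = kmem_cache_alloc(sigqueue_cachep, flags);

	/* Free side: keep one entry around instead of hitting the slab. */
	if (!t->sigqueue_cache)
		t->sigqueue_cache = q;
	else
		kmem_cache_free(sigqueue_cachep, q);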

Eric

2021-04-16 11:45:08

by Alexey Gladkov

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

On Thu, Apr 08, 2021 at 01:44:43PM -0500, Eric W. Biederman wrote:
> Linus Torvalds <[email protected]> writes:
>
> > On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
> >>
> >> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> >> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
> >
> > Ouch.
>
> We were cautiously optimistic when no test problems showed up from
> the last posting that there was nothing to look at here.
>
> Unfortunately it looks like the bots just missed the last posting.
>
> So it seems we are finally pretty much at correct code in need
> of performance tuning.
>
> > I *think* this test may be testing "send so many signals that it
> > triggers the signal queue overflow case".
> >
> > And I *think* that the performance degradation may be due to lots of
> > unnecessary allocations, because it looks like that commit changes
> > __sigqueue_alloc() to do
> >
> > struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);
> >
> > *before* checking the signal limit, and then if the signal limit was
> > exceeded, it will just be free'd instead.
> >
> > The old code would check the signal count against RLIMIT_SIGPENDING
> > *first*, and if there were more pending signals then it wouldn't do
> > anything at all (including not incrementing that expensive atomic
> > count).
>
> This is an interesting test in a lot of ways as it is testing the
> synchronous signal delivery path caused by an exception. The test
> is either executing *ptr = 0 (where ptr points to a read-only page)
> or it executes an x86 instruction that is excessively long.
>
> I have found the code but I haven't figured out how it is being
> called yet. The core loop is just:
> 	for (;;) {
> 		sigaction(SIGSEGV, &action, NULL);
> 		sigaction(SIGILL, &action, NULL);
> 		sigaction(SIGBUS, &action, NULL);
>
> 		ret = sigsetjmp(jmp_env, 1);
> 		if (done())
> 			break;
> 		if (ret) {
> 			/* verify signal */
> 		} else {
> 			*ptr = 0;
> 		}
> 	}
>
> Code like that fundamentally can not be multi-threaded. So the only way
> the sigpending limit is being hit is if there are more processes running
> that code simultaneously than the size of the limit.
>
> Further it looks like stress-ng pushes RLIMIT_SIGPENDING as high as it
> will go before the test starts.
>
>
> > Also, the old code was very careful to only do the "get_user()" for
> > the *first* signal it added to the queue, and do the "put_user()" for
> > when removing the last signal. Exactly because those atomics are very
> > expensive.
> >
> > The new code just does a lot of these atomics unconditionally.
>
> Yes. That seems a likely culprit.
>
> > I dunno. The profile data in there is a bit hard to read, but there's
> > a lot more cache misses, and a *lot* of node crossers:
> >
> >> 5961544 +190.4% 17314361 perf-stat.i.cache-misses
> >> 22107466 +119.2% 48457656 perf-stat.i.cache-references
> >> 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
> >> 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads
> >
> > and (probably as a result) average instruction costs have gone up enormously:
> >
> >> 3.47 +66.8% 5.79 perf-stat.overall.cpi
> >> 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
> >
> > and it does seem to be at least partly about "put_ucounts()":
> >
> >> 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
> >
> > and a lot of "get_ucounts()".
> >
> > But it may also be that the new "get sigpending" is just *so* much
> > more expensive than it used to be.
>
> That too is possible.
>
> That node-load-misses number does look like something is bouncing back
> and forth between the nodes a lot more. So I suspect stress-ng is
> running multiple copies of the sigsegv test in different processes at
> once.
>
>
>
> That really suggests cache line ping pong from get_ucounts and
> incrementing sigpending.
>
> It surprises me that obtaining the cache lines exclusively is
> the dominant cost on this code path, but obtaining two cache lines
> exclusively instead of one cache line exclusively is consistent
> with causing the exception delivery to take nearly twice as long.
>
> For the optimization we only care about the leaf count so with a little
> care we can restore the optimization. So that is probably the thing
> to do here. The fewer changes to worry about, the less likely we are
> to find surprises.
>
>
>
> That said for this specific case there is a lot of potential room for
> improvement. As this is a per-thread signal the code could update sigpending
> in commit_creds and never worry about needing to pin the struct
> user_struct or struct ucounts. As this is a synchronous signal we could
> skip the sigpending increment, skip the signal queue entirely, and
> deliver the signal to user-space immediately. The removal of all cache
> ping pongs might make it worth it.
>
> There is also Thomas Gleixner's recent optimization to cache one
> sigqueue entry per task to give more predictable behavior. That
> would remove the cost of the allocation.

https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/commit/?h=patchset/per-userspace-rlimit/v11.1&id=08db0c814926c6f16e08de99b2de34c8b5ff68ce

You mean something like this? I did it on top of Thomas Gleixner's
patches.

--
Rgrds, legion

2021-04-23 02:35:18

by kernel test robot

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

hi, Eric,

On Thu, Apr 08, 2021 at 01:44:43PM -0500, Eric W. Biederman wrote:
> Linus Torvalds <[email protected]> writes:
>
> > On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
> >>
> >> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> >> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
> >
> > Ouch.
>
> We were cautiously optimistic when no test problems showed up from
> the last posting that there was nothing to look at here.
>
> Unfortunately it looks like the bots just missed the last posting.

this report is based on v10. Do you have a newer version you would like the bot to test?

Please note that, unfortunately, for various reasons it is a big
challenge for us to capture each version of a patch set.

E.g. we did not observe a similar performance regression for the
v8/v9 versions of this one.

>
> So it seems we are finally pretty much at correct code in need
> of performance tuning.
>
> > I *think* this test may be testing "send so many signals that it
> > triggers the signal queue overflow case".
> >
> > And I *think* that the performance degradation may be due to lots of
> > unnecessary allocations, because it looks like that commit changes
> > __sigqueue_alloc() to do
> >
> > struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);
> >
> > *before* checking the signal limit, and then if the signal limit was
> > exceeded, it will just be free'd instead.
> >
> > The old code would check the signal count against RLIMIT_SIGPENDING
> > *first*, and if there were more pending signals then it wouldn't do
> > anything at all (including not incrementing that expensive atomic
> > count).
>
> This is an interesting test in a lot of ways as it is testing the
> synchronous signal delivery path caused by an exception. The test
> is either executing *ptr = 0 (where ptr points to a read-only page)
> or it executes an x86 instruction that is excessively long.
>
> I have found the code but I haven't figured out how it is being
> called yet. The core loop is just:
> 	for (;;) {
> 		sigaction(SIGSEGV, &action, NULL);
> 		sigaction(SIGILL, &action, NULL);
> 		sigaction(SIGBUS, &action, NULL);
>
> 		ret = sigsetjmp(jmp_env, 1);
> 		if (done())
> 			break;
> 		if (ret) {
> 			/* verify signal */
> 		} else {
> 			*ptr = 0;
> 		}
> 	}
>
> Code like that fundamentally can not be multi-threaded. So the only way
> the sigpending limit is being hit is if there are more processes running
> that code simultaneously than the size of the limit.
>
> Further it looks like stress-ng pushes RLIMIT_SIGPENDING as high as it
> will go before the test starts.
>
>
> > Also, the old code was very careful to only do the "get_user()" for
> > the *first* signal it added to the queue, and do the "put_user()" for
> > when removing the last signal. Exactly because those atomics are very
> > expensive.
> >
> > The new code just does a lot of these atomics unconditionally.
>
> Yes. That seems a likely culprit.
>
> > I dunno. The profile data in there is a bit hard to read, but there's
> > a lot more cache misses, and a *lot* of node crossers:
> >
> >> 5961544 +190.4% 17314361 perf-stat.i.cache-misses
> >> 22107466 +119.2% 48457656 perf-stat.i.cache-references
> >> 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
> >> 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads
> >
> > and (probably as a result) average instruction costs have gone up enormously:
> >
> >> 3.47 +66.8% 5.79 perf-stat.overall.cpi
> >> 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
> >
> > and it does seem to be at least partly about "put_ucounts()":
> >
> >> 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
> >
> > and a lot of "get_ucounts()".
> >
> > But it may also be that the new "get sigpending" is just *so* much
> > more expensive than it used to be.
>
> That too is possible.
>
> That node-load-misses number does look like something is bouncing back
> and forth between the nodes a lot more. So I suspect stress-ng is
> running multiple copies of the sigsegv test in different processes at
> once.
>
>
>
> That really suggests cache line ping pong from get_ucounts and
> incrementing sigpending.
>
> It surprises me that obtaining the cache lines exclusively is
> the dominant cost on this code path, but obtaining two cache lines
> exclusively instead of one cache line exclusively is consistent
> with causing the exception delivery to take nearly twice as long.
>
> For the optimization we only care about the leaf count so with a little
> care we can restore the optimization. So that is probably the thing
> to do here. The fewer changes to worry about, the less likely we are
> to find surprises.
>
>
>
> That said for this specific case there is a lot of potential room for
> improvement. As this is a per-thread signal the code could update sigpending
> in commit_creds and never worry about needing to pin the struct
> user_struct or struct ucounts. As this is a synchronous signal we could
> skip the sigpending increment, skip the signal queue entirely, and
> deliver the signal to user-space immediately. The removal of all cache
> ping pongs might make it worth it.
>
> There is also Thomas Gleixner's recent optimization to cache one
> sigqueue entry per task to give more predictable behavior. That
> would remove the cost of the allocation.
>
> Eric

2021-04-23 07:45:56

by Alexey Gladkov

Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

On Fri, Apr 23, 2021 at 10:47:22AM +0800, Oliver Sang wrote:
> hi, Eric,
>
> On Thu, Apr 08, 2021 at 01:44:43PM -0500, Eric W. Biederman wrote:
> > Linus Torvalds <[email protected]> writes:
> >
> > > On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
> > >>
> > >> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> > >> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
> > >
> > > Ouch.
> >
> > We were cautiously optimistic when no test problems showed up from
> > the last posting that there was nothing to look at here.
> >
> > Unfortunately it looks like the bots just missed the last posting.
>
> this report is based on v10. Do you have a newer version you would like the bot to test?

Yes. I posted a new version of this patch set. I would be very grateful if
you could test it.

https://lore.kernel.org/lkml/[email protected]/

> Please note that, unfortunately, for various reasons it is a big
> challenge for us to capture each version of a patch set.
>
> E.g. we did not observe a similar performance regression for the
> v8/v9 versions of this one.
>
> >
> > So it seems we are finally pretty much at correct code in need
> > of performance tuning.
> >
> > > I *think* this test may be testing "send so many signals that it
> > > triggers the signal queue overflow case".
> > >
> > > And I *think* that the performance degradation may be due to lots of
> > > unnecessary allocations, because ity looks like that commit changes
> > > __sigqueue_alloc() to do
> > >
> > > struct sigqueue *q = kmem_cache_alloc(sigqueue_cachep, flags);
> > >
> > > *before* checking the signal limit, and then if the signal limit was
> > > exceeded, it will just be free'd instead.
> > >
> > > The old code would check the signal count against RLIMIT_SIGPENDING
> > > *first*, and if there were more pending signals then it wouldn't do
> > > anything at all (including not incrementing that expensive atomic
> > > count).
> >
> > This is an interesting test in a lot of ways as it is testing the
> > synchronous signal delivery path caused by an exception. The test
> > is either executing *ptr = 0 (where ptr points to a read-only page)
> > or it executes an x86 instruction that is excessively long.
> >
> > I have found the code but I haven't figured out how it is being
> > called yet. The core loop is just:
> > 	for (;;) {
> > 		sigaction(SIGSEGV, &action, NULL);
> > 		sigaction(SIGILL, &action, NULL);
> > 		sigaction(SIGBUS, &action, NULL);
> >
> > 		ret = sigsetjmp(jmp_env, 1);
> > 		if (done())
> > 			break;
> > 		if (ret) {
> > 			/* verify signal */
> > 		} else {
> > 			*ptr = 0;
> > 		}
> > 	}
> >
> > Code like that fundamentally can not be multi-threaded. So the only way
> > the sigpending limit is being hit is if there are more processes running
> > that code simultaneously than the size of the limit.
> >
> > Further it looks like stress-ng pushes RLIMIT_SIGPENDING as high as it
> > will go before the test starts.
> >
> >
> > > Also, the old code was very careful to only do the "get_user()" for
> > > the *first* signal it added to the queue, and do the "put_user()" for
> > > when removing the last signal. Exactly because those atomics are very
> > > expensive.
> > >
> > > The new code just does a lot of these atomics unconditionally.
> >
> > Yes. That seems a likely culprit.
> >
> > > I dunno. The profile data in there is a bit hard to read, but there's
> > > a lot more cache misses, and a *lot* of node crossers:
> > >
> > >> 5961544 +190.4% 17314361 perf-stat.i.cache-misses
> > >> 22107466 +119.2% 48457656 perf-stat.i.cache-references
> > >> 163292 ± 3% +4582.0% 7645410 perf-stat.i.node-load-misses
> > >> 227388 ± 2% +3708.8% 8660824 perf-stat.i.node-loads
> > >
> > > and (probably as a result) average instruction costs have gone up enormously:
> > >
> > >> 3.47 +66.8% 5.79 perf-stat.overall.cpi
> > >> 22849 -65.6% 7866 perf-stat.overall.cycles-between-cache-misses
> > >
> > > and it does seem to be at least partly about "put_ucounts()":
> > >
> > >> 0.00 +4.5 4.46 perf-profile.calltrace.cycles-pp.put_ucounts.__sigqueue_free.get_signal.arch_do_signal_or_restart.exit_to_user_mode_prepare
> > >
> > > and a lot of "get_ucounts()".
> > >
> > > But it may also be that the new "get sigpending" is just *so* much
> > > more expensive than it used to be.
> >
> > That too is possible.
> >
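To spell out where the extra atomics come from: with the patch, every
queued signal now does roughly the following (a sketch; the counting
helper names are as in the later revisions of the series and may
differ here):

	/* per queued signal */
	q->ucounts = get_ucounts(ucounts);	/* atomic refcount on a shared line */
	inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
						/* one atomic per userns level */

	/* and the reverse on every free */
	dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
	put_ucounts(q->ucounts);
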
> > That node-load-misses number does look like something is bouncing back
> > and forth between the nodes a lot more. So I suspect stress-ng is
> > running multiple copies of the sigsegv test in different processes at
> > once.
> >
> >
> >
> > That really suggests cache line ping pong from get_ucounts and
> > incrementing sigpending.
> >
> > It surprises me that obtaining the cache lines exclusively is
> > the dominant cost on this code path, but obtaining two cache lines
> > exclusively instead of one cache line exclusively is consistent
> > with causing the exception delivery to take nearly twice as long.
> >
> > For the optimization we only care about the leaf count, so with a
> > little care we can restore it. That is probably the thing to do
> > here. The fewer changes to worry about, the less likely we are to
> > find surprises.
> >
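Something along these lines, that is, still charging the whole chain
but letting only the leaf count drive the pin/unpin decision. The
inc_rlimit_ucounts()/dec_rlimit_ucounts() helpers here are the ones a
later version of the series introduces; the names may end up differing:

	/* charge the whole ucounts chain ... */
	sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);

	/* ... but pin struct ucounts only on the leaf 0 -> 1 transition */
	if (sigpending == 1 && !get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
		return NULL;
	}
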
> >
> >
> > That said, for this specific case there is a lot of potential room
> > for improvement. As this is a per-thread signal, the code could
> > update sigpending in commit_creds() and never worry about needing
> > to pin the struct user_struct or struct ucounts. As this is a
> > synchronous signal, we could skip the sigpending increment, skip
> > the signal queue entirely, and deliver the signal to user-space
> > immediately. The removal of all the cache ping pongs might make it
> > worth it.
> >
> > There is also Thomas Gleixner's recent optimization to cache one
> > sigqueue entry per task to give more predictable behavior. That
> > would remove the cost of the allocation.
> >
> > Eric
>
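For what it is worth, the caching Eric mentions would turn the hot
path into something like the sketch below. This models the idea, not
Thomas's actual patch, and it assumes a hypothetical per-task
sigqueue_cache field that is only ever touched by current:

	/* fast path: reuse the task's cached entry, if any */
	q = READ_ONCE(current->sigqueue_cache);
	if (q)
		WRITE_ONCE(current->sigqueue_cache, NULL);
	else
		/* slow path: fall back to the slab allocator */
		q = kmem_cache_alloc(sigqueue_cachep, flags);
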

--
Rgrds, legion

2021-04-28 18:27:56

by kernel test robot

[permalink] [raw]
Subject: Re: 08ed4efad6: stress-ng.sigsegv.ops_per_sec -41.9% regression

hi, Alexey Gladkov,

On Fri, Apr 23, 2021 at 09:44:31AM +0200, Alexey Gladkov wrote:
> On Fri, Apr 23, 2021 at 10:47:22AM +0800, Oliver Sang wrote:
> > hi, Eric,
> >
> > On Thu, Apr 08, 2021 at 01:44:43PM -0500, Eric W. Biederman wrote:
> > > Linus Torvalds <[email protected]> writes:
> > >
> > > > On Thu, Apr 8, 2021 at 1:32 AM kernel test robot <[email protected]> wrote:
> > > >>
> > > >> FYI, we noticed a -41.9% regression of stress-ng.sigsegv.ops_per_sec due to commit
> > > >> 08ed4efad684 ("[PATCH v10 6/9] Reimplement RLIMIT_SIGPENDING on top of ucounts")
> > > >
> > > > Ouch.
> > >
> > > When no test problems showed up from the last posting, we were
> > > cautiously optimistic that there was nothing to look at here.
> > >
> > > Unfortunately it looks like the bots just missed the last posting.
> >
> > this report is based on v10. do you have a newer version you would like the bot to test?
>
> Yes. I posted a new version of this patch set. I would be very grateful if
> you could test it.
>
> https://lore.kernel.org/lkml/[email protected]/
>

we tested the v11 version and found the regression reduced to about 1.6%.
please note that, in our experience, stress-ng is a rather sensitive
test suite, so we normally wouldn't report a <3% regression.

=========================================================================================
class/compiler/cpufreq_governor/disk/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime/ucode:
interrupt/gcc-9/performance/1HDD/x86_64-rhel-8.3/100%/debian-10.4-x86_64-20200603.cgz/lkp-ivb-2ep1/sigsegv/stress-ng/60s/0x42e

commit:
00a58a6af1c4 ("Reimplement RLIMIT_MSGQUEUE on top of ucounts")
8932738fc10c ("Reimplement RLIMIT_SIGPENDING on top of ucounts")

00a58a6af1c473c5            8932738fc10c4398521892adfe6
----------------            ---------------------------
         %stddev      %change          %stddev
             \            |                \
       4.745e+08        -1.6%        4.669e+08   stress-ng.sigsegv.ops
         7908964        -1.6%          7781343   stress-ng.sigsegv.ops_per_sec

Below is some data from your new branch and the base.
b3ad8e1fa3fd8 ucounts: Set ucount_max to the largest positive value the type can hold 7783421.61 7794441.59 7775793.52 7773683.6 7760744.1 7757720.33
8932738fc10c4 Reimplement RLIMIT_SIGPENDING on top of ucounts 7755985.06 7780646.72 7783944.12 7809090.98 7798193.32 7760202.59
00a58a6af1c47 Reimplement RLIMIT_MSGQUEUE on top of ucounts 7940474.72 7912442.26 7879195.61 7869803.63 7912693.69 7939175.48
e75074781f173 selftests/resctrl: Change a few printed messages 7660254.5 7676124.45 7745330.79 7736754.88 7716834.93 7660143.13
87f1c20e2effd Documentation: kselftest: fix path to test module files 7729609.16 7726906.92 7760819.26
06bd03a57f8c2 selftests/resctrl: Fix MBA/MBM results reporting format 7692866.06 7730606.11 7681414.48
a38fd87484648 Linux 5.12-rc2 7724932.06

