2022-09-02 07:18:28

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy
cache bounce and overhead. Change them to percpu_counters
greatly improve the performance. Since there is one unique
ipc namespace, additional memory cost is minimal. Reading
of the count done in msgctl call, which is infrequent. So
the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0
-- system v message passing (160 threads).

Score gain: 3.38x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/ipc_namespace.h | 5 +++--
include/linux/percpu_counter.h | 9 +++++++++
ipc/msg.c | 30 +++++++++++++++++-------------
lib/percpu_counter.c | 6 ++++++
4 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>

struct user_namespace;

@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;

size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..6eec30122cc3 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -40,6 +40,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,

void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
@@ -138,6 +139,14 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}

+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ preempt_disable();
+ fbc->count += amount;
+ preempt_enable();
+}
+
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..1b498537f05e 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,6 +39,7 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>

#include <asm/current.h>
#include <linux/uaccess.h>
@@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
rcu_read_unlock();

list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, -1);
free_msg(msg);
}
- atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, -(msq->q_cbytes));
ipc_update_pid(&msq->q_lspid, NULL);
ipc_update_pid(&msq->q_lrpid, NULL);
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO) {
+ if (cmd == MSG_INFO)
msginfo->msgpool = msg_ids(ns).in_use;
- msginfo->msgmap = atomic_read(&ns->msg_hdrs);
- msginfo->msgtql = atomic_read(&ns->msg_bytes);
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+ msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
return (max_idx < 0) ? 0 : max_idx;
}

@@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
msq->q_qnum++;
- atomic_add(msgsz, &ns->msg_bytes);
- atomic_inc(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, 1);
}

err = 0;
@@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
- atomic_sub(msg->m_ts, &ns->msg_bytes);
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, -(msg->m_ts));
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, -1);
ss_wakeup(msq, &wake_q, false);

goto out_unlock0;
@@ -1303,14 +1305,16 @@ void msg_init_ns(struct ipc_namespace *ns)
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

- atomic_set(&ns->msg_bytes, 0);
- atomic_set(&ns->msg_hdrs, 0);
+ percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+ percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ percpu_counter_destroy(&ns->percpu_msg_hdrs);
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ed610b75dc32..d33cb750962a 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -72,6 +72,12 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
}
EXPORT_SYMBOL(percpu_counter_set);

+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ this_cpu_add(*fbc->counters, amount);
+}
+EXPORT_SYMBOL(percpu_counter_add_local);
+
/*
* This function is both preempt and irq safe. The former is due to explicit
* preemption disable. The latter is guaranteed by the fact that the slow path
--
2.31.1


2022-09-02 16:14:59

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

On Fri, 2 Sep 2022 23:22:43 +0800 Jiebin Sun <[email protected]> wrote:

> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counters
> greatly improve the performance. Since there is one unique
> ipc namespace, additional memory cost is minimal. Reading
> of the count done in msgctl call, which is infrequent. So
> the need to sum up the counts in each CPU is infrequent.
>
> Apply the patch and test the pts/stress-ng-1.4.0
> -- system v message passing (160 threads).
>
> Score gain: 3.38x

So this test became 3x faster?

> CPU: ICX 8380 x 2 sockets
> Core number: 40 x 2 physical cores
> Benchmark: pts/stress-ng-1.4.0
> -- system v message passing (160 threads)
>
> ...
>
> @@ -138,6 +139,14 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> preempt_enable();
> }
>
> +static inline void
> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> + preempt_disable();
> + fbc->count += amount;
> + preempt_enable();
> +}

What's this and why is it added?

It would be best to propose this as a separate preparatory patch.
Fully changelogged and perhaps even with a code comment explaining why
and when it should be used.

Thanks.

2022-09-02 16:45:21

by Shakeel Butt

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

On Fri, Sep 2, 2022 at 12:04 AM Jiebin Sun <[email protected]> wrote:
>
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counters
> greatly improve the performance. Since there is one unique
> ipc namespace, additional memory cost is minimal. Reading
> of the count done in msgctl call, which is infrequent. So
> the need to sum up the counts in each CPU is infrequent.
>
> Apply the patch and test the pts/stress-ng-1.4.0
> -- system v message passing (160 threads).
>
> Score gain: 3.38x
>
> CPU: ICX 8380 x 2 sockets
> Core number: 40 x 2 physical cores
> Benchmark: pts/stress-ng-1.4.0
> -- system v message passing (160 threads)
>
> Signed-off-by: Jiebin Sun <[email protected]>
[...]
>
> +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> + this_cpu_add(*fbc->counters, amount);
> +}
> +EXPORT_SYMBOL(percpu_counter_add_local);

Why not percpu_counter_add()? This may drift the fbc->count more than
batch*nr_cpus. I am assuming that is not the issue for you as you
always do an expensive sum in the slow path. As Andrew asked, this
should be a separate patch.

2022-09-03 20:11:35

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

Hi Jiebin,

On 9/2/22 17:22, Jiebin Sun wrote:
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counters
> greatly improve the performance. Since there is one unique
> ipc namespace, additional memory cost is minimal.

With ipc namespaces, there is one struct per namespace, correct?

The cost is probably still ok, but the change log should be correct.


> @@ -1303,14 +1305,16 @@ void msg_init_ns(struct ipc_namespace *ns)
> ns->msg_ctlmnb = MSGMNB;
> ns->msg_ctlmni = MSGMNI;
>
> - atomic_set(&ns->msg_bytes, 0);
> - atomic_set(&ns->msg_hdrs, 0);
> + percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
> + percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
> ipc_init_ids(&ns->ids[IPC_MSG_IDS]);

These calls can fail. You must add error handling.

--

    Manfred

2022-09-05 12:00:30

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v2 0/2] ipc/msg: mitigate the lock contention in ipc/msg


Hi,

Here are two patches to mitigate the lock contention in ipc/msg.

The 1st patch is to add the new function percpu_counter_add_local if
only update the local counter without aggregating to global counter.

The 2nd patch is to use percpu_counter_add_local instead of atomic
updating in do_msgsnd and do_msgrcv every time. It will always do
sum when the syscall msgctl_info. So there is no need to do global
adding in percpu_counter_add_batch. We add percpu_counter_add_local
to resolve the above issue. The sum operation in msgctl_info is
infrequent and the additional cost is much less compared to the
performance gain in do_msgsnd and do_msgrcv.

Changes in v2:
1. Separate the original patch into two patches.
2. Add error handling for percpu_counter_init.

The performance gain increases as the threads of workload become larger.
Performance gain: 3.38x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)


Regards
Jiebin

2022-09-05 12:06:04

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter


On 9/3/2022 12:06 AM, Andrew Morton wrote:
> On Fri, 2 Sep 2022 23:22:43 +0800 Jiebin Sun <[email protected]> wrote:
>
>> The msg_bytes and msg_hdrs atomic counters are frequently
>> updated when IPC msg queue is in heavy use, causing heavy
>> cache bounce and overhead. Change them to percpu_counters
>> greatly improve the performance. Since there is one unique
>> ipc namespace, additional memory cost is minimal. Reading
>> of the count done in msgctl call, which is infrequent. So
>> the need to sum up the counts in each CPU is infrequent.
>>
>> Apply the patch and test the pts/stress-ng-1.4.0
>> -- system v message passing (160 threads).
>>
>> Score gain: 3.38x
> So this test became 3x faster?

Yes. It is from the phoronix test suite stress-ng-1.4.0 -- system v message
passing with dual sockets ICX servers. In this benchmark, there are 160
pairs of threads, which do msgsnd and msgrcv. The patch benefit more as the
threads of workload increase.

>
>> CPU: ICX 8380 x 2 sockets
>> Core number: 40 x 2 physical cores
>> Benchmark: pts/stress-ng-1.4.0
>> -- system v message passing (160 threads)
>>
>> ...
>>
>> @@ -138,6 +139,14 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
>> preempt_enable();
>> }
>>
>> +static inline void
>> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
>> +{
>> + preempt_disable();
>> + fbc->count += amount;
>> + preempt_enable();
>> +}
> What's this and why is it added?
>
> It would be best to propose this as a separate preparatory patch.
> Fully changelogged and perhaps even with a code comment explaining why
> and when it should be used.
>
> Thanks.

As it will always do sum in msgctl_info, there is no need to use
percpu_counter_add_batch. It will do global updating when the counter reach
the batch size. So we add percpu_counter_add_local for smp and non_smp,
which will only do local adding to the percpu counter.
I have separate the original patch into two patches.

Thanks.

2022-09-05 12:15:34

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v2 1/2] percpu: Add percpu_counter_add_local

Add percpu_counter_add_local for only updating local counter
without aggregating to global counter.

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/percpu_counter.h | 7 +++++++
lib/percpu_counter.c | 6 ++++++
2 files changed, 13 insertions(+)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..344d69ae0fb1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -40,6 +40,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,

void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
@@ -138,6 +139,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}

+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add(fbc, amount);
+}
+
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ed610b75dc32..d33cb750962a 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -72,6 +72,12 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
}
EXPORT_SYMBOL(percpu_counter_set);

+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ this_cpu_add(*fbc->counters, amount);
+}
+EXPORT_SYMBOL(percpu_counter_add_local);
+
/*
* This function is both preempt and irq safe. The former is due to explicit
* preemption disable. The latter is guaranteed by the fact that the slow path
--
2.31.1

2022-09-05 12:26:44

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter


On 9/3/2022 12:27 AM, Shakeel Butt wrote:
> On Fri, Sep 2, 2022 at 12:04 AM Jiebin Sun <[email protected]> wrote:
>> The msg_bytes and msg_hdrs atomic counters are frequently
>> updated when IPC msg queue is in heavy use, causing heavy
>> cache bounce and overhead. Change them to percpu_counters
>> greatly improve the performance. Since there is one unique
>> ipc namespace, additional memory cost is minimal. Reading
>> of the count done in msgctl call, which is infrequent. So
>> the need to sum up the counts in each CPU is infrequent.
>>
>> Apply the patch and test the pts/stress-ng-1.4.0
>> -- system v message passing (160 threads).
>>
>> Score gain: 3.38x
>>
>> CPU: ICX 8380 x 2 sockets
>> Core number: 40 x 2 physical cores
>> Benchmark: pts/stress-ng-1.4.0
>> -- system v message passing (160 threads)
>>
>> Signed-off-by: Jiebin Sun <[email protected]>
> [...]
>> +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
>> +{
>> + this_cpu_add(*fbc->counters, amount);
>> +}
>> +EXPORT_SYMBOL(percpu_counter_add_local);
> Why not percpu_counter_add()? This may drift the fbc->count more than
> batch*nr_cpus. I am assuming that is not the issue for you as you
> always do an expensive sum in the slow path. As Andrew asked, this
> should be a separate patch.

Yes. It will always do sum in msgctl_info. So there is no need to
do global updating in percpu_counter_add when the percpu counter
reaches the batch size. We add percpu_counter_add_local in this
case. The sum in slow path is infrequent. So the additional cost
is much less compared to the atomic updating in do_msgsnd and
do_msgrcv every time. I have separate the original patch into two
patches.

Thanks.

2022-09-05 12:28:18

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter


On 9/4/2022 3:35 AM, Manfred Spraul wrote:
> Hi Jiebin,
>
> On 9/2/22 17:22, Jiebin Sun wrote:
>> The msg_bytes and msg_hdrs atomic counters are frequently
>> updated when IPC msg queue is in heavy use, causing heavy
>> cache bounce and overhead. Change them to percpu_counters
>> greatly improve the performance. Since there is one unique
>> ipc namespace, additional memory cost is minimal.
>
> With ipc namespaces, there is one struct per namespace, correct?
>
> The cost is probably still ok, but the change log should be correct.
>
Yes, that's what I want to summarize. The IPC msg namespace is unique

and there is only one percpu counter in IPC msg namespace.

Thanks.

>
>> @@ -1303,14 +1305,16 @@ void msg_init_ns(struct ipc_namespace *ns)
>>       ns->msg_ctlmnb = MSGMNB;
>>       ns->msg_ctlmni = MSGMNI;
>>   -    atomic_set(&ns->msg_bytes, 0);
>> -    atomic_set(&ns->msg_hdrs, 0);
>> +    percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
>> +    percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
>>       ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
>
> These calls can fail. You must add error handling.

I have add error handling for percpu_counter_init.

Thanks.

>
> --
>
>     Manfred
>
>

2022-09-05 19:40:54

by Shakeel Butt

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] percpu: Add percpu_counter_add_local

On Tue, Sep 06, 2022 at 03:35:16AM +0800, Jiebin Sun wrote:
> Add percpu_counter_add_local for only updating local counter
> without aggregating to global counter.

Please add why do we need this. Who should use this and who shouldn't.

>
> Signed-off-by: Jiebin Sun <[email protected]>

[...]

> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index ed610b75dc32..d33cb750962a 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -72,6 +72,12 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
> }
> EXPORT_SYMBOL(percpu_counter_set);
>

Add a doc comment here on why someone want to use this?

> +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> + this_cpu_add(*fbc->counters, amount);
> +}
> +EXPORT_SYMBOL(percpu_counter_add_local);
> +
> /*
> * This function is both preempt and irq safe. The former is due to explicit
> * preemption disable. The latter is guaranteed by the fact that the slow path
> --
> 2.31.1
>

2022-09-06 08:46:49

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v3 0/2] ipc/msg: mitigate the lock contention in ipc/msg


Hi,

Here are two patches to mitigate the lock contention in ipc/msg.

The 1st patch is to add the new function percpu_counter_add_local if only update the local counter without aggregating to global counter. This function could be used with percpu_counter_sum together if you need high accurate counter. The combination could bring obvious performance improvement than percpu_counter_add_batch if percpu_counter_add is frequently called and percpu_counter_sum is not in the critical path.

The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg.
The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent.

Changes in v3:
1. Add comment and change log for the new function percpu_counter_add_local.
Who should use it and who shouldn't.

Changes in v2:
1. Separate the original patch into two patches.
2. Add error handling for percpu_counter_init.

The performance gain increases as the threads of workload become larger.
Performance gain: 3.38x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)


Regards
Jiebin

2022-09-06 09:17:40

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] percpu: Add percpu_counter_add_local


On 9/6/2022 3:31 AM, Shakeel Butt wrote:
> On Tue, Sep 06, 2022 at 03:35:16AM +0800, Jiebin Sun wrote:
>> Add percpu_counter_add_local for only updating local counter
>> without aggregating to global counter.
> Please add why do we need this. Who should use this and who shouldn't.

Thanks. I have added the code comment and change log in patch v3 and
provided

the info who should use it and who shouldn't.

>
>> Signed-off-by: Jiebin Sun <[email protected]>
> [...]
>
>> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
>> index ed610b75dc32..d33cb750962a 100644
>> --- a/lib/percpu_counter.c
>> +++ b/lib/percpu_counter.c
>> @@ -72,6 +72,12 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
>> }
>> EXPORT_SYMBOL(percpu_counter_set);
>>
> Add a doc comment here on why someone want to use this?
>
>> +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
>> +{
>> + this_cpu_add(*fbc->counters, amount);
>> +}
>> +EXPORT_SYMBOL(percpu_counter_add_local);
>> +
>> /*
>> * This function is both preempt and irq safe. The former is due to explicit
>> * preemption disable. The latter is guaranteed by the fact that the slow path
>> --
>> 2.31.1
>>

2022-09-06 09:25:55

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v3 1/2] percpu: Add percpu_counter_add_local

Add percpu_counter_add_local for only updating local counter
without aggregating to global counter.

This function could be used with percpu_counter_sum together if
you need high accurate counter. It could bring obvious performance
improvement if percpu_counter_add is frequently called and
percpu_counter_sum is not in the critical path.

Please use percpu_counter_add_batch instead if you need the counter
timely but not accurate and the call of percpu_counter_add_batch is
not heavy.

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/percpu_counter.h | 7 +++++++
lib/percpu_counter.c | 14 ++++++++++++++
2 files changed, 21 insertions(+)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..344d69ae0fb1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -40,6 +40,7 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,

void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
@@ -138,6 +139,12 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}

+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add(fbc, amount);
+}
+
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ed610b75dc32..36907eb573a8 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -72,6 +72,20 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
}
EXPORT_SYMBOL(percpu_counter_set);

+/*
+ * Recommend to use the function combined with percpu_counter_sum if you need
+ * high accurate counter. As the percpu_counter_sum add up all the percpu
+ * counter, there is no need to check batch size and sum in percpu_counter_add.
+ * If the percpu_counter_sum is infrequent used and the percpu_counter_add
+ * is in critical path, this combination could have significant performance
+ * improvement than the function percpu_counter_add_batch.
+ */
+void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ this_cpu_add(*fbc->counters, amount);
+}
+EXPORT_SYMBOL(percpu_counter_add_local);
+
/*
* This function is both preempt and irq safe. The former is due to explicit
* preemption disable. The latter is guaranteed by the fact that the slow path
--
2.31.1

2022-09-06 09:29:09

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v3 2/2] ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy
cache bounce and overhead. Change them to percpu_counter
greatly improve the performance. Since there is one percpu
struct per namespace, additional memory cost is minimal.
Reading of the count done in msgctl call, which is infrequent.
So the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0
-- system v message passing (160 threads).

Score gain: 3.38x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/ipc_namespace.h | 5 ++--
ipc/msg.c | 44 ++++++++++++++++++++++++-----------
ipc/namespace.c | 5 +++-
ipc/util.h | 4 ++--
4 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>

struct user_namespace;

@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;

size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..87c30decb23f 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,6 +39,7 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>

#include <asm/current.h>
#include <linux/uaccess.h>
@@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
rcu_read_unlock();

list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, -1);
free_msg(msg);
}
- atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, -(msq->q_cbytes));
ipc_update_pid(&msq->q_lspid, NULL);
ipc_update_pid(&msq->q_lrpid, NULL);
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO) {
+ if (cmd == MSG_INFO)
msginfo->msgpool = msg_ids(ns).in_use;
- msginfo->msgmap = atomic_read(&ns->msg_hdrs);
- msginfo->msgtql = atomic_read(&ns->msg_bytes);
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+ msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
return (max_idx < 0) ? 0 : max_idx;
}

@@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
msq->q_qnum++;
- atomic_add(msgsz, &ns->msg_bytes);
- atomic_inc(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, 1);
}

err = 0;
@@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
- atomic_sub(msg->m_ts, &ns->msg_bytes);
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, -(msg->m_ts));
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, -1);
ss_wakeup(msq, &wake_q, false);

goto out_unlock0;
@@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
}
#endif

-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
+ int ret;
+
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

- atomic_set(&ns->msg_bytes, 0);
- atomic_set(&ns->msg_hdrs, 0);
+ ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_bytes;
+ ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_hdrs;
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return 0;
+
+ fail_msg_hdrs:
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ fail_msg_bytes:
+ return ret;
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ percpu_counter_destroy(&ns->percpu_msg_hdrs);
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index e1fcaedba4fa..8316ea585733 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (!setup_ipc_sysctls(ns))
goto fail_mq;

+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_put;
+
sem_init_ns(ns);
- msg_init_ns(ns);
shm_init_ns(ns);

return ns;
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..1b0086c6346f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }

#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
@@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;}
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
--
2.31.1

2022-09-06 18:58:40

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

On Fri, 2022-09-02 at 09:27 -0700, Shakeel Butt wrote:
> On Fri, Sep 2, 2022 at 12:04 AM Jiebin Sun <[email protected]> wrote:
> > The msg_bytes and msg_hdrs atomic counters are frequently
> > updated when IPC msg queue is in heavy use, causing heavy
> > cache bounce and overhead. Change them to percpu_counters
> > greatly improve the performance. Since there is one unique
> > ipc namespace, additional memory cost is minimal. Reading
> > of the count done in msgctl call, which is infrequent. So
> > the need to sum up the counts in each CPU is infrequent.
> >
> > Apply the patch and test the pts/stress-ng-1.4.0
> > -- system v message passing (160 threads).
> >
> > Score gain: 3.38x
> >
> > CPU: ICX 8380 x 2 sockets
> > Core number: 40 x 2 physical cores
> > Benchmark: pts/stress-ng-1.4.0
> > -- system v message passing (160 threads)
> >
> > Signed-off-by: Jiebin Sun <[email protected]>
> [...]
> > +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> > +{
> > + this_cpu_add(*fbc->counters, amount);
> > +}
> > +EXPORT_SYMBOL(percpu_counter_add_local);
>
> Why not percpu_counter_add()? This may drift the fbc->count more than
> batch*nr_cpus. I am assuming that is not the issue for you as you
> always do an expensive sum in the slow path. As Andrew asked, this
> should be a separate patch.

In the IPC case, the read is always done with the accurate read using
percpu_counter_sum() gathering all the counts and
never with percpu_counter_read() that only read global count.
So Jiebin was not worry about accuracy.

However, the counter is s64 and the local per cpu counter is S32.
So the counter size has shrunk if we only keep the count in local per
cpu counter, which can overflow a lot sooner and is not okay.

Jiebin, can you try to use percpu_counter_add_batch, but using a large
batch size. That should achieve what you want without needing
to create a percpu_counter_add_local() function, and also the overflow
problem.

Tim


2022-09-07 09:23:13

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy
cache bounce and overhead. Change them to percpu_counter
greatly improve the performance. Since there is one percpu
struct per namespace, additional memory cost is minimal.
Reading of the count done in msgctl call, which is infrequent.
So the need to sum up the counts in each CPU is infrequent.


Apply the patch and test the pts/stress-ng-1.4.0
-- system v message passing (160 threads).

Score gain: 3.17x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/ipc_namespace.h | 5 ++--
ipc/msg.c | 47 ++++++++++++++++++++++++-----------
ipc/namespace.c | 5 +++-
ipc/util.h | 4 +--
4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>

struct user_namespace;

@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;

size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..040cfc93d7ef 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,11 +39,15 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>

#include <asm/current.h>
#include <linux/uaccess.h>
#include "util.h"

+/* large batch size could reduce the times to sum up percpu counter */
+#define MSG_PERCPU_COUNTER_BATCH 1024
+
/* one msq_queue structure for each present queue on the system */
struct msg_queue {
struct kern_ipc_perm q_perm;
@@ -285,10 +289,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
rcu_read_unlock();

list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_batch(&ns->percpu_msg_hdrs, -1, MSG_PERCPU_COUNTER_BATCH);
free_msg(msg);
}
- atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ percpu_counter_add_batch(&ns->percpu_msg_bytes, -(msq->q_cbytes), MSG_PERCPU_COUNTER_BATCH);
ipc_update_pid(&msq->q_lspid, NULL);
ipc_update_pid(&msq->q_lrpid, NULL);
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +499,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO) {
+ if (cmd == MSG_INFO)
msginfo->msgpool = msg_ids(ns).in_use;
- msginfo->msgmap = atomic_read(&ns->msg_hdrs);
- msginfo->msgtql = atomic_read(&ns->msg_bytes);
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+ msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
return (max_idx < 0) ? 0 : max_idx;
}

@@ -935,8 +940,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
msq->q_qnum++;
- atomic_add(msgsz, &ns->msg_bytes);
- atomic_inc(&ns->msg_hdrs);
+ percpu_counter_add_batch(&ns->percpu_msg_bytes, msgsz, MSG_PERCPU_COUNTER_BATCH);
+ percpu_counter_add_batch(&ns->percpu_msg_hdrs, 1, MSG_PERCPU_COUNTER_BATCH);
}

err = 0;
@@ -1159,8 +1164,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
- atomic_sub(msg->m_ts, &ns->msg_bytes);
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_add_batch(&ns->percpu_msg_bytes, -(msg->m_ts), MSG_PERCPU_COUNTER_BATCH);
+ percpu_counter_add_batch(&ns->percpu_msg_hdrs, -1, MSG_PERCPU_COUNTER_BATCH);
ss_wakeup(msq, &wake_q, false);

goto out_unlock0;
@@ -1297,20 +1302,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
}
#endif

-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
+ int ret;
+
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

- atomic_set(&ns->msg_bytes, 0);
- atomic_set(&ns->msg_hdrs, 0);
+ ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_bytes;
+ ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_hdrs;
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return 0;
+
+ fail_msg_hdrs:
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ fail_msg_bytes:
+ return ret;
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ percpu_counter_destroy(&ns->percpu_msg_hdrs);
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index e1fcaedba4fa..8316ea585733 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (!setup_ipc_sysctls(ns))
goto fail_mq;

+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_put;
+
sem_init_ns(ns);
- msg_init_ns(ns);
shm_init_ns(ns);

return ns;
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..1b0086c6346f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }

#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
@@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;}
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
--
2.31.1

2022-09-07 10:49:17

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter


On 9/7/2022 2:44 AM, Tim Chen wrote:
> On Fri, 2022-09-02 at 09:27 -0700, Shakeel Butt wrote:
>> On Fri, Sep 2, 2022 at 12:04 AM Jiebin Sun <[email protected]> wrote:
>>> The msg_bytes and msg_hdrs atomic counters are frequently
>>> updated when IPC msg queue is in heavy use, causing heavy
>>> cache bounce and overhead. Change them to percpu_counters
>>> greatly improve the performance. Since there is one unique
>>> ipc namespace, additional memory cost is minimal. Reading
>>> of the count done in msgctl call, which is infrequent. So
>>> the need to sum up the counts in each CPU is infrequent.
>>>
>>> Apply the patch and test the pts/stress-ng-1.4.0
>>> -- system v message passing (160 threads).
>>>
>>> Score gain: 3.38x
>>>
>>> CPU: ICX 8380 x 2 sockets
>>> Core number: 40 x 2 physical cores
>>> Benchmark: pts/stress-ng-1.4.0
>>> -- system v message passing (160 threads)
>>>
>>> Signed-off-by: Jiebin Sun <[email protected]>
>> [...]
>>> +void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
>>> +{
>>> + this_cpu_add(*fbc->counters, amount);
>>> +}
>>> +EXPORT_SYMBOL(percpu_counter_add_local);
>> Why not percpu_counter_add()? This may drift the fbc->count more than
>> batch*nr_cpus. I am assuming that is not the issue for you as you
>> always do an expensive sum in the slow path. As Andrew asked, this
>> should be a separate patch.
> In the IPC case, the read is always done with the accurate read using
> percpu_counter_sum() gathering all the counts and
> never with percpu_counter_read() that only read global count.
> So Jiebin was not worry about accuracy.
>
> However, the counter is s64 and the local per cpu counter is S32.
> So the counter size has shrunk if we only keep the count in local per
> cpu counter, which can overflow a lot sooner and is not okay.
>
> Jiebin, can you try to use percpu_counter_add_batch, but using a large
> batch size. That should achieve what you want without needing
> to create a percpu_counter_add_local() function, and also the overflow
> problem.
>
> Tim
>
I have sent out the patch v4 which use percpu_counter_add_batch. If we use
a tuned large batch size (1024), the performance gain is 3.17x (patch v4)
vs 3.38x (patch v3) previously in stress-ng -- message. It still has
significant performance improvement and also good balance between
performance gain and overflow issue.

Jiebin

>

2022-09-07 16:22:35

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

On Thu, 2022-09-08 at 01:25 +0800, Jiebin Sun wrote:
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counter
> greatly improve the performance. Since there is one percpu
> struct per namespace, additional memory cost is minimal.
> Reading of the count done in msgctl call, which is infrequent.
> So the need to sum up the counts in each CPU is infrequent.
>
>
> Apply the patch and test the pts/stress-ng-1.4.0
> -- system v message passing (160 threads).
>
> Score gain: 3.17x
>
>
...
>
> +/* large batch size could reduce the times to sum up percpu counter */
> +#define MSG_PERCPU_COUNTER_BATCH 1024
> +

Jiebin,

1024 is a small size (1/4 page).
The local per cpu counter could overflow to the gloabal count quickly
if it is limited to this size, since our count tracks msg size.

I'll suggest something larger, say 8*1024*1024, about
8MB to accommodate about 2 large page worth of data. Maybe that
will further improve throughput on stress-ng by reducing contention
on adding to the global count.

Tim


2022-09-07 21:00:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] ipc/msg.c: mitigate the lock contention with percpu counter

On Wed, 7 Sep 2022 17:39:47 +0800 "Sun, Jiebin" <[email protected]> wrote:

> I have sent out the patch v4 which use percpu_counter_add_batch. If we use
> a tuned large batch size (1024),

Oh. Why not simply use a batch size of INT_MAX?

> the performance gain is 3.17x (patch v4)
> vs 3.38x (patch v3) previously in stress-ng -- message. It still has
> significant performance improvement and also good balance between
> performance gain and overflow issue.

2022-09-07 22:18:03

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

On Wed, 07 Sep 2022 09:01:53 -0700 Tim Chen <[email protected]> wrote:

> On Thu, 2022-09-08 at 01:25 +0800, Jiebin Sun wrote:
> > The msg_bytes and msg_hdrs atomic counters are frequently
> > updated when IPC msg queue is in heavy use, causing heavy
> > cache bounce and overhead. Change them to percpu_counter
> > greatly improve the performance. Since there is one percpu
> > struct per namespace, additional memory cost is minimal.
> > Reading of the count done in msgctl call, which is infrequent.
> > So the need to sum up the counts in each CPU is infrequent.
> >
> >
> > Apply the patch and test the pts/stress-ng-1.4.0
> > -- system v message passing (160 threads).
> >
> > Score gain: 3.17x
> >
> >
> ...
> >
> > +/* large batch size could reduce the times to sum up percpu counter */
> > +#define MSG_PERCPU_COUNTER_BATCH 1024
> > +
>
> Jiebin,
>
> 1024 is a small size (1/4 page).
> The local per cpu counter could overflow to the gloabal count quickly
> if it is limited to this size, since our count tracks msg size.
>
> I'll suggest something larger, say 8*1024*1024, about
> 8MB to accommodate about 2 large page worth of data. Maybe that
> will further improve throughput on stress-ng by reducing contention
> on adding to the global count.
>

I think this concept of a percpu_counter_add() which is massively
biased to the write side and with very rare reading is a legitimate
use-case. Perhaps it should become an addition to the formal interface.
Something like

/*
* comment goes here
*/
static inline void percpu_counter_add_local(struct percpu_counter *fbc,
s64 amount)
{
percpu_counter_add_batch(fbc, amount, INT_MAX);
}

and percpu_counter_sub_local(), I guess.

The only instance I can see is
block/blk-cgroup-rwstat.h:blkg_rwstat_add() which is using INT_MAX/2
because it always uses percpu_counter_sum_positive() on the read side.

But that makes two!

2022-09-07 22:41:56

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

On Wed, 2022-09-07 at 14:34 -0700, Andrew Morton wrote:
>
> I think this concept of a percpu_counter_add() which is massively
> biased to the write side and with very rare reading is a legitimate
> use-case. Perhaps it should become an addition to the formal interface.
> Something like
>
> /*
> * comment goes here
> */
> static inline void percpu_counter_add_local(struct percpu_counter *fbc,
> s64 amount)
> {
> percpu_counter_add_batch(fbc, amount, INT_MAX);
> }
>
> and percpu_counter_sub_local(), I guess.
>
> The only instance I can see is
> block/blk-cgroup-rwstat.h:blkg_rwstat_add() which is using INT_MAX/2
> because it always uses percpu_counter_sum_positive() on the read side.
>
> But that makes two!

Sure. We can create this function and use it for both cases. No objections.

Tim

2022-09-08 09:22:13

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter


On 9/8/2022 5:34 AM, Andrew Morton wrote:
> On Wed, 07 Sep 2022 09:01:53 -0700 Tim Chen <[email protected]> wrote:
>
>> On Thu, 2022-09-08 at 01:25 +0800, Jiebin Sun wrote:
>>> The msg_bytes and msg_hdrs atomic counters are frequently
>>> updated when IPC msg queue is in heavy use, causing heavy
>>> cache bounce and overhead. Change them to percpu_counter
>>> greatly improve the performance. Since there is one percpu
>>> struct per namespace, additional memory cost is minimal.
>>> Reading of the count done in msgctl call, which is infrequent.
>>> So the need to sum up the counts in each CPU is infrequent.
>>>
>>>
>>> Apply the patch and test the pts/stress-ng-1.4.0
>>> -- system v message passing (160 threads).
>>>
>>> Score gain: 3.17x
>>>
>>>
>> ...
>>>
>>> +/* large batch size could reduce the times to sum up percpu counter */
>>> +#define MSG_PERCPU_COUNTER_BATCH 1024
>>> +
>> Jiebin,
>>
>> 1024 is a small size (1/4 page).
>> The local per cpu counter could overflow to the gloabal count quickly
>> if it is limited to this size, since our count tracks msg size.
>>
>> I'll suggest something larger, say 8*1024*1024, about
>> 8MB to accommodate about 2 large page worth of data. Maybe that
>> will further improve throughput on stress-ng by reducing contention
>> on adding to the global count.
>>
> I think this concept of a percpu_counter_add() which is massively
> biased to the write side and with very rare reading is a legitimate
> use-case. Perhaps it should become an addition to the formal interface.
> Something like
>
> /*
> * comment goes here
> */
> static inline void percpu_counter_add_local(struct percpu_counter *fbc,
> s64 amount)
> {
> percpu_counter_add_batch(fbc, amount, INT_MAX);
> }
>
> and percpu_counter_sub_local(), I guess.
>
> The only instance I can see is
> block/blk-cgroup-rwstat.h:blkg_rwstat_add() which is using INT_MAX/2
> because it always uses percpu_counter_sum_positive() on the read side.
>
> But that makes two!


Yes. Using INT_MAX or INT_MAX/2 could have a big improvement on the
performance if heavy writing but rare reading. In our case, if the local
percpu counter is near to INT_MAX and there comes a big msgsz, the
overflow issue could happen. So I think INT_MAX/2, which is used in
blkg_rwstat_add(), might be a better choice. /$
percpu_counter_add_batch(&ns->percpu_msg_bytes, msgsz, batch); /I will
send the performance data and draft patch out for discussing.//Jiebin//

2022-09-08 16:28:23

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

On Thu, 8 Sep 2022 16:25:47 +0800 "Sun, Jiebin" <[email protected]> wrote:

> In our case, if the local
> percpu counter is near to INT_MAX and there comes a big msgsz, the
> overflow issue could happen.

percpu_counter_add_batch() handles this - your big message
won't overflow an s64.


Lookng at percpu_counter_add_batch(), is this tweak right?

- don't need to update *fbc->counters inside the lock
- that __this_cpu_sub() is an obscure way of zeroing the thing

--- a/lib/percpu_counter.c~a
+++ a/lib/percpu_counter.c
@@ -89,8 +89,8 @@ void percpu_counter_add_batch(struct per
unsigned long flags;
raw_spin_lock_irqsave(&fbc->lock, flags);
fbc->count += count;
- __this_cpu_sub(*fbc->counters, count - amount);
raw_spin_unlock_irqrestore(&fbc->lock, flags);
+ __this_cpu_write(*fbc->counters, 0);
} else {
this_cpu_add(*fbc->counters, amount);
}
_

2022-09-08 16:45:05

by Dennis Zhou

[permalink] [raw]
Subject: Re: [PATCH v4] ipc/msg: mitigate the lock contention with percpu counter

Hello,

On Thu, Sep 08, 2022 at 08:38:59AM -0700, Andrew Morton wrote:
> On Thu, 8 Sep 2022 16:25:47 +0800 "Sun, Jiebin" <[email protected]> wrote:
>
> > In our case, if the local
> > percpu counter is near to INT_MAX and there comes a big msgsz, the
> > overflow issue could happen.
>
> percpu_counter_add_batch() handles this - your big message
> won't overflow an s64.
>
>
> Lookng at percpu_counter_add_batch(), is this tweak right?
>
> - don't need to update *fbc->counters inside the lock
> - that __this_cpu_sub() is an obscure way of zeroing the thing
>
> --- a/lib/percpu_counter.c~a
> +++ a/lib/percpu_counter.c
> @@ -89,8 +89,8 @@ void percpu_counter_add_batch(struct per
> unsigned long flags;
> raw_spin_lock_irqsave(&fbc->lock, flags);
> fbc->count += count;
> - __this_cpu_sub(*fbc->counters, count - amount);
> raw_spin_unlock_irqrestore(&fbc->lock, flags);
> + __this_cpu_write(*fbc->counters, 0);

I don't think this is irq safe. It'd be best to leave it inside the
spinlock as then we can use __this_cpu_write() to 0 in there.

> } else {
> this_cpu_add(*fbc->counters, amount);
> }
> _
>

Thanks,
Dennis

2022-09-09 12:50:31

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v5 0/2] ipc/msg: mitigate the lock contention in ipc/msg


Hi,

Here are two patches to mitigate the lock contention in ipc/msg.

The 1st patch is to add the new interface percpu_counter_add_local and
percpu_counter_sub_local. The batch size in percpu_counter_add_batch should
be very large in heavy writing and rare reading case. Add the "_local"
version, and mostly it will do local adding, reduce the global updating and
mitigate lock contention in writing.

The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg.
The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC
msg queue is in heavy use, causing heavy cache bounce and overhead. Change
them to percpu_counter greatly improve the performance. Since there is one
percpu struct per namespace, additional memory cost is minimal. Reading of
the count done in msgctl call, which is infrequent. So the need to sum up
the counts in each CPU is infrequent.

Changes in v5:
1. Use INT_MAX as the large batch size in percpu_counter_local_add and
percpu_counter_sub_local.
2. Use the latest kernel 6.0-rc4 as the baseline for performance test.
3. Move the percpu_counter_local_add and percpu_counter_sub_local from
percpu_counter.c to percpu_counter.h.

Changes in v3:
1. Add comment and change log for the new function percpu_counter_add_local.
Who should use it and who shouldn't.

Changes in v2:
1. Separate the original patch into two patches.
2. Add error handling for percpu_counter_init.

The performance gain increases as the threads of workload become larger.
Performance gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)


Regards
Jiebin

2022-09-09 12:50:53

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v5 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local

The batch size in percpu_counter_add_batch should be very large
in heavy writing and rare reading case. Add the "_local" version,
and mostly it will do local adding, reduce the global updating
and mitigate lock contention in writing.

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/percpu_counter.h | 38 ++++++++++++++++++++++++++++++++++
1 file changed, 38 insertions(+)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 01861eebed79..6dd7eaba8527 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -15,6 +15,9 @@
#include <linux/types.h>
#include <linux/gfp.h>

+/* percpu_counter batch for local add or sub */
+#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
+
#ifdef CONFIG_SMP

struct percpu_counter {
@@ -56,6 +59,27 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

+/*
+ * Use this function in heavy writing but rare reading case. The large
+ * batch size will reduce the global updating.
+ */
+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
+}
+
+/*
+ * Similar with percpu_counter_add_local, use it in heavy writing but
+ * rare reading case. The large batch size will reduce the global
+ * updating.
+ */
+static inline void
+percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add_batch(fbc, -amount, PERCPU_COUNTER_LOCAL_BATCH);
+}
+
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
s64 ret = __percpu_counter_sum(fbc);
@@ -138,6 +162,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
preempt_enable();
}

+/* no smp percpu_counter_add_local is the same with percpu_counter_add */
+static inline void
+percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_add(fbc, amount);
+}
+
+/* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
+static inline void
+percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
+{
+ percpu_counter_sub(fbc, amount);
+}
+
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
--
2.31.1

2022-09-09 13:04:37

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v5 2/2] ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy
cache bounce and overhead. Change them to percpu_counter
greatly improve the performance. Since there is one percpu
struct per namespace, additional memory cost is minimal.
Reading of the count done in msgctl call, which is infrequent.
So the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0
-- system v message passing (160 threads).

Score gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <[email protected]>
---
include/linux/ipc_namespace.h | 5 ++--
ipc/msg.c | 44 ++++++++++++++++++++++++-----------
ipc/namespace.c | 5 +++-
ipc/util.h | 4 ++--
4 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>

struct user_namespace;

@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;

size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..f2bb4c193ecf 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,6 +39,7 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>

#include <asm/current.h>
#include <linux/uaccess.h>
@@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
rcu_read_unlock();

list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
free_msg(msg);
}
- atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes);
ipc_update_pid(&msq->q_lspid, NULL);
ipc_update_pid(&msq->q_lrpid, NULL);
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO) {
+ if (cmd == MSG_INFO)
msginfo->msgpool = msg_ids(ns).in_use;
- msginfo->msgmap = atomic_read(&ns->msg_hdrs);
- msginfo->msgtql = atomic_read(&ns->msg_bytes);
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+ msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
return (max_idx < 0) ? 0 : max_idx;
}

@@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
msq->q_qnum++;
- atomic_add(msgsz, &ns->msg_bytes);
- atomic_inc(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, 1);
}

err = 0;
@@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
- atomic_sub(msg->m_ts, &ns->msg_bytes);
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts);
+ percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
ss_wakeup(msq, &wake_q, false);

goto out_unlock0;
@@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
}
#endif

-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
+ int ret;
+
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

- atomic_set(&ns->msg_bytes, 0);
- atomic_set(&ns->msg_hdrs, 0);
+ ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_bytes;
+ ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_hdrs;
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return 0;
+
+ fail_msg_hdrs:
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ fail_msg_bytes:
+ return ret;
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ percpu_counter_destroy(&ns->percpu_msg_hdrs);
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index e1fcaedba4fa..8316ea585733 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (!setup_ipc_sysctls(ns))
goto fail_mq;

+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_put;
+
sem_init_ns(ns);
- msg_init_ns(ns);
shm_init_ns(ns);

return ns;
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..1b0086c6346f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }

#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
@@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;}
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
--
2.31.1

2022-09-09 16:46:51

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v5 2/2] ipc/msg: mitigate the lock contention with percpu counter

On Sat, 2022-09-10 at 04:36 +0800, Jiebin Sun wrote:
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counter
> greatly improve the performance. Since there is one percpu
> struct per namespace, additional memory cost is minimal.
> Reading of the count done in msgctl call, which is infrequent.
> So the need to sum up the counts in each CPU is infrequent.
>
> Apply the patch and test the pts/stress-ng-1.4.0
> -- system v message passing (160 threads).
>
> Score gain: 3.99x
>
> CPU: ICX 8380 x 2 sockets
> Core number: 40 x 2 physical cores
> Benchmark: pts/stress-ng-1.4.0
> -- system v message passing (160 threads)

Reviewed-by: Tim Chen <[email protected]>

>
> Signed-off-by: Jiebin Sun <[email protected]>
> ---
>

2022-09-09 16:50:57

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local

On Sat, 2022-09-10 at 04:36 +0800, Jiebin Sun wrote:
> The batch size in percpu_counter_add_batch should be very large
> in heavy writing and rare reading case. Add the "_local" version,
> and mostly it will do local adding, reduce the global updating
> and mitigate lock contention in writing.
>
> Signed-off-by: Jiebin Sun <[email protected]>
> ---
> include/linux/percpu_counter.h | 38 ++++++++++++++++++++++++++++++++++
> 1 file changed, 38 insertions(+)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 01861eebed79..6dd7eaba8527 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -15,6 +15,9 @@
> #include <linux/types.h>
> #include <linux/gfp.h>
>
> +/* percpu_counter batch for local add or sub */
> +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX
> +
> #ifdef CONFIG_SMP
>
> struct percpu_counter {
> @@ -56,6 +59,27 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
> }
>
> +/*
> + * Use this function in heavy writing but rare reading case. The large
> + * batch size will reduce the global updating.

Suggest revising the comment, so it is clear we need to use
percpu_counter_sum() to access the counter:

With percpu_counter_add_local() and percpu_counter_sub_local(),
counts are accumulated in local per cpu counter and not in
fbc->count until local count overflows PERCPU_COUNTER_LOCAL_BATCH.
This makes counter write efficient.

But percpu_counter_sum(), instead of percpu_counter_read(),
needs to be used to add up the counts
from each CPU to account for all the local counts.
So percpu_counter_add_local() and percpu_counter_sub_local()
should be used when a counter is updated frequently and read
rarely.


> + */
> +static inline void
> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
> +}
> +
> +/*
> + * Similar with percpu_counter_add_local, use it in heavy writing but
> + * rare reading case. The large batch size will reduce the global
> + * updating.
> + */
> +static inline void
> +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
> +{
> + percpu_counter_add_batch(fbc, -amount, PERCPU_COUNTER_LOCAL_BATCH);
> +}
> +
> static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
> {
> s64 ret = __percpu_counter_sum(fbc);
> @@ -138,6 +162,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
> preempt_enable();
> }
>
> +/* no smp percpu_counter_add_local is the same with percpu_counter_add */
> +static inline void
> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> + percpu_counter_add(fbc, amount);
> +}
> +
> +/* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
> +static inline void
> +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
> +{
> + percpu_counter_sub(fbc, amount);
> +}
> +
> static inline void
> percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
> {

2022-09-10 01:57:51

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local

Hi Jiebin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc4 next-20220909]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: x86_64-randconfig-a013 (https://download.01.org/0day-ci/archive/20220910/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-5) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/44e7288c01b9b125c7a5f97591ca26ffd90e3385
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
git checkout 44e7288c01b9b125c7a5f97591ca26ffd90e3385
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=x86_64 prepare

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All error/warnings (new ones prefixed by >>):

In file included from include/linux/sched/user.h:7,
from include/linux/cred.h:17,
from include/linux/sched/signal.h:10,
from include/linux/rcuwait.h:6,
from include/linux/percpu-rwsem.h:7,
from include/linux/fs.h:33,
from include/linux/cgroup.h:17,
from include/linux/memcontrol.h:13,
from include/linux/swap.h:9,
from include/linux/suspend.h:5,
from arch/x86/kernel/asm-offsets.c:13:
include/linux/percpu_counter.h: In function 'percpu_counter_sub_local':
>> include/linux/percpu_counter.h:176:9: error: implicit declaration of function 'percpu_counter_sub'; did you mean 'percpu_counter_set'? [-Werror=implicit-function-declaration]
176 | percpu_counter_sub(fbc, amount);
| ^~~~~~~~~~~~~~~~~~
| percpu_counter_set
include/linux/percpu_counter.h: At top level:
>> include/linux/percpu_counter.h:229:20: warning: conflicting types for 'percpu_counter_sub'; have 'void(struct percpu_counter *, s64)' {aka 'void(struct percpu_counter *, long long int)'}
229 | static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
| ^~~~~~~~~~~~~~~~~~
>> include/linux/percpu_counter.h:229:20: error: static declaration of 'percpu_counter_sub' follows non-static declaration
include/linux/percpu_counter.h:176:9: note: previous implicit declaration of 'percpu_counter_sub' with type 'void(struct percpu_counter *, s64)' {aka 'void(struct percpu_counter *, long long int)'}
176 | percpu_counter_sub(fbc, amount);
| ^~~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
make[2]: *** [scripts/Makefile.build:117: arch/x86/kernel/asm-offsets.s] Error 1
make[2]: Target '__build' not remade because of errors.
make[1]: *** [Makefile:1206: prepare0] Error 2
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:222: __sub-make] Error 2
make: Target 'prepare' not remade because of errors.


vim +176 include/linux/percpu_counter.h

171
172 /* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
173 static inline void
174 percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
175 {
> 176 percpu_counter_sub(fbc, amount);
177 }
178
179 static inline void
180 percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
181 {
182 percpu_counter_add(fbc, amount);
183 }
184
185 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
186 {
187 return fbc->count;
188 }
189
190 /*
191 * percpu_counter is intended to track positive numbers. In the UP case the
192 * number should never be negative.
193 */
194 static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
195 {
196 return fbc->count;
197 }
198
199 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
200 {
201 return percpu_counter_read_positive(fbc);
202 }
203
204 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
205 {
206 return percpu_counter_read(fbc);
207 }
208
209 static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
210 {
211 return true;
212 }
213
214 static inline void percpu_counter_sync(struct percpu_counter *fbc)
215 {
216 }
217 #endif /* CONFIG_SMP */
218
219 static inline void percpu_counter_inc(struct percpu_counter *fbc)
220 {
221 percpu_counter_add(fbc, 1);
222 }
223
224 static inline void percpu_counter_dec(struct percpu_counter *fbc)
225 {
226 percpu_counter_add(fbc, -1);
227 }
228
> 229 static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
230 {
231 percpu_counter_add(fbc, -amount);
232 }
233

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-09-10 08:39:56

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local

Hi Jiebin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc4 next-20220909]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: x86_64-randconfig-a012 (https://download.01.org/0day-ci/archive/20220910/[email protected]/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/44e7288c01b9b125c7a5f97591ca26ffd90e3385
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
git checkout 44e7288c01b9b125c7a5f97591ca26ffd90e3385
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 prepare

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

In file included from arch/x86/kernel/asm-offsets.c:13:
In file included from include/linux/suspend.h:5:
In file included from include/linux/swap.h:9:
In file included from include/linux/memcontrol.h:13:
In file included from include/linux/cgroup.h:17:
In file included from include/linux/fs.h:33:
In file included from include/linux/percpu-rwsem.h:7:
In file included from include/linux/rcuwait.h:6:
In file included from include/linux/sched/signal.h:10:
In file included from include/linux/cred.h:17:
In file included from include/linux/sched/user.h:7:
>> include/linux/percpu_counter.h:176:2: error: implicit declaration of function 'percpu_counter_sub' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
percpu_counter_sub(fbc, amount);
^
include/linux/percpu_counter.h:176:2: note: did you mean 'percpu_counter_set'?
include/linux/percpu_counter.h:136:20: note: 'percpu_counter_set' declared here
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
^
include/linux/percpu_counter.h:229:20: error: static declaration of 'percpu_counter_sub' follows non-static declaration
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
^
include/linux/percpu_counter.h:176:2: note: previous implicit declaration is here
percpu_counter_sub(fbc, amount);
^
2 errors generated.
make[2]: *** [scripts/Makefile.build:117: arch/x86/kernel/asm-offsets.s] Error 1
make[2]: Target '__build' not remade because of errors.
make[1]: *** [Makefile:1206: prepare0] Error 2
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:222: __sub-make] Error 2
make: Target 'prepare' not remade because of errors.


vim +/percpu_counter_sub +176 include/linux/percpu_counter.h

171
172 /* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
173 static inline void
174 percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
175 {
> 176 percpu_counter_sub(fbc, amount);
177 }
178

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-09-10 08:54:01

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local

Hi Jiebin,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc4 next-20220909]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: hexagon-randconfig-r041-20220909 (https://download.01.org/0day-ci/archive/20220910/[email protected]/config)
compiler: clang version 16.0.0 (https://github.com/llvm/llvm-project 1546df49f5a6d09df78f569e4137ddb365a3e827)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/44e7288c01b9b125c7a5f97591ca26ffd90e3385
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Jiebin-Sun/percpu-Add-percpu_counter_add_local-and-percpu_counter_sub_local/20220910-053730
git checkout 44e7288c01b9b125c7a5f97591ca26ffd90e3385
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon prepare

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <[email protected]>

All errors (new ones prefixed by >>):

In file included from arch/hexagon/kernel/asm-offsets.c:12:
In file included from include/linux/compat.h:17:
In file included from include/linux/fs.h:33:
In file included from include/linux/percpu-rwsem.h:7:
In file included from include/linux/rcuwait.h:6:
In file included from include/linux/sched/signal.h:10:
In file included from include/linux/cred.h:17:
In file included from include/linux/sched/user.h:7:
>> include/linux/percpu_counter.h:176:2: error: call to undeclared function 'percpu_counter_sub'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
percpu_counter_sub(fbc, amount);
^
include/linux/percpu_counter.h:176:2: note: did you mean 'percpu_counter_set'?
include/linux/percpu_counter.h:136:20: note: 'percpu_counter_set' declared here
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
^
include/linux/percpu_counter.h:229:20: error: static declaration of 'percpu_counter_sub' follows non-static declaration
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
^
include/linux/percpu_counter.h:176:2: note: previous implicit declaration is here
percpu_counter_sub(fbc, amount);
^
2 errors generated.
make[2]: *** [scripts/Makefile.build:117: arch/hexagon/kernel/asm-offsets.s] Error 1
make[2]: Target '__build' not remade because of errors.
make[1]: *** [Makefile:1206: prepare0] Error 2
make[1]: Target 'prepare' not remade because of errors.
make: *** [Makefile:222: __sub-make] Error 2
make: Target 'prepare' not remade because of errors.


vim +/percpu_counter_sub +176 include/linux/percpu_counter.h

171
172 /* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
173 static inline void
174 percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
175 {
> 176 percpu_counter_sub(fbc, amount);
177 }
178

--
0-DAY CI Kernel Test Service
https://01.org/lkp

2022-09-13 12:14:59

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v6 0/2] ipc/msg: mitigate the lock contention in ipc/msg

Hi,

Here are two patches to mitigate the lock contention in ipc/msg.

The 1st patch is to add the new interface percpu_counter_add_local and
percpu_counter_sub_local. The batch size in percpu_counter_add_batch
should be very large in heavy writing and rare reading case. Add the
"_local" version, and mostly it will do local adding, reduce the global
updating and mitigate lock contention in writing.

The 2nd patch is to use percpu_counter instead of atomic update in
ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy cache bounce
and overhead. Change them to percpu_counter greatly improve the
performance. Since there is one percpu struct per namespace, additional
memory cost is minimal. Reading of the count done in msgctl call, which
is infrequent. So the need to sum up the counts in each CPU is
infrequent.

Changes in v6:
1. Revise the code comment of percpu_counter_add_local in patch 1/2.
2. Get percpu_counter_sub_local from percpu_counter_add_local rather
than that from percpu_counter_add_batch for SMP and percpu_counter_sub
for non-SMP to reduce code modification.

Changes in v5:
1. Use INT_MAX as the large batch size in percpu_counter_local_add and
percpu_counter_sub_local.
2. Use the latest kernel 6.0-rc4 as the baseline for performance test.
3. Move the percpu_counter_local_add and percpu_counter_sub_local from
percpu_counter.c to percpu_counter.h.

Changes in v3:
1. Add comment and change log for the new function percpu_counter_add_local.
Who should use it and who shouldn't.

Changes in v2:
1. Separate the original patch into two patches.
2. Add error handling for percpu_counter_init.

The performance gain increases as the threads of workload become larger.
Performance gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)


Regards
Jiebin

2022-09-13 12:54:26

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter

The msg_bytes and msg_hdrs atomic counters are frequently
updated when IPC msg queue is in heavy use, causing heavy
cache bounce and overhead. Change them to percpu_counter
greatly improve the performance. Since there is one percpu
struct per namespace, additional memory cost is minimal.
Reading of the count done in msgctl call, which is infrequent.
So the need to sum up the counts in each CPU is infrequent.

Apply the patch and test the pts/stress-ng-1.4.0
-- system v message passing (160 threads).

Score gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <[email protected]>
Reviewed-by: Tim Chen <[email protected]>
---
include/linux/ipc_namespace.h | 5 ++--
ipc/msg.c | 44 ++++++++++++++++++++++++-----------
ipc/namespace.c | 5 +++-
ipc/util.h | 4 ++--
4 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>

struct user_namespace;

@@ -36,8 +37,8 @@ struct ipc_namespace {
unsigned int msg_ctlmax;
unsigned int msg_ctlmnb;
unsigned int msg_ctlmni;
- atomic_t msg_bytes;
- atomic_t msg_hdrs;
+ struct percpu_counter percpu_msg_bytes;
+ struct percpu_counter percpu_msg_hdrs;

size_t shm_ctlmax;
size_t shm_ctlall;
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..f2bb4c193ecf 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,6 +39,7 @@
#include <linux/nsproxy.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>

#include <asm/current.h>
#include <linux/uaccess.h>
@@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
rcu_read_unlock();

list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
free_msg(msg);
}
- atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+ percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes);
ipc_update_pid(&msq->q_lspid, NULL);
ipc_update_pid(&msq->q_lrpid, NULL);
ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
msginfo->msgssz = MSGSSZ;
msginfo->msgseg = MSGSEG;
down_read(&msg_ids(ns).rwsem);
- if (cmd == MSG_INFO) {
+ if (cmd == MSG_INFO)
msginfo->msgpool = msg_ids(ns).in_use;
- msginfo->msgmap = atomic_read(&ns->msg_hdrs);
- msginfo->msgtql = atomic_read(&ns->msg_bytes);
+ max_idx = ipc_get_maxidx(&msg_ids(ns));
+ up_read(&msg_ids(ns).rwsem);
+ if (cmd == MSG_INFO) {
+ msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+ msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
msginfo->msgtql = MSGTQL;
}
- max_idx = ipc_get_maxidx(&msg_ids(ns));
- up_read(&msg_ids(ns).rwsem);
return (max_idx < 0) ? 0 : max_idx;
}

@@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
list_add_tail(&msg->m_list, &msq->q_messages);
msq->q_cbytes += msgsz;
msq->q_qnum++;
- atomic_add(msgsz, &ns->msg_bytes);
- atomic_inc(&ns->msg_hdrs);
+ percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz);
+ percpu_counter_add_local(&ns->percpu_msg_hdrs, 1);
}

err = 0;
@@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
msq->q_rtime = ktime_get_real_seconds();
ipc_update_pid(&msq->q_lrpid, task_tgid(current));
msq->q_cbytes -= msg->m_ts;
- atomic_sub(msg->m_ts, &ns->msg_bytes);
- atomic_dec(&ns->msg_hdrs);
+ percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts);
+ percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
ss_wakeup(msq, &wake_q, false);

goto out_unlock0;
@@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
}
#endif

-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
+ int ret;
+
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
ns->msg_ctlmni = MSGMNI;

- atomic_set(&ns->msg_bytes, 0);
- atomic_set(&ns->msg_hdrs, 0);
+ ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_bytes;
+ ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
+ if (ret)
+ goto fail_msg_hdrs;
ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return 0;
+
+ fail_msg_hdrs:
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ fail_msg_bytes:
+ return ret;
}

#ifdef CONFIG_IPC_NS
void msg_exit_ns(struct ipc_namespace *ns)
{
+ percpu_counter_destroy(&ns->percpu_msg_bytes);
+ percpu_counter_destroy(&ns->percpu_msg_hdrs);
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index e1fcaedba4fa..8316ea585733 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (!setup_ipc_sysctls(ns))
goto fail_mq;

+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_put;
+
sem_init_ns(ns);
- msg_init_ns(ns);
shm_init_ns(ns);

return ns;
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..1b0086c6346f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }

#ifdef CONFIG_SYSVIPC
void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
void shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
@@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;}
static inline void shm_init_ns(struct ipc_namespace *ns) { }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
--
2.31.1

2022-09-18 13:43:31

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter

Hi Jiebin,

On 9/13/22 21:25, Jiebin Sun wrote:
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when IPC msg queue is in heavy use, causing heavy
> cache bounce and overhead. Change them to percpu_counter
> greatly improve the performance. Since there is one percpu
> struct per namespace, additional memory cost is minimal.
> Reading of the count done in msgctl call, which is infrequent.
> So the need to sum up the counts in each CPU is infrequent.
>
> Apply the patch and test the pts/stress-ng-1.4.0
> -- system v message passing (160 threads).
>
> Score gain: 3.99x
>
> CPU: ICX 8380 x 2 sockets
> Core number: 40 x 2 physical cores
> Benchmark: pts/stress-ng-1.4.0
> -- system v message passing (160 threads)
>
> Signed-off-by: Jiebin Sun <[email protected]>
> Reviewed-by: Tim Chen <[email protected]>
Reviewed-by: Manfred Spraul <[email protected]>
> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
> msginfo->msgssz = MSGSSZ;
> msginfo->msgseg = MSGSEG;
> down_read(&msg_ids(ns).rwsem);
> - if (cmd == MSG_INFO) {
> + if (cmd == MSG_INFO)
> msginfo->msgpool = msg_ids(ns).in_use;
> - msginfo->msgmap = atomic_read(&ns->msg_hdrs);
> - msginfo->msgtql = atomic_read(&ns->msg_bytes);
> + max_idx = ipc_get_maxidx(&msg_ids(ns));
> + up_read(&msg_ids(ns).rwsem);
> + if (cmd == MSG_INFO) {
> + msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
> + msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);

Not caused by your change, it just now becomes obvious:

msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and the
actual counters are 64-bit.
This can overflow - and I think the code should handle this. Just clamp
the values to INT_MAX.

--

    Manfred


2022-09-20 02:53:21

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter


On 9/18/2022 8:53 PM, Manfred Spraul wrote:
> Hi Jiebin,
>
> On 9/13/22 21:25, Jiebin Sun wrote:
>> The msg_bytes and msg_hdrs atomic counters are frequently
>> updated when IPC msg queue is in heavy use, causing heavy
>> cache bounce and overhead. Change them to percpu_counter
>> greatly improve the performance. Since there is one percpu
>> struct per namespace, additional memory cost is minimal.
>> Reading of the count done in msgctl call, which is infrequent.
>> So the need to sum up the counts in each CPU is infrequent.
>>
>> Apply the patch and test the pts/stress-ng-1.4.0
>> -- system v message passing (160 threads).
>>
>> Score gain: 3.99x
>>
>> CPU: ICX 8380 x 2 sockets
>> Core number: 40 x 2 physical cores
>> Benchmark: pts/stress-ng-1.4.0
>> -- system v message passing (160 threads)
>>
>> Signed-off-by: Jiebin Sun <[email protected]>
>> Reviewed-by: Tim Chen <[email protected]>
> Reviewed-by: Manfred Spraul <[email protected]>
>> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace
>> *ns, int msqid,
>>       msginfo->msgssz = MSGSSZ;
>>       msginfo->msgseg = MSGSEG;
>>       down_read(&msg_ids(ns).rwsem);
>> -    if (cmd == MSG_INFO) {
>> +    if (cmd == MSG_INFO)
>>           msginfo->msgpool = msg_ids(ns).in_use;
>> -        msginfo->msgmap = atomic_read(&ns->msg_hdrs);
>> -        msginfo->msgtql = atomic_read(&ns->msg_bytes);
>> +    max_idx = ipc_get_maxidx(&msg_ids(ns));
>> +    up_read(&msg_ids(ns).rwsem);
>> +    if (cmd == MSG_INFO) {
>> +        msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
>> +        msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
>
> Not caused by your change, it just now becomes obvious:
>
> msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and the
> actual counters are 64-bit.
> This can overflow - and I think the code should handle this. Just
> clamp the values to INT_MAX.
>
Hi Manfred,

Thanks for your advice. But I'm not sure if we could fix the overflow
issue in ipc/msg totally by

clamp(val, low, INT_MAX). If the value is over s32, we might avoid the
reversal sign, but still could

not get the accurate value.

2022-09-20 05:21:23

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter

On 9/20/22 04:36, Sun, Jiebin wrote:
>
> On 9/18/2022 8:53 PM, Manfred Spraul wrote:
>> Hi Jiebin,
>>
>> On 9/13/22 21:25, Jiebin Sun wrote:
>>> The msg_bytes and msg_hdrs atomic counters are frequently
>>> updated when IPC msg queue is in heavy use, causing heavy
>>> cache bounce and overhead. Change them to percpu_counter
>>> greatly improve the performance. Since there is one percpu
>>> struct per namespace, additional memory cost is minimal.
>>> Reading of the count done in msgctl call, which is infrequent.
>>> So the need to sum up the counts in each CPU is infrequent.
>>>
>>> Apply the patch and test the pts/stress-ng-1.4.0
>>> -- system v message passing (160 threads).
>>>
>>> Score gain: 3.99x
>>>
>>> CPU: ICX 8380 x 2 sockets
>>> Core number: 40 x 2 physical cores
>>> Benchmark: pts/stress-ng-1.4.0
>>> -- system v message passing (160 threads)
>>>
>>> Signed-off-by: Jiebin Sun <[email protected]>
>>> Reviewed-by: Tim Chen <[email protected]>
>> Reviewed-by: Manfred Spraul <[email protected]>
>>> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace
>>> *ns, int msqid,
>>>       msginfo->msgssz = MSGSSZ;
>>>       msginfo->msgseg = MSGSEG;
>>>       down_read(&msg_ids(ns).rwsem);
>>> -    if (cmd == MSG_INFO) {
>>> +    if (cmd == MSG_INFO)
>>>           msginfo->msgpool = msg_ids(ns).in_use;
>>> -        msginfo->msgmap = atomic_read(&ns->msg_hdrs);
>>> -        msginfo->msgtql = atomic_read(&ns->msg_bytes);
>>> +    max_idx = ipc_get_maxidx(&msg_ids(ns));
>>> +    up_read(&msg_ids(ns).rwsem);
>>> +    if (cmd == MSG_INFO) {
>>> +        msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
>>> +        msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
>>
>> Not caused by your change, it just now becomes obvious:
>>
>> msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and
>> the actual counters are 64-bit.
>> This can overflow - and I think the code should handle this. Just
>> clamp the values to INT_MAX.
>>
> Hi Manfred,
>
> Thanks for your advice. But I'm not sure if we could fix the overflow
> issue in ipc/msg totally by
>
> clamp(val, low, INT_MAX). If the value is over s32, we might avoid the
> reversal sign, but still could
>
> not get the accurate value.

I think just clamping it to INT_MAX is the best approach.
Reporting negative values is worse than clamping. If (and only if) there
are real users that need to know the total amount of memory allocated
for messages queues in one namespace, then we could add a MSG_INFO64
with long values. But I would not add that right now, I do not see a
real use case where the value would be needed.

Any other opinions?

--

    Manfred

2022-09-20 06:16:10

by Sun, Jiebin

[permalink] [raw]
Subject: Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter


On 9/20/2022 12:53 PM, Manfred Spraul wrote:
> On 9/20/22 04:36, Sun, Jiebin wrote:
>>
>> On 9/18/2022 8:53 PM, Manfred Spraul wrote:
>>> Hi Jiebin,
>>>
>>> On 9/13/22 21:25, Jiebin Sun wrote:
>>>> The msg_bytes and msg_hdrs atomic counters are frequently
>>>> updated when IPC msg queue is in heavy use, causing heavy
>>>> cache bounce and overhead. Change them to percpu_counter
>>>> greatly improve the performance. Since there is one percpu
>>>> struct per namespace, additional memory cost is minimal.
>>>> Reading of the count done in msgctl call, which is infrequent.
>>>> So the need to sum up the counts in each CPU is infrequent.
>>>>
>>>> Apply the patch and test the pts/stress-ng-1.4.0
>>>> -- system v message passing (160 threads).
>>>>
>>>> Score gain: 3.99x
>>>>
>>>> CPU: ICX 8380 x 2 sockets
>>>> Core number: 40 x 2 physical cores
>>>> Benchmark: pts/stress-ng-1.4.0
>>>> -- system v message passing (160 threads)
>>>>
>>>> Signed-off-by: Jiebin Sun <[email protected]>
>>>> Reviewed-by: Tim Chen <[email protected]>
>>> Reviewed-by: Manfred Spraul <[email protected]>
>>>> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace
>>>> *ns, int msqid,
>>>>       msginfo->msgssz = MSGSSZ;
>>>>       msginfo->msgseg = MSGSEG;
>>>>       down_read(&msg_ids(ns).rwsem);
>>>> -    if (cmd == MSG_INFO) {
>>>> +    if (cmd == MSG_INFO)
>>>>           msginfo->msgpool = msg_ids(ns).in_use;
>>>> -        msginfo->msgmap = atomic_read(&ns->msg_hdrs);
>>>> -        msginfo->msgtql = atomic_read(&ns->msg_bytes);
>>>> +    max_idx = ipc_get_maxidx(&msg_ids(ns));
>>>> +    up_read(&msg_ids(ns).rwsem);
>>>> +    if (cmd == MSG_INFO) {
>>>> +        msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
>>>> +        msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
>>>
>>> Not caused by your change, it just now becomes obvious:
>>>
>>> msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and
>>> the actual counters are 64-bit.
>>> This can overflow - and I think the code should handle this. Just
>>> clamp the values to INT_MAX.
>>>
>> Hi Manfred,
>>
>> Thanks for your advice. But I'm not sure if we could fix the overflow
>> issue in ipc/msg totally by
>>
>> clamp(val, low, INT_MAX). If the value is over s32, we might avoid
>> the reversal sign, but still could
>>
>> not get the accurate value.
>
> I think just clamping it to INT_MAX is the best approach.
> Reporting negative values is worse than clamping. If (and only if)
> there are real users that need to know the total amount of memory
> allocated for messages queues in one namespace, then we could add a
> MSG_INFO64 with long values. But I would not add that right now, I do
> not see a real use case where the value would be needed.
>
> Any other opinions?
>
> --
>
>     Manfred
>
>
OK. I will work on it and send it out for review.

2022-09-20 07:26:28

by Sun, Jiebin

[permalink] [raw]
Subject: [PATCH] ipc/msg: avoid negative value by overflow in msginfo

The 32-bit value in msginfo struct could be negative if we get it
from signed 64-bit. Clamping it to INT_MAX helps to avoid the
negative value by overflow.

Signed-off-by: Jiebin Sun <[email protected]>
Reviewed-by: Manfred Spraul <[email protected]>
Reviewed-by: Tim Chen <[email protected]>
---
ipc/msg.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index f2bb4c193ecf..65f437e28c9b 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -501,8 +501,8 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
max_idx = ipc_get_maxidx(&msg_ids(ns));
up_read(&msg_ids(ns).rwsem);
if (cmd == MSG_INFO) {
- msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
- msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
+ msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX);
+ msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX);
} else {
msginfo->msgmap = MSGMAP;
msginfo->msgpool = MSGPOOL;
--
2.31.1