2017-07-31 08:42:44

by Guillaume Knispel

Subject: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

ipc_findkey() scanned all objects to look for the wanted key. This is
slow when using a high number of keys, for example on an i5 laptop the
following loop took 17 s, with last semget calls taking ~1 ms each.

for (int i = 0, k=0x424242; i < 31900; ++i)
semget(k++, 1, IPC_CREAT | 0600);

This change adds an rhashtable of kern_ipc_perm objects in ipc_ids, so
that one lookup ceases to be O(n). The above loop now takes ~10 ms. Each
lookup-only semget() call can take between ~120 ns and a few µs. Rarely,
some creations can take a few dozen µs.
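
For reference, the loop above was timed by a small standalone program
along these lines (whole-loop timing only; the helper name and output
format are illustrative, and per-call figures were obtained similarly
with a clock_gettime() pair around each semget()):

#include <stdio.h>
#include <time.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* monotonic wall-clock time in microseconds */
static double now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1e6 + ts.tv_nsec / 1e3;
}

int main(void)
{
        double start = now_us();

        for (int i = 0, k = 0x424242; i < 31900; ++i)
                semget(k++, 1, IPC_CREAT | 0600);

        printf("total: %.1f us\n", now_us() - start);
        return 0;
}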

I also ran the 'syscalls-ipc' and 'containers' suites of LTP 20170516, including
with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB,
CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK,
CONFIG_DEBUG_ATOMIC_SLEEP, and CONFIG_DEBUG_OBJECTS_RCU_HEAD, and did
not detect any regression (compared to 4.13-rc2, running Ubuntu 16.04.2)
[I did not manage to enable CONFIG_PROVE_RCU]

Signed-off-by: Guillaume Knispel <[email protected]>
Reviewed-by: Marc Pardo <[email protected]>
---
include/linux/ipc.h | 3 ++
include/linux/ipc_namespace.h | 3 ++
ipc/msg.c | 10 +++--
ipc/namespace.c | 20 +++++++--
ipc/sem.c | 11 +++--
ipc/shm.c | 14 +++---
ipc/util.c | 99 ++++++++++++++++++++++++++++++++-----------
ipc/util.h | 21 +++++----
8 files changed, 130 insertions(+), 51 deletions(-)

diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index fadd579..f480f22 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -3,6 +3,7 @@

#include <linux/spinlock.h>
#include <linux/uidgid.h>
+#include <linux/rhashtable.h>
#include <uapi/linux/ipc.h>

#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
@@ -21,6 +22,8 @@ struct kern_ipc_perm {
unsigned long seq;
void *security;

+ struct rhash_head khtnode;
+
struct rcu_head rcu;
atomic_t refcount;
} ____cacheline_aligned_in_smp __randomize_layout;
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 65327ee..aea3b81 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -7,15 +7,18 @@
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
+#include <linux/rhashtable.h>

struct user_namespace;

struct ipc_ids {
int in_use;
unsigned short seq;
+ bool tables_initialized;
struct rw_semaphore rwsem;
struct idr ipcs_idr;
int next_id;
+ struct rhashtable key_ht;
};

struct ipc_namespace {
diff --git a/ipc/msg.c b/ipc/msg.c
index 5b25e07..f123042 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1011,7 +1011,7 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
}


-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
{
ns->msg_ctlmax = MSGMAX;
ns->msg_ctlmnb = MSGMNB;
@@ -1019,7 +1019,7 @@ void msg_init_ns(struct ipc_namespace *ns)

atomic_set(&ns->msg_bytes, 0);
atomic_set(&ns->msg_hdrs, 0);
- ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+ return ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
}

#ifdef CONFIG_IPC_NS
@@ -1027,6 +1027,7 @@ void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif

@@ -1057,11 +1058,12 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
}
#endif

-void __init msg_init(void)
+int __init msg_init(void)
{
- msg_init_ns(&init_ipc_ns);
+ const int err = msg_init_ns(&init_ipc_ns);

ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
+ return err;
}
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b4d80f9..4130f48 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -54,16 +54,28 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;

- err = mq_init_ns(ns);
+ err = sem_init_ns(ns);
if (err)
goto fail_put;
+ err = msg_init_ns(ns);
+ if (err)
+ goto fail_destroy_sem;
+ err = shm_init_ns(ns);
+ if (err)
+ goto fail_destroy_msg;

- sem_init_ns(ns);
- msg_init_ns(ns);
- shm_init_ns(ns);
+ err = mq_init_ns(ns);
+ if (err)
+ goto fail_destroy_shm;

return ns;

+fail_destroy_shm:
+ shm_exit_ns(ns);
+fail_destroy_msg:
+ msg_exit_ns(ns);
+fail_destroy_sem:
+ sem_exit_ns(ns);
fail_put:
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
diff --git a/ipc/sem.c b/ipc/sem.c
index 9e70cd7..b13ecfd 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -185,14 +185,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
#define sc_semopm sem_ctls[2]
#define sc_semmni sem_ctls[3]

-void sem_init_ns(struct ipc_namespace *ns)
+int sem_init_ns(struct ipc_namespace *ns)
{
ns->sc_semmsl = SEMMSL;
ns->sc_semmns = SEMMNS;
ns->sc_semopm = SEMOPM;
ns->sc_semmni = SEMMNI;
ns->used_sems = 0;
- ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
+ return ipc_init_ids(&ns->ids[IPC_SEM_IDS]);
}

#ifdef CONFIG_IPC_NS
@@ -200,15 +200,18 @@ void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif

-void __init sem_init(void)
+int __init sem_init(void)
{
- sem_init_ns(&init_ipc_ns);
+ const int err = sem_init_ns(&init_ipc_ns);
+
ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
+ return err;
}

/**
diff --git a/ipc/shm.c b/ipc/shm.c
index 28a4448..5fa7576 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -72,14 +72,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp);
static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
#endif

-void shm_init_ns(struct ipc_namespace *ns)
+int shm_init_ns(struct ipc_namespace *ns)
{
ns->shm_ctlmax = SHMMAX;
ns->shm_ctlall = SHMALL;
ns->shm_ctlmni = SHMMNI;
ns->shm_rmid_forced = 0;
ns->shm_tot = 0;
- ipc_init_ids(&shm_ids(ns));
+ return ipc_init_ids(&shm_ids(ns));
}

/*
@@ -95,7 +95,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
if (shp->shm_nattch) {
shp->shm_perm.mode |= SHM_DEST;
/* Do not find it any more */
- shp->shm_perm.key = IPC_PRIVATE;
+ ipc_set_key_private(&shm_ids(ns), &shp->shm_perm);
shm_unlock(shp);
} else
shm_destroy(ns, shp);
@@ -106,16 +106,18 @@ void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
+ rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif

static int __init ipc_ns_init(void)
{
- shm_init_ns(&init_ipc_ns);
- return 0;
+ const int err = shm_init_ns(&init_ipc_ns);
+ WARN(err, "ipc: sysV shm_init_ns failed: %d\n", err);
+ return err;
}

-pure_initcall(ipc_ns_init);
+core_initcall(ipc_ns_init);

void __init shm_init(void)
{
diff --git a/ipc/util.c b/ipc/util.c
index 1a2cb02..621e9fc 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -83,27 +83,45 @@ struct ipc_proc_iface {
*/
static int __init ipc_init(void)
{
- sem_init();
- msg_init();
+ int err_sem, err_msg;
+
+ err_sem = sem_init();
+ WARN(err_sem, "ipc: sysV sem_init failed: %d\n", err_sem);
+ err_msg = msg_init();
+ WARN(err_msg, "ipc: sysV msg_init failed: %d\n", err_msg);
shm_init();
- return 0;
+
+ return err_msg ? err_msg : err_sem;
}
device_initcall(ipc_init);

+static const struct rhashtable_params ipc_kht_params = {
+ .head_offset = offsetof(struct kern_ipc_perm, khtnode),
+ .key_offset = offsetof(struct kern_ipc_perm, key),
+ .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key),
+ .automatic_shrinking = true,
+};
+
/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the ids idr.
+ * below IPCMNI) then initialise the keys hashtable and ids idr.
*/
-void ipc_init_ids(struct ipc_ids *ids)
+int ipc_init_ids(struct ipc_ids *ids)
{
+ int err;
ids->in_use = 0;
ids->seq = 0;
ids->next_id = -1;
init_rwsem(&ids->rwsem);
+ err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
+ if (err)
+ return err;
idr_init(&ids->ipcs_idr);
+ ids->tables_initialized = true;
+ return 0;
}

#ifdef CONFIG_PROC_FS
@@ -147,28 +165,20 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
* Returns the locked pointer to the ipc structure if found or NULL
* otherwise. If key is found ipc points to the owning ipc structure
*
- * Called with ipc_ids.rwsem held.
+ * Called with writer ipc_ids.rwsem held.
*/
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
{
- struct kern_ipc_perm *ipc;
- int next_id;
- int total;
-
- for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
- ipc = idr_find(&ids->ipcs_idr, next_id);
-
- if (ipc == NULL)
- continue;
+ struct kern_ipc_perm *ipcp = NULL;

- if (ipc->key != key) {
- total++;
- continue;
- }
+ if (likely(ids->tables_initialized))
+ ipcp = rhashtable_lookup_fast(&ids->key_ht, &key,
+ ipc_kht_params);

+ if (ipcp) {
rcu_read_lock();
- ipc_lock_object(ipc);
- return ipc;
+ ipc_lock_object(ipcp);
+ return ipcp;
}

return NULL;
@@ -221,13 +231,13 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
{
kuid_t euid;
kgid_t egid;
- int id;
+ int id, err;
int next_id = ids->next_id;

if (size > IPCMNI)
size = IPCMNI;

- if (ids->in_use >= size)
+ if (!ids->tables_initialized || ids->in_use >= size)
return -ENOSPC;

idr_preload(GFP_KERNEL);
@@ -246,6 +256,15 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size)
(next_id < 0) ? 0 : ipcid_to_idx(next_id), 0,
GFP_NOWAIT);
idr_preload_end();
+
+ if (id >= 0 && new->key != IPC_PRIVATE) {
+ err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
+ ipc_kht_params);
+ if (err < 0) {
+ idr_remove(&ids->ipcs_idr, id);
+ id = err;
+ }
+ }
if (id < 0) {
spin_unlock(&new->lock);
rcu_read_unlock();
@@ -377,6 +396,20 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
return err;
}

+/**
+ * ipc_kht_remove - remove an ipc from the key hashtable
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the key to remove
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
+ */
+static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+ if (ipcp->key != IPC_PRIVATE)
+ rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode,
+ ipc_kht_params);
+}

/**
* ipc_rmid - remove an ipc identifier
@@ -391,10 +424,25 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
int lid = ipcid_to_idx(ipcp->id);

idr_remove(&ids->ipcs_idr, lid);
+ ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
}

+/**
+ * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE
+ * @ids: ipc identifier set
+ * @ipcp: ipc perm structure containing the key to modify
+ *
+ * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held
+ * before this function is called, and remain locked on the exit.
+ */
+void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
+{
+ ipc_kht_remove(ids, ipcp);
+ ipcp->key = IPC_PRIVATE;
+}
+
int ipc_rcu_getref(struct kern_ipc_perm *ptr)
{
return atomic_inc_not_zero(&ptr->refcount);
@@ -485,7 +533,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
}

/**
- * ipc_obtain_object
+ * ipc_obtain_object_idr
* @ids: ipc identifier set
* @id: ipc id to look for
*
@@ -499,6 +547,9 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int lid = ipcid_to_idx(id);

+ if (unlikely(!ids->tables_initialized))
+ return ERR_PTR(-EINVAL);
+
out = idr_find(&ids->ipcs_idr, lid);
if (!out)
return ERR_PTR(-EINVAL);
diff --git a/ipc/util.h b/ipc/util.h
index c692010..80c9f51 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,8 @@

#define SEQ_MULTIPLIER (IPCMNI)

-void sem_init(void);
-void msg_init(void);
+int sem_init(void);
+int msg_init(void);
void shm_init(void);

struct ipc_namespace;
@@ -30,17 +30,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }
#endif

#ifdef CONFIG_SYSVIPC
-void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
-void shm_init_ns(struct ipc_namespace *ns);
+int sem_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
+int shm_init_ns(struct ipc_namespace *ns);

void sem_exit_ns(struct ipc_namespace *ns);
void msg_exit_ns(struct ipc_namespace *ns);
void shm_exit_ns(struct ipc_namespace *ns);
#else
-static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
-static inline void shm_init_ns(struct ipc_namespace *ns) { }
+static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; }
+static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; }

static inline void sem_exit_ns(struct ipc_namespace *ns) { }
static inline void msg_exit_ns(struct ipc_namespace *ns) { }
@@ -79,7 +79,7 @@ struct ipc_ops {
struct seq_file;
struct ipc_ids;

-void ipc_init_ids(struct ipc_ids *);
+int ipc_init_ids(struct ipc_ids *);
#ifdef CONFIG_PROC_FS
void __init ipc_init_proc_interface(const char *path, const char *header,
int ids, int (*show)(struct seq_file *, void *));
@@ -104,6 +104,9 @@ int ipc_get_maxid(struct ipc_ids *);
/* must be called with both locks acquired. */
void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *);

+/* must be called with both locks acquired. */
+void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *);
+
/* must be called with ipcp locked */
int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);

--
2.7.4


2017-07-31 15:46:12

by Davidlohr Bueso

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Mon, 31 Jul 2017, Guillaume Knispel wrote:

>ipc_findkey() scanned all objects to look for the wanted key. This is
>slow when using a high number of keys, for example on an i5 laptop the
>following loop took 17 s, with last semget calls taking ~1 ms each.

I would argue that this is not the common case.

>
> for (int i = 0, k=0x424242; i < 31900; ++i)
> semget(k++, 1, IPC_CREAT | 0600);
>
>This change adds an rhashtable of kern_ipc_perm objects in ipc_ids, so
>that one lookup ceases to be O(n). The above loop now takes ~10 ms. Each
>lookup-only semget() call can take between ~120 ns and a few µs. Rarely,
>some creations can take a few dozen µs.

Could you please provide numbers for smaller amounts of keys?

Thanks,
Davidlohr

2017-08-01 01:18:02

by Guillaume Knispel

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Mon, Jul 31, 2017 at 08:45:58AM -0700, Davidlohr Bueso wrote:
> On Mon, 31 Jul 2017, Guillaume Knispel wrote:
> >ipc_findkey() scanned all objects to look for the wanted key. This is
> >slow when using a high number of keys, for example on an i5 laptop the
> >following loop took 17 s, with last semget calls taking ~1 ms each.
>
> I would argue that this is not the common case.

Well, Linux allows up to 32000 objects, and if you want to allocate them
with keys, this initial (possibly spread out over time) cost cannot be
reduced, and it is O(n²): each creation scans all the objects already
present, so the whole sequence does roughly n²/2 key comparisons.

Besides, I maintain a program which, in some of its versions, uses tens
of thousands of semaphore sets with keys, and destroys and creates new
ones all the time.

> >This change adds an rhashtable of kern_ipc_perm objects in ipc_ids, so
> >that one lookup ceases to be O(n). The above loop now takes ~10 ms. Each
> >lookup-only semget() call can take between ~120 ns and a few µs. Rarely,
> >some creations can take a few dozen µs.
>
> Could you please provide numbers for smaller amounts of keys?

On 4.13-rc3, without and with the patch, the following loop takes the
times below on my laptop (measured with clock_gettime(CLOCK_MONOTONIC)
calls not shown here), for each value of KEYS, starting right after a
reboot with initially 0 semaphore sets:

for (int i = 0, k=0x424242; i < KEYS ; ++i)
semget(k++, 1, IPC_CREAT | 0600);

total total max single max single
KEYS without with call without call with

1 3.5 4.9 µs 3.5 4.9
10 7.6 8.6 µs 3.7 4.7
32 16.2 15.9 µs 4.3 5.3
100 72.9 41.8 µs 3.7 4.7
1000 5,630.0 502.0 µs * *
10000 1,340,000.0 7,240.0 µs * *
31900 17,600,000.0 22,200.0 µs * *

Repeating the test immediately (without rebooting in between) for the same
value of KEYS gives the times without creation (lookup only):

total total max single max single
KEYS without with call without call with

1 2.1 2.5 µs 2.1 2.5
10 4.5 4.8 µs 2.2 2.3
32 13.0 10.8 µs 2.3 2.8
100 82.9 25.1 µs * 2.3
1000 5,780.0 217.0 µs * *
10000 1,470,000.0 2,520.0 µs * *
31900 17,400,000.0 7,810.0 µs * *

*: unreliable measure: high variance

This is both on a laptop and within a VM, so even where I have not noted
high variance the figures are not very precise (especially for long
runs) but we can still see the tendencies.

I did one last benchmark, this time running each semget() in a new
process (and still only measuring the time taken by this syscall) and
got those figures (in a single run on each kernel) in µs:

creation:
total total
KEYS without with

1 3.7 5.0 µs
10 32.9 36.7 µs
32 125.0 109.0 µs
100 523.0 353.0 µs
1000 20,300.0 3,280.0 µs
10000 2,470,000.0 46,700.0 µs
31900 27,800,000.0 219,000.0 µs

lookup-only:
total total
KEYS without with

1 2.5 2.7 µs
10 25.4 24.4 µs
32 106.0 72.6 µs
100 591.0 352.0 µs
1000 22,400.0 2,250.0 µs
10000 2,510,000.0 25,700.0 µs
31900 28,200,000.0 115,000.0 µs

My provisional conclusion is that on my system this patch improves the
performance consistently from about n ~= 30, and below 30 the slowdown,
if any, is more than reasonable; it should be inconsequential for
properly written programs and have only a limited impact on programs
doing lots of <ipc>get() calls on small sets of ipc objects.

For a huge number of keyed ipc objects (within the limits already
accepted by Linux), this patch renders the <ipc>get() performance
acceptable where it was previously quite poor.

Cheers!
Guillaume

2017-08-01 15:54:59

by Davidlohr Bueso

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Tue, 01 Aug 2017, Guillaume Knispel wrote:

>On Mon, Jul 31, 2017 at 08:45:58AM -0700, Davidlohr Bueso wrote:
>> On Mon, 31 Jul 2017, Guillaume Knispel wrote:
>> >ipc_findkey() scanned all objects to look for the wanted key. This is
>> >slow when using a high number of keys, for example on an i5 laptop the
>> >following loop took 17 s, with last semget calls taking ~1 ms each.
>>
>> I would argue that this is not the common case.
>
>Well, Linux allows up to 32000 objects, and if you want to allocate them
>with keys, this initial (possibly spread out over time) cost cannot be
>reduced, and it is O(n²).
>
>Besides, I maintain a program which, in some of its versions, uses tens
>of thousands of semaphore sets with keys, and destroys and creates new
>ones all the time.

Not impossible, just not the common case.

>On 4.13-rc3, without and with the patch, the following loop takes the
>times below on my laptop (measured with clock_gettime(CLOCK_MONOTONIC)
>calls not shown here), for each value of KEYS, starting right after a
>reboot with initially 0 semaphore sets:
>
> for (int i = 0, k=0x424242; i < KEYS ; ++i)
> semget(k++, 1, IPC_CREAT | 0600);
>
> total total max single max single
> KEYS without with call without call with
>
> 1 3.5 4.9 µs 3.5 4.9
> 10 7.6 8.6 µs 3.7 4.7
> 32 16.2 15.9 µs 4.3 5.3
> 100 72.9 41.8 µs 3.7 4.7
> 1000 5,630.0 502.0 µs * *
> 10000 1,340,000.0 7,240.0 µs * *
> 31900 17,600,000.0 22,200.0 µs * *
>
>Repeating the test immediately (without rebooting in between) for the same
>value of KEYS gives the times without creation (lookup only):
>
> total total max single max single
> KEYS without with call without call with
>
> 1 2.1 2.5 µs 2.1 2.5
> 10 4.5 4.8 µs 2.2 2.3
> 32 13.0 10.8 µs 2.3 2.8
> 100 82.9 25.1 µs * 2.3
> 1000 5,780.0 217.0 µs * *
> 10000 1,470,000.0 2,520.0 µs * *
> 31900 17,400,000.0 7,810.0 µs * *
>
>*: unreliable measure: high variance
>
>This is both on a laptop and within a VM, so even where I have not noted
>high variance the figures are not very precise (especially for long
>runs) but we can still see the tendencies.
>
>I did one last benchmark, this time running each semget() in a new
>process (and still only measuring the time taken by this syscall) and
>got those figures (in a single run on each kernel) in µs:
>
>creation:
> total total
> KEYS without with
>
> 1 3.7 5.0 µs
> 10 32.9 36.7 µs
> 32 125.0 109.0 µs
> 100 523.0 353.0 µs
> 1000 20,300.0 3,280.0 µs
> 10000 2,470,000.0 46,700.0 µs
> 31900 27,800,000.0 219,000.0 µs
>
>lookup-only:
> total total
> KEYS without with
>
> 1 2.5 2.7 µs
> 10 25.4 24.4 µs
> 32 106.0 72.6 µs
> 100 591.0 352.0 µs
> 1000 22,400.0 2,250.0 µs
> 10000 2,510,000.0 25,700.0 µs
> 31900 28,200,000.0 115,000.0 µs
>
>My provisional conclusion is that on my system this patch improves the
>performance consistently from about n ~= 30, and below 30 the slowdown,
>if any, is more than reasonable; it should be inconsequential for
>properly written programs and have only a limited impact on programs
>doing lots of <ipc>get() calls on small sets of ipc objects.

I agree, for smaller numbers of keys the overhead is negligible, and the
O(n) cost quickly kicks in as the count grows. To the point that this
patch also helps in more normal scenarios, where we don't have
unrealistically high numbers of keys.

Thanks,
Davidlohr

2017-08-02 20:07:01

by Davidlohr Bueso

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Mon, 31 Jul 2017, Guillaume Knispel wrote:
> static int __init ipc_init(void)
> {
>- sem_init();
>- msg_init();
>+ int err_sem, err_msg;
>+
>+ err_sem = sem_init();
>+ WARN(err_sem, "ipc: sysV sem_init failed: %d\n", err_sem);
>+ err_msg = msg_init();
>+ WARN(err_msg, "ipc: sysV msg_init failed: %d\n", err_msg);
> shm_init();

This shows the ugliness of the underlying ipc init asymmetry. Specifically,
140d0b2108f (Do 'shm_init_ns()' in an early pure_initcall) was the final
nail in the coffin to fix an exit_shm() race.

While normally we could just initialize the ipc_ids fields statically and
be done with initcall dependencies, your patch requires the init to be
done dynamically because of the rhashtable_init(). Oh well.

Also, why do you do this?

> -pure_initcall(ipc_ns_init);
> +core_initcall(ipc_ns_init);

Thanks,
Davidlohr

2017-08-03 17:14:42

by Guillaume Knispel

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Wed, Aug 02, 2017 at 01:06:44PM -0700, Davidlohr Bueso wrote:
> On Mon, 31 Jul 2017, Guillaume Knispel wrote:
> >static int __init ipc_init(void)
> >{
> >- sem_init();
> >- msg_init();
> >+ int err_sem, err_msg;
> >+
> >+ err_sem = sem_init();
> >+ WARN(err_sem, "ipc: sysV sem_init failed: %d\n", err_sem);
> >+ err_msg = msg_init();
> >+ WARN(err_msg, "ipc: sysV msg_init failed: %d\n", err_msg);
> > shm_init();
>
> This shows the ugliness of the underlying ipc init asymmetry. Specifically,
> 140d0b2108f (Do 'shm_init_ns()' in an early pure_initcall) was the final
> nail in the coffin to fix an exit_shm() race.
>
> While normally we could just initialize the ipc_ids fields statically and
> be over with initcall dependencies, your patch will require inits be done
> dynamically for the rhashtable_init(). Oh well.
>
> Also, why do you do this?
>
> >-pure_initcall(ipc_ns_init);
> >+core_initcall(ipc_ns_init);

In linux/init.h I saw that a pure_initcall is reserved for only
initializing variables and must have no dependency on anything else;
from that, plus the "pure" in the name, I concluded that we should not
e.g. allocate in a pure_initcall. However, I see that net_ns_init()
calls kmem_cache_create() and others, so maybe we can keep
ipc_ns_init() as a pure_initcall?

Guillaume

2017-08-07 17:33:24

by Davidlohr Bueso

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Thu, 03 Aug 2017, Guillaume Knispel wrote:

>In linux/init.h I saw that a pure_initcall is reserved for only
>initializing variables and must have no dependency on anything else;
>from that, plus the "pure" in the name, I concluded that we should not
>e.g. allocate in a pure_initcall. However, I see that net_ns_init()
>calls kmem_cache_create() and others, so maybe we can keep
>ipc_ns_init() as a pure_initcall?

Yeah, I don't see this as a limitation wrt link order. Among others,
filelocks also do this. Not to mention futexes with alloc_large_system_hash().
So let's just keep this as is.

Thanks,
Davidlohr

2017-08-07 18:21:23

by Davidlohr Bueso

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Mon, 31 Jul 2017, Guillaume Knispel wrote:
> struct ipc_ids {
> int in_use;
> unsigned short seq;
>+ bool tables_initialized;

So this is really ugly to have, but I understand why you added it. I
wonder what folks would think if we just panic() in the rhashtable_init()
ENOMEM case, and convert the EINVALs to WARNs. That way the function
would always succeed. This is similar to what futex_init does, with the
underlying hash table allocator panicking. sems and msg would probably
have to be converted to pure_initcall, but hey, we could at least get
the symmetry back.
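
Roughly something like the below (reusing your ipc_kht_params; completely
untested, just to sketch the idea):

void ipc_init_ids(struct ipc_ids *ids)
{
        int err;

        ids->in_use = 0;
        ids->seq = 0;
        ids->next_id = -1;
        init_rwsem(&ids->rwsem);

        err = rhashtable_init(&ids->key_ht, &ipc_kht_params);
        if (err == -ENOMEM)
                panic("ipc: failed to allocate the key hashtable\n");
        /* anything else (e.g. bad params) is a bug: report it, keep going */
        WARN(err, "ipc: rhashtable_init failed: %d\n", err);

        idr_init(&ids->ipcs_idr);
}

Then sem_init_ns()/msg_init_ns()/shm_init_ns() could stay void and the
tables_initialized flag would go away.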

> static int __init ipc_ns_init(void)
> {
>- shm_init_ns(&init_ipc_ns);
>- return 0;
>+ const int err = shm_init_ns(&init_ipc_ns);
>+ WARN(err, "ipc: sysV shm_init_ns failed: %d\n", err);

nit: s/sysV/sysv

>+ return err;
> }

Thanks,
Davidlohr

2017-08-09 16:15:44

by Guillaume Knispel

Subject: Re: [PATCH] ipc: optimize semget/shmget/msgget for lots of keys

On Mon, Aug 07, 2017 at 11:21:03AM -0700, Davidlohr Bueso wrote:
> On Mon, 31 Jul 2017, Guillaume Knispel wrote:
> > struct ipc_ids {
> > int in_use;
> > unsigned short seq;
> > + bool tables_initialized;
>
> So this is really ugly to have, but I understand why you added it. I
> wonder what folks would think if we just panic() in the rhashtable_init()
> ENOMEM case, and convert the EINVALs to WARNs. That way the function
> would always succeed. This is similar to what futex_init does, with the
> underlying hash table allocator panicking. sems and msg would probably
> have to be converted to pure_initcall, but hey, we could at least get
> the symmetry back.

I think we could only afford to panic() on ENOMEM during boot, but
ipc_init_ids() is also called through create_ipc_ns() on namespace
creation. Besides, I would not be very comfortable with only warning on
EINVAL but continuing execution using potentially uninitialized data.
Granted, this will probably never happen in production, but the intent
was to leave the system usable (except that it would not be possible to
create sysv ipc objects) with no risk of additionnal crash for cases
like people hacking rhashtable and testing their modifications, if they
merely introduce a correctly reported error.

Cheers!
Guillaume

2017-08-14 06:06:22

by kernel test robot

Subject: [lkp-robot] [ipc] cb6268f05d: reaim.jobs_per_min 865% improvement


Greeting,

FYI, we noticed an 865% improvement of reaim.jobs_per_min due to commit:


commit: cb6268f05df684e00607762fd8ad95d515e2407f ("ipc: optimize semget/shmget/msgget for lots of keys")
url: https://github.com/0day-ci/linux/commits/Guillaume-Knispel/ipc-optimize-semget-shmget-msgget-for-lots-of-keys/20170731-170031


in testcase: reaim
on test machine: 56 threads Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz with 256G memory
with following parameters:

runtime: 300s
nr_task: 5000
test: shared_memory
cpufreq_governor: performance

test-description: REAIM is an updated and improved version of AIM 7 benchmark.
test-url: https://sourceforge.net/projects/re-aim-7/


Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

git clone https://github.com/01org/lkp-tests.git
cd lkp-tests
bin/lkp install job.yaml # job file is attached in this email
bin/lkp run job.yaml

testcase/path_params/tbox_group/run: reaim/300s-5000-shared_memory-performance/lkp-hsw-ep5

fd2b2c57ec2020ae cb6268f05df684e00607762fd8
---------------- --------------------------
634033 ± 4% 865% 6120365 reaim.jobs_per_min
126 ± 4% 865% 1224 reaim.jobs_per_min_child
672755 ± 5% 831% 6263184 reaim.max_jobs_per_min
34.14 ± 3% 11% 37.82 reaim.std_dev_percent
65.40 -6% 61.66 reaim.jti
13.53 -7% 12.60 reaim.child_utime
47.48 ± 4% -90% 4.90 reaim.parent_time
1981 ± 4% -90% 204 reaim.child_systime
12.19 -90% 1.24 reaim.std_dev_time
5160570 ± 7% 517% 31838188 reaim.time.minor_page_faults
88.17 ± 7% 474% 505.69 reaim.time.user_time
125994 ± 11% 244% 433632 reaim.time.involuntary_context_switches
4053795 ± 7% 79% 7256949 reaim.time.voluntary_context_switches
3974 -28% 2864 reaim.time.percent_of_cpu_this_job_got
12855 ± 4% -36% 8249 reaim.time.system_time
27741 ± 3% 86% 51490 vmstat.system.cs
214773 ± 5% 44% 308264 interrupts.CAL:Function_call_interrupts
fail:runs %reproduction fail:runs
| | |
:4 25% 1:4 stderr.create_shared_memory():can't_create_shared_memory,pausing
0 6e+05 551946 ±104% latency_stats.avg.call_rwsem_down_write_failed.shmctl_down.SyS_shmctl.do_syscall_64.return_from_SYSCALL_64
240707 ± 7% -2e+05 81933 latency_stats.avg.call_rwsem_down_write_failed.ipcget.SyS_shmget.entry_SYSCALL_64_fastpath
282714 ± 4% -2e+05 76327 latency_stats.avg.call_rwsem_down_write_failed.shm_close.remove_vma.do_munmap.SyS_shmdt.entry_SYSCALL_64_fastpath
313951 ± 5% -2e+05 78957 latency_stats.avg.call_rwsem_down_write_failed.do_shmat.SyS_shmat.entry_SYSCALL_64_fastpath
341015 ± 4% -3e+05 78091 latency_stats.avg.call_rwsem_down_write_failed.shmctl_down.SyS_shmctl.entry_SYSCALL_64_fastpath
0 6e+05 551946 ±104% latency_stats.max.call_rwsem_down_write_failed.shmctl_down.SyS_shmctl.do_syscall_64.return_from_SYSCALL_64
21599230 ± 3% -2e+07 3153822 ± 6% latency_stats.max.call_rwsem_down_write_failed.shmctl_down.SyS_shmctl.entry_SYSCALL_64_fastpath
21608679 ± 3% -2e+07 3152519 ± 6% latency_stats.max.call_rwsem_down_write_failed.ipcget.SyS_shmget.entry_SYSCALL_64_fastpath
21612440 ± 3% -2e+07 3153028 ± 6% latency_stats.max.call_rwsem_down_write_failed.do_shmat.SyS_shmat.entry_SYSCALL_64_fastpath
21613940 ± 3% -2e+07 3154107 ± 6% latency_stats.max.call_rwsem_down_write_failed.shm_close.remove_vma.do_munmap.SyS_shmdt.entry_SYSCALL_64_fastpath
21615866 ± 3% -2e+07 3154900 ± 6% latency_stats.max.max
3.835e+10 ± 4% 9e+10 1.254e+11 ± 7% latency_stats.sum.io_schedule.__lock_page.do_wp_page.__handle_mm_fault.handle_mm_fault.__do_page_fault.do_page_fault.page_fault
1.672e+09 ± 22% 5e+09 6.765e+09 ± 86% latency_stats.sum.call_rwsem_down_write_failed.ipcget.SyS_semget.entry_SYSCALL_64_fastpath
2757771 ± 40% 2e+08 2.425e+08 ± 26% latency_stats.sum.io_schedule.wait_on_page_bit.__migration_entry_wait.migration_entry_wait.do_swap_page.__handle_mm_fault.handle_mm_fault.__do_page_fault.do_page_fault.page_fault
0 6e+05 551946 ±104% latency_stats.sum.call_rwsem_down_write_failed.shmctl_down.SyS_shmctl.do_syscall_64.return_from_SYSCALL_64
24449 ± 67% 2e+05 200503 ± 8% latency_stats.sum.io_schedule.__lock_page_or_retry.filemap_fault.__do_fault.__handle_mm_fault.handle_mm_fault.__do_page_fault.do_page_fault.page_fault
40790 ± 10% 2e+05 191446 ± 11% latency_stats.sum.ep_poll.SyS_epoll_wait.do_syscall_64.return_from_SYSCALL_64
27684 ± 19% 1e+05 172543 ± 13% latency_stats.sum.devkmsg_read.__vfs_read.vfs_read.SyS_read.entry_SYSCALL_64_fastpath
52252 ± 4% 1e+05 189551 ± 5% latency_stats.sum.wait_woken.inotify_read.__vfs_read.vfs_read.SyS_read.entry_SYSCALL_64_fastpath
6530 ± 95% 9e+04 91966 ± 21% latency_stats.sum.io_schedule.__lock_page_killable.__lock_page_or_retry.filemap_fault.__do_fault.__handle_mm_fault.handle_mm_fault.__do_page_fault.do_page_fault.page_fault
5976 ± 20% 3e+04 34861 ± 4% latency_stats.sum.pipe_wait.pipe_write.__vfs_write.vfs_write.SyS_write.entry_SYSCALL_64_fastpath
2.659e+11 ± 3% -2e+11 8.371e+10 latency_stats.sum.call_rwsem_down_write_failed.do_shmat.SyS_shmat.entry_SYSCALL_64_fastpath
5864993 ± 7% 454% 32495687 perf-stat.page-faults
5864993 ± 7% 454% 32495687 perf-stat.minor-faults
0.11 ± 3% 422% 0.60 perf-stat.branch-miss-rate%
1.501e+08 ± 7% 356% 6.847e+08 perf-stat.node-store-misses
4.444e+11 ± 6% 316% 1.849e+12 perf-stat.dTLB-stores
1.077e+08 ± 5% 314% 4.46e+08 perf-stat.node-loads
1.067e+09 ± 7% 287% 4.133e+09 perf-stat.cache-misses
6.282e+08 ± 7% 275% 2.355e+09 perf-stat.node-load-misses
1.757e+08 ± 7% 261% 6.339e+08 perf-stat.node-stores
4.56e+09 ± 6% 232% 1.516e+10 perf-stat.branch-misses
3.20 212% 9.98 perf-stat.cache-miss-rate%
71703040 ± 6% 163% 1.884e+08 perf-stat.dTLB-store-misses
2.831e+08 ± 11% 111% 5.964e+08 perf-stat.iTLB-loads
9086070 ± 7% 74% 15819624 perf-stat.context-switches
1340441 ± 8% 72% 2308738 perf-stat.cpu-migrations
2.164e+09 ± 7% 32% 2.847e+09 perf-stat.iTLB-load-misses
3.337e+10 ± 5% 24% 4.14e+10 perf-stat.cache-references
46.07 13% 51.93 perf-stat.node-store-miss-rate%
0.66 9% 0.72 perf-stat.ipc
85.35 84.08 perf-stat.node-load-miss-rate%
88.46 -7% 82.67 perf-stat.iTLB-load-miss-rate%
1.51 -8% 1.39 perf-stat.cpi
5.134e+12 ± 4% -28% 3.698e+12 perf-stat.dTLB-loads
1.972e+13 ± 4% -32% 1.342e+13 perf-stat.instructions
3.976e+12 ± 4% -36% 2.533e+12 perf-stat.branch-instructions
0.02 ± 6% -37% 0.01 perf-stat.dTLB-store-miss-rate%
2.977e+13 ± 4% -37% 1.865e+13 perf-stat.cpu-cycles
9132 ± 3% -48% 4715 perf-stat.instructions-per-iTLB-miss
0.06 ± 27% -68% 0.02 perf-stat.dTLB-load-miss-rate%
3.271e+09 ± 27% -77% 7.473e+08 perf-stat.dTLB-load-misses



reaim.parent_time

55 ++---------------------------------------------------------------------+
50 ++.*. .*.. *..|
*. *..*. *.*..*..*..*.*..*..*..*.*..*..*.*..*.. .*.*..*..*.. + *
45 ++ *. * |
40 ++ |
35 ++ |
30 ++ |
| |
25 ++ |
20 ++ |
15 ++ |
10 ++ |
| |
5 O+ O O O O O O O O O O O O O O O O O O O O O O |
0 ++---------------------------------------------------------------------+


reaim.child_systime

2200 ++-*-*-----*---------------------------------------------------------+
2000 *+ *. : *.*..*..*. .*.. .*..|
| : .*.*.. .*.. .. *..*..*.*. * *
1800 ++ *..*. *..* *..*.* |
1600 ++ |
1400 ++ |
1200 ++ |
| |
1000 ++ |
800 ++ |
600 ++ |
400 ++ |
| |
200 O+ O O O O O O O O O O O O O O O O O O O O O O |
0 ++-------------------------------------------------------------------+


reaim.jobs_per_min

7e+06 ++------------------------------------------------------------------+
O O O O O O O O O O O O O O O |
6e+06 ++ O O O O O O O O |
| |
5e+06 ++ |
| |
4e+06 ++ |
| |
3e+06 ++ |
| |
2e+06 ++ |
| |
1e+06 ++ |
*..*.*..*.*..*..*.*..*..*.*..*.*..*..*.*..*.*..*..*.*..*..*.*..*.*..*
0 ++------------------------------------------------------------------+


reaim.jobs_per_min_child

1400 ++-------------------------------------------------------------------+
O O O O O O O O O O O O O O O |
1200 ++ O O O O O O O O |
| |
1000 ++ |
| |
800 ++ |
| |
600 ++ |
| |
400 ++ |
| |
200 ++ |
*..*.*..*..*.*..*..*.*..*..*.*..*..*.*..*.*..*..*.*..*..*.*..*..*.*..*
0 ++-------------------------------------------------------------------+


[*] bisect-good sample
[O] bisect-bad sample


Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


Thanks,
Xiaolong


Attachments:
config-4.13.0-rc2-00023-gcb6268f0 (157.19 kB)
job-script (6.61 kB)
job.yaml (4.33 kB)
reproduce (4.79 kB)

2017-08-14 17:59:05

by Kees Cook

Subject: Re: [lkp-robot] [ipc] cb6268f05d: reaim.jobs_per_min 865% improvement

On Sun, Aug 13, 2017 at 11:05 PM, kernel test robot
<[email protected]> wrote:
>
> Greeting,
>
> FYI, we noticed an 865% improvement of reaim.jobs_per_min due to commit:
>
>
> commit: cb6268f05df684e00607762fd8ad95d515e2407f ("ipc: optimize semget/shmget/msgget for lots of keys")
> url: https://github.com/0day-ci/linux/commits/Guillaume-Knispel/ipc-optimize-semget-shmget-msgget-for-lots-of-keys/20170731-170031

Impressive!

-Kees

--
Kees Cook
Pixel Security