2020-03-26 15:16:53

by Matthew Wilcox

Subject: [PATCH] ipc: Convert ipcs_idr to XArray

From: "Matthew Wilcox (Oracle)" <[email protected]>

The XArray has better loops than the IDR has, removing the need to
open-code them. We also don't need to call idr_destroy() any more.
Allocating the ID is a little tricky due to needing to get 'seq'
correct. Open-code a variant of __xa_alloc() which lets us set the
ID and the seq before depositing the pointer in the array.

Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/ipc_namespace.h | 10 +-
include/linux/xarray.h | 1 +
ipc/ipc_sysctl.c | 14 +--
ipc/msg.c | 1 -
ipc/namespace.c | 13 +--
ipc/sem.c | 1 -
ipc/shm.c | 60 +++++------
ipc/util.c | 181 +++++++++++++++++-----------------
ipc/util.h | 4 +-
lib/xarray.c | 2 +-
10 files changed, 135 insertions(+), 152 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c309f43bde45..bdc39cc4d1dc 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -3,13 +3,13 @@
#define __IPC_NAMESPACE_H__

#include <linux/err.h>
-#include <linux/idr.h>
-#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
+#include <linux/rwsem.h>
+#include <linux/xarray.h>

struct user_namespace;

@@ -17,11 +17,11 @@ struct ipc_ids {
int in_use;
unsigned short seq;
struct rw_semaphore rwsem;
- struct idr ipcs_idr;
+ struct xarray ipcs;
int max_idx;
- int last_idx; /* For wrap around detection */
+ int next_idx;
#ifdef CONFIG_CHECKPOINT_RESTORE
- int next_id;
+ int restore_id;
#endif
struct rhashtable key_ht;
};
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 14c893433139..d79b8e3aa08d 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1501,6 +1501,7 @@ void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
+bool __xas_nomem(struct xa_state *, gfp_t);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index affd66537e87..a2df5d0e27f6 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -115,7 +115,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,

int ipc_mni = IPCMNI;
int ipc_mni_shift = IPCMNI_SHIFT;
-int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
+int ipc_min_cycle = XA_CHUNK_SIZE;

static struct ctl_table ipc_kern_table[] = {
{
@@ -196,8 +196,8 @@ static struct ctl_table ipc_kern_table[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "sem_next_id",
- .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SEM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -205,8 +205,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "msg_next_id",
- .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_MSG_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -214,8 +214,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "shm_next_id",
- .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SHM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
diff --git a/ipc/msg.c b/ipc/msg.c
index caca67368cb5..0c4fdc2f5c08 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1308,7 +1308,6 @@ void msg_init_ns(struct ipc_namespace *ns)
void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
- idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca1476ca51..e8567c3d32e9 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -96,22 +96,17 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
{
struct kern_ipc_perm *perm;
- int next_id;
- int total, in_use;
+ unsigned long index;

down_write(&ids->rwsem);

- in_use = ids->in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- perm = idr_find(&ids->ipcs_idr, next_id);
- if (perm == NULL)
- continue;
+ xa_for_each(&ids->ipcs, index, perm) {
rcu_read_lock();
ipc_lock_object(perm);
free(ns, perm);
- total++;
}
+ BUG_ON(!xa_empty(&ids->ipcs));
+
up_write(&ids->rwsem);
}

diff --git a/ipc/sem.c b/ipc/sem.c
index 3687b71151b3..8d6550ac035a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -258,7 +258,6 @@ void sem_init_ns(struct ipc_namespace *ns)
void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
- idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index ce1ca9f7c6e9..49a8e088e0de 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -129,7 +129,6 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
- idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif
@@ -348,34 +347,30 @@ static void shm_close(struct vm_area_struct *vma)
up_write(&shm_ids(ns).rwsem);
}

-/* Called with ns->shm_ids(ns).rwsem locked */
-static int shm_try_destroy_orphaned(int id, void *p, void *data)
+void shm_destroy_orphaned(struct ipc_namespace *ns)
{
- struct ipc_namespace *ns = data;
- struct kern_ipc_perm *ipcp = p;
- struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ struct kern_ipc_perm *ipcp;
+ unsigned long index;

- /*
- * We want to destroy segments without users and with already
- * exit'ed originating process.
- *
- * As shp->* are changed under rwsem, it's safe to skip shp locking.
- */
- if (shp->shm_creator != NULL)
- return 0;
+ down_write(&shm_ids(ns).rwsem);
+ xa_for_each(&shm_ids(ns).ipcs, index, ipcp) {
+ struct shmid_kernel *shp;

- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
- }
- return 0;
-}
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);

-void shm_destroy_orphaned(struct ipc_namespace *ns)
-{
- down_write(&shm_ids(ns).rwsem);
- if (shm_ids(ns).in_use)
- idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+ /*
+ * We want to destroy segments without users and with already
+ * exit'ed originating process. As shp->* are changed under
+ * rwsem, it's safe to skip shp locking.
+ */
+ if (shp->shm_creator != NULL)
+ continue;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ }
up_write(&shm_ids(ns).rwsem);
}

@@ -860,26 +855,17 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
unsigned long *swp)
{
- int next_id;
- int total, in_use;
+ struct kern_ipc_perm *ipc;
+ unsigned long index;

*rss = 0;
*swp = 0;

- in_use = shm_ids(ns).in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- struct kern_ipc_perm *ipc;
+ xa_for_each(&shm_ids(ns).ipcs, index, ipc) {
struct shmid_kernel *shp;

- ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
- if (ipc == NULL)
- continue;
shp = container_of(ipc, struct shmid_kernel, shm_perm);
-
shm_add_rss_swap(shp, rss, swp);
-
- total++;
}
}

diff --git a/ipc/util.c b/ipc/util.c
index 7acccfded7cb..723dc4b05208 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
.automatic_shrinking = true,
};

+#ifdef CONFIG_CHECKPOINT_RESTORE
+#define set_restore_id(ids, x) ids->restore_id = x
+#define get_restore_id(ids) ids->restore_id
+#else
+#define set_restore_id(ids, x) do { } while (0)
+#define get_restore_id(ids) (-1)
+#endif
+
/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below ipc_mni) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids xarray.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
@@ -117,12 +125,10 @@ void ipc_init_ids(struct ipc_ids *ids)
ids->seq = 0;
init_rwsem(&ids->rwsem);
rhashtable_init(&ids->key_ht, &ipc_kht_params);
- idr_init(&ids->ipcs_idr);
+ xa_init_flags(&ids->ipcs, XA_FLAGS_ALLOC);
ids->max_idx = -1;
- ids->last_idx = -1;
-#ifdef CONFIG_CHECKPOINT_RESTORE
- ids->next_id = -1;
-#endif
+ ids->next_idx = 0;
+ set_restore_id(ids, -1);
}

#ifdef CONFIG_PROC_FS
@@ -183,12 +189,12 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
}

/*
- * Insert new IPC object into idr tree, and set sequence number and id
+ * Insert new IPC object into xarray, and set sequence number and id
* in the correct order.
* Especially:
- * - the sequence number must be set before inserting the object into the idr,
- * because the sequence number is accessed without a lock.
- * - the id can/must be set after inserting the object into the idr.
+ * - the sequence number must be set before inserting the object into the
+ * xarray, because the sequence number is accessed without a lock.
+ * - the id can/must be set after inserting the object into the xarray.
* All accesses must be done after getting kern_ipc_perm.lock.
*
* The caller must own kern_ipc_perm.lock.of the new object.
@@ -198,64 +204,60 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
* the sequence number is incremented only when the returned ID is less than
* the last one.
*/
-static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
+static inline int ipc_id_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
- int idx, next_id = -1;
+ u32 idx;
+ int err;

-#ifdef CONFIG_CHECKPOINT_RESTORE
- next_id = ids->next_id;
- ids->next_id = -1;
-#endif
+ if (get_restore_id(ids) < 0) {
+ XA_STATE(xas, &ids->ipcs, 0);
+ int min_idx, max_idx;

- /*
- * As soon as a new object is inserted into the idr,
- * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
- * and the lockless preparations for ipc operations can start.
- * This means especially: permission checks, audit calls, allocation
- * of undo structures, ...
- *
- * Thus the object must be fully initialized, and if something fails,
- * then the full tear-down sequence must be followed.
- * (i.e.: set new->deleted, reduce refcount, call_rcu())
- */
+ max_idx = max(ids->in_use*3/2, ipc_min_cycle);
+ max_idx = min(max_idx, ipc_mni) - 1;

- if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
- int max_idx;
+ xas_lock(&xas);

- max_idx = max(ids->in_use*3/2, ipc_min_cycle);
- max_idx = min(max_idx, ipc_mni);
-
- /* allocate the idx, with a NULL struct kern_ipc_perm */
- idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
- GFP_NOWAIT);
-
- if (idx >= 0) {
- /*
- * idx got allocated successfully.
- * Now calculate the sequence number and set the
- * pointer for real.
- */
- if (idx <= ids->last_idx) {
+ min_idx = ids->next_idx;
+ new->seq = ids->seq;
+
+ /* Modified version of __xa_alloc */
+ do {
+ xas.xa_index = min_idx;
+ xas_find_marked(&xas, max_idx, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART && min_idx > 0) {
ids->seq++;
if (ids->seq >= ipcid_seq_max())
ids->seq = 0;
+ new->seq = ids->seq;
+ xas.xa_index = 0;
+ min_idx = 0;
+ xas_find_marked(&xas, max_idx, XA_FREE_MARK);
}
- ids->last_idx = idx;
-
- new->seq = ids->seq;
- /* no need for smp_wmb(), this is done
- * inside idr_replace, as part of
- * rcu_assign_pointer
- */
- idr_replace(&ids->ipcs_idr, new, idx);
- }
+ if (xas.xa_node == XAS_RESTART)
+ xas_set_err(&xas, -ENOSPC);
+ else
+ new->id = (new->seq << ipcmni_seq_shift()) +
+ xas.xa_index;
+ xas_store(&xas, new);
+ xas_clear_mark(&xas, XA_FREE_MARK);
+ } while (__xas_nomem(&xas, GFP_KERNEL));
+
+ xas_unlock(&xas);
+ err = xas_error(&xas);
+ idx = xas.xa_index;
} else {
- new->seq = ipcid_to_seqx(next_id);
- idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
- 0, GFP_NOWAIT);
+ new->id = get_restore_id(ids);
+ new->seq = ipcid_to_seqx(new->id);
+ idx = ipcid_to_idx(new->id);
+ err = xa_insert(&ids->ipcs, idx, new, GFP_KERNEL);
+ if (err == -EBUSY)
+ err = -ENOSPC;
+ set_restore_id(ids, -1);
}
- if (idx >= 0)
- new->id = (new->seq << ipcmni_seq_shift()) + idx;
+
+ if (err < 0)
+ return err;
return idx;
}

@@ -278,7 +280,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
- int idx, err;
+ int idx;

/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
@@ -289,29 +291,42 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
if (ids->in_use >= limit)
return -ENOSPC;

- idr_preload(GFP_KERNEL);
-
+ /*
+ * 2) Hold the spinlock so that nobody else can access the object
+ * once they can find it
+ */
spin_lock_init(&new->lock);
- rcu_read_lock();
spin_lock(&new->lock);
-
current_euid_egid(&euid, &egid);
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
-
new->deleted = false;

- idx = ipc_idr_alloc(ids, new);
- idr_preload_end();
+ idx = ipc_id_alloc(ids, new);
+
+ rcu_read_lock();
+
+ /*
+ * As soon as a new object is inserted into the XArray,
+ * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
+ * and the lockless preparations for ipc operations can start.
+ * This means especially: permission checks, audit calls, allocation
+ * of undo structures, ...
+ *
+ * Thus the object must be fully initialized, and if something fails,
+ * then the full tear-down sequence must be followed.
+ * (i.e.: set new->deleted, reduce refcount, call_rcu())
+ */

if (idx >= 0 && new->key != IPC_PRIVATE) {
- err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
+ int err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
idx = err;
}
}
+
if (idx < 0) {
new->deleted = true;
spin_unlock(&new->lock);
@@ -462,7 +477,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
int idx = ipcid_to_idx(ipcp->id);

- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
@@ -472,7 +487,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
idx--;
if (idx == -1)
break;
- } while (!idr_find(&ids->ipcs_idr, idx));
+ } while (!xa_load(&ids->ipcs, idx));
ids->max_idx = idx;
}
}
@@ -595,7 +610,7 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int idx = ipcid_to_idx(id);

- out = idr_find(&ids->ipcs_idr, idx);
+ out = xa_load(&ids->ipcs, idx);
if (!out)
return ERR_PTR(-EINVAL);

@@ -754,31 +769,19 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s)
static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
loff_t *new_pos)
{
+ unsigned long index = pos;
struct kern_ipc_perm *ipc;
- int total, id;
-
- total = 0;
- for (id = 0; id < pos && total < ids->in_use; id++) {
- ipc = idr_find(&ids->ipcs_idr, id);
- if (ipc != NULL)
- total++;
- }

*new_pos = pos + 1;
- if (total >= ids->in_use)
+ rcu_read_lock();
+ ipc = xa_find(&ids->ipcs, &index, ULONG_MAX, XA_PRESENT);
+ if (!ipc) {
+ rcu_read_unlock();
return NULL;
-
- for (; pos < ipc_mni; pos++) {
- ipc = idr_find(&ids->ipcs_idr, pos);
- if (ipc != NULL) {
- rcu_read_lock();
- ipc_lock_object(ipc);
- return ipc;
- }
}

- /* Out of range - return NULL to terminate iteration */
- return NULL;
+ ipc_lock_object(ipc);
+ return ipc;
}

static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos)
diff --git a/ipc/util.h b/ipc/util.h
index 5766c61aed0e..04d49db4cefa 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -27,7 +27,7 @@
*/
#define IPCMNI_SHIFT 15
#define IPCMNI_EXTEND_SHIFT 24
-#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
+#define IPCMNI_EXTEND_MIN_CYCLE (XA_CHUNK_SIZE * XA_CHUNK_SIZE)
#define IPCMNI (1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)

@@ -42,7 +42,7 @@ extern int ipc_min_cycle;
#else /* CONFIG_SYSVIPC_SYSCTL */

#define ipc_mni IPCMNI
-#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE)
+#define ipc_min_cycle ((int)XA_CHUNK_SIZE)
#define ipcmni_seq_shift() IPCMNI_SHIFT
#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */
diff --git a/lib/xarray.c b/lib/xarray.c
index e9e641d3c0c3..8488fb4eff2e 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(xas_nomem);
*
* Return: true if memory was needed, and was successfully allocated.
*/
-static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
+bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
__must_hold(xas->xa->xa_lock)
{
unsigned int lock_type = xa_lock_type(xas->xa);

base-commit: 5149100c3aebe5e640d6ff68e0b5e5a7eb8638e0
--
2.25.1


2020-04-18 20:17:07

by Andrew Morton

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

On Thu, 26 Mar 2020 08:14:18 -0700 Matthew Wilcox <[email protected]> wrote:

> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> The XArray has better loops than the IDR has, removing the need to
> open-code them. We also don't need to call idr_destroy() any more.
> Allocating the ID is a little tricky due to needing to get 'seq'
> correct. Open-code a variant of __xa_alloc() which lets us set the
> ID and the seq before depositing the pointer in the array.

hm, this goes rather deep. Manfred & Davidlohr, are you able to run
this through some testing?

>
> ...
>
> --- a/ipc/util.c
> +++ b/ipc/util.c
> @@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
> .automatic_shrinking = true,
> };
>
> +#ifdef CONFIG_CHECKPOINT_RESTORE

The code grew a few additional CONFIG_CHECKPOINT_RESTORE ifdefs.
What's going on here? Why is CRIU special in ipc/?

> +#define set_restore_id(ids, x) ids->restore_id = x
> +#define get_restore_id(ids) ids->restore_id
> +#else
> +#define set_restore_id(ids, x) do { } while (0)
> +#define get_restore_id(ids) (-1)
> +#endif

Well these are ugly. Can't all this be done in C?

>
> ...
>

2020-04-18 21:38:49

by Matthew Wilcox

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

On Sat, Apr 18, 2020 at 01:15:09PM -0700, Andrew Morton wrote:
> > --- a/ipc/util.c
> > +++ b/ipc/util.c
> > @@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
> > .automatic_shrinking = true,
> > };
> >
> > +#ifdef CONFIG_CHECKPOINT_RESTORE
>
> The code grew a few additional CONFIG_CHECKPOINT_RESTORE ifdefs.
> What's going on here? Why is CRIU special in ipc/?

"grew a few"? I added (this) one and deleted two others. From in the
middle of functions, like we usually prefer.

I mean, this is why we need something like this:

@@ -17,11 +17,11 @@ struct ipc_ids {
...
#ifdef CONFIG_CHECKPOINT_RESTORE
- int next_id;
+ int restore_id;
#endif

> > +#define set_restore_id(ids, x) ids->restore_id = x
> > +#define get_restore_id(ids) ids->restore_id
> > +#else
> > +#define set_restore_id(ids, x) do { } while (0)
> > +#define get_restore_id(ids) (-1)
> > +#endif
>
> Well these are ugly. Can't all this be done in C?

Would you rather see it done as:

static inline void set_restore_id(struct ipc_ids *ids, int id)
{
#ifdef CONFIG_CHECKPOINT_RESTORE
ids->restore_id = id;
#endif
}

static inline int get_restore_id(struct ipc_ids *ids)
{
#ifdef CONFIG_CHECKPOINT_RESTORE
return ids->restore_id;
#else
return -1;
#endif
}

2020-04-18 22:10:01

by Andrew Morton

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

On Sat, 18 Apr 2020 14:34:51 -0700 Matthew Wilcox <[email protected]> wrote:

> On Sat, Apr 18, 2020 at 01:15:09PM -0700, Andrew Morton wrote:
> > > --- a/ipc/util.c
> > > +++ b/ipc/util.c
> > > @@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
> > > .automatic_shrinking = true,
> > > };
> > >
> > > +#ifdef CONFIG_CHECKPOINT_RESTORE
> >
> > The code grew a few additional CONFIG_CHECKPOINT_RESTORE ifdefs.
> > What's going on here? Why is CRIU special in ipc/?
>
> "grew a few"? I added (this) one and deleted two others. From in the
> middle of functions, like we usually prefer.
>

Oh.

>
> @@ -17,11 +17,11 @@ struct ipc_ids {
> ...
> #ifdef CONFIG_CHECKPOINT_RESTORE
> - int next_id;
> + int restore_id;
> #endif
>
> > > +#define set_restore_id(ids, x) ids->restore_id = x
> > > +#define get_restore_id(ids) ids->restore_id
> > > +#else
> > > +#define set_restore_id(ids, x) do { } while (0)
> > > +#define get_restore_id(ids) (-1)
> > > +#endif
> >
> > Well these are ugly. Can't all this be done in C?
>
> Would you rather see it done as:
>
> static inline void set_restore_id(struct ipc_ids *ids, int id)
> {
> #ifdef CONFIG_CHECKPOINT_RESTORE
> ids->restore_id = id;
> #endif
> }
>
> static inline int get_restore_id(struct ipc_ids *ids)
> {
> #ifdef CONFIG_CHECKPOINT_RESTORE
> return ids->restore_id;
> #else
> return -1;
> #endif
> }

Looks nicer. Has type checking regardless of Kconfig. Doesn't evaluate
its arguments as an lvalue and an rvalue in one config and not at all in
the other. Doesn't risk unused-var warnings dependent on Kconfig. Etc.

Could also do

#ifdef CONFIG_CHECKPOINT_RESTORE
static inline void set_restore_id(struct ipc_ids *ids, int id)
{
ids->restore_id = id;
}

static inline int get_restore_id(struct ipc_ids *ids)
{
return ids->restore_id;
}
#else
...

2020-04-20 15:37:17

by Manfred Spraul

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

Hi Matthew,

On 3/26/20 4:14 PM, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <[email protected]>
>
> The XArray has better loops than the IDR has, removing the need to
> open-code them. We also don't need to call idr_destroy() any more.
> Allocating the ID is a little tricky due to needing to get 'seq'
> correct. Open-code a variant of __xa_alloc() which lets us set the
> ID and the seq before depositing the pointer in the array.
>
> Signed-off-by: Matthew Wilcox <[email protected]>
> ---

> - max_idx = max(ids->in_use*3/2, ipc_min_cycle);
> - max_idx = min(max_idx, ipc_mni);
> -
> - /* allocate the idx, with a NULL struct kern_ipc_perm */
> - idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
> - GFP_NOWAIT);
> -
> - if (idx >= 0) {
> - /*
> - * idx got allocated successfully.
> - * Now calculate the sequence number and set the
> - * pointer for real.
> - */
> - if (idx <= ids->last_idx) {
> + min_idx = ids->next_idx;
> + new->seq = ids->seq;
> +
> + /* Modified version of __xa_alloc */
> + do {
> + xas.xa_index = min_idx;
> + xas_find_marked(&xas, max_idx, XA_FREE_MARK);
> + if (xas.xa_node == XAS_RESTART && min_idx > 0) {
> ids->seq++;
> if (ids->seq >= ipcid_seq_max())
> ids->seq = 0;
> + new->seq = ids->seq;
> + xas.xa_index = 0;
> + min_idx = 0;
> + xas_find_marked(&xas, max_idx, XA_FREE_MARK);
> }

Is it necessary to have that many details of the XArray in ipc/util?

This function is not performance critical.

The core requirement is that ipc_obtain_object_check() must scale.

Would it be possible to use something like

    xa_alloc(,entry=NULL,)

    new->seq = ...

    xa_store(,entry=new,);

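Roughly like the sketch below, i.e. reserve the index first and publish the
pointer afterwards. This is only a sketch: it leaves out the cyclic allocation
and the seq wrap-around handling, and it assumes xa_alloc() with a NULL entry
merely reserves the slot (ipc_id_alloc_sketch is a made-up name).

static int ipc_id_alloc_sketch(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
	u32 idx;
	int err;

	/* Reserve an index; a NULL entry stays invisible to xa_load()
	 * until the real pointer is stored. */
	err = xa_alloc(&ids->ipcs, &idx, NULL,
		       XA_LIMIT(0, ipc_mni - 1), GFP_KERNEL);
	if (err < 0)
		return err;

	/* seq is read locklessly by lookups, so set it first ... */
	new->seq = ids->seq;
	new->id = (new->seq << ipcmni_seq_shift()) + idx;

	/* ... then publish; xa_store() provides the release barrier. */
	xa_store(&ids->ipcs, idx, new, GFP_KERNEL);

	return idx;
}
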
> - ids->last_idx = idx;
> -
> - new->seq = ids->seq;
> - /* no need for smp_wmb(), this is done
> - * inside idr_replace, as part of
> - * rcu_assign_pointer
> - */

Could you leave the memory barrier comments in the code?

The rcu_assign_pointer() is the first hand-off from semget() or msgget().

Before the rcu_assign_pointer, e.g. semop() calls would return -EINVAL.

After the rcu_assign_pointer, semwhatever() must work - and e.g. the
permission checks are lockless.
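The lockless path that depends on this ordering looks roughly like the
sketch below (a simplification of the ipc_obtain_object_idr() /
ipc_obtain_object_check() pair, not the exact code):

	rcu_read_lock();
	ipcp = xa_load(&ids->ipcs, ipcid_to_idx(id));
	/* ipcp->seq is read without holding ipcp->lock, so it must already
	 * be correct by the time the pointer is visible in the array: */
	if (!ipcp || ipcid_to_seqx(id) != ipcp->seq)
		ipcp = ERR_PTR(-EINVAL);	/* unpublished or stale id */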

> - idr_replace(&ids->ipcs_idr, new, idx);
> - }
> + if (xas.xa_node == XAS_RESTART)
> + xas_set_err(&xas, -ENOSPC);
> + else
> + new->id = (new->seq << ipcmni_seq_shift()) +
> + xas.xa_index;

Setting new->id should remain at the end, outside any locking:

The variable has no special protection; access is only allowed after
proper locking, so there is no need to have the initialization in the middle.

What is crucial is that the final value of new->seq is visible to all
cpus before storing the pointer.
> + xas_store(&xas, new);
> + xas_clear_mark(&xas, XA_FREE_MARK);
> + } while (__xas_nomem(&xas, GFP_KERNEL));
> +

Just for my curiosity:

If the xas is in an invalid state, then xas_store() will not store anything.
Thus the loop will not store "new" multiple times; it will be stored
only once.

@@ -472,7 +487,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
> idx--;
> if (idx == -1)
> break;
> - } while (!idr_find(&ids->ipcs_idr, idx));
> + } while (!xa_load(&ids->ipcs, idx));
> ids->max_idx = idx;
> }
> }

Is there an xa_find_last() function?

It is outside of any hot path, I have a patch that does a binary search
with idr_get_next().

If there is no xa_find_last(), then I would rebase that patch.
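
For reference, the binary search could be redone on top of xa_find() along
these lines (a hypothetical sketch, not the idr_get_next() patch itself;
ipc_find_max_idx is a made-up name):

static int ipc_find_max_idx(struct ipc_ids *ids, int limit)
{
	long lo = 0, hi = limit;
	int max_idx = -1;

	while (lo <= hi) {
		unsigned long index = lo + (hi - lo) / 2;
		long mid = index;

		/* xa_find() returns the first present entry in [index, hi] */
		if (xa_find(&ids->ipcs, &index, hi, XA_PRESENT)) {
			max_idx = index;	/* in use at 'index' */
			lo = index + 1;		/* look for a later one */
		} else {
			hi = mid - 1;		/* nothing in [mid, hi] */
		}
	}

	return max_idx;
}

That is O(log(limit)) xa_find() calls and could replace the downward
xa_load() loop in ipc_rmid().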


--

    Manfred

2020-04-20 17:07:46

by Matthew Wilcox

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

On Mon, Apr 20, 2020 at 05:35:20PM +0200, Manfred Spraul wrote:
> > - max_idx = max(ids->in_use*3/2, ipc_min_cycle);
> > - max_idx = min(max_idx, ipc_mni);
> > -
> > - /* allocate the idx, with a NULL struct kern_ipc_perm */
> > - idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
> > - GFP_NOWAIT);
> > -
> > - if (idx >= 0) {
> > - /*
> > - * idx got allocated successfully.
> > - * Now calculate the sequence number and set the
> > - * pointer for real.
> > - */
> > - if (idx <= ids->last_idx) {
> > + min_idx = ids->next_idx;
> > + new->seq = ids->seq;
> > +
> > + /* Modified version of __xa_alloc */
> > + do {
> > + xas.xa_index = min_idx;
> > + xas_find_marked(&xas, max_idx, XA_FREE_MARK);
> > + if (xas.xa_node == XAS_RESTART && min_idx > 0) {
> > ids->seq++;
> > if (ids->seq >= ipcid_seq_max())
> > ids->seq = 0;
> > + new->seq = ids->seq;
> > + xas.xa_index = 0;
> > + min_idx = 0;
> > + xas_find_marked(&xas, max_idx, XA_FREE_MARK);
> > }
>
> Is it necessary to have that many details of the XArray in ipc/util?
>
> This function is not performance critical.
>
> The core requirement is that ipc_obtain_object_check() must scale.
>
> Would it be possible to use something like
>
>     xa_alloc(,entry=NULL,)
>
>     new->seq = ...
>
>     xa_store(,entry=new,);

Yes, that would absolutely be possible, and some users do exactly that.
I thought that creating a new message queue / semaphore set / shared
memory segment would be performance sensitive.

> > - ids->last_idx = idx;
> > -
> > - new->seq = ids->seq;
> > - /* no need for smp_wmb(), this is done
> > - * inside idr_replace, as part of
> > - * rcu_assign_pointer
> > - */
>
> Could you leave the memory barrier comments in the code?
>
> The rcu_assign_pointer() is the first hand-off from semget() or msgget().
>
> Before the rcu_assign_pointer, e.g. semop() calls would return -EINVAL.
>
> After the rcu_assign_pointer, semwhatever() must work - and e.g. the
> permission checks are lockless.

How about simply:
/* xa_store contains a memory barrier */

> > - idr_replace(&ids->ipcs_idr, new, idx);
> > - }
> > + if (xas.xa_node == XAS_RESTART)
> > + xas_set_err(&xas, -ENOSPC);
> > + else
> > + new->id = (new->seq << ipcmni_seq_shift()) +
> > + xas.xa_index;
>
> Setting new->id should remain at the end, outside any locking:
>
> The variable has no special protection; access is only allowed after proper
> locking, so there is no need to have the initialization in the middle.
>
> What is crucial is that the final value of new->seq is visible to all cpus
> before storing the pointer.

The IPC locking is weird. Most users spin_lock the xarray/idr/radix
tree for modifications, and on the read-side use RCU to protect the
lookup and a refcount on the object looked up in it (after which,
RCU is unlocked). IPC seems to hold the RCU lock much longer, and it
has a per-object spinlock rather than refcount. And it has an rwsem.
It feels like it could be much simpler, but I haven't had time to dig
into it and figure out why it's so different from all the other users.
Maybe it's just older code.
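
The common pattern is roughly this (a generic sketch, not proposed IPC code):

	rcu_read_lock();
	obj = xa_load(&xa, index);
	if (obj && !refcount_inc_not_zero(&obj->refcount))
		obj = NULL;		/* lost a race with removal */
	rcu_read_unlock();

	/* obj, if non-NULL, is now pinned by its refcount; it is used
	 * without the RCU lock, and modifications to the array itself
	 * are serialised by xa_lock(). */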

> > + xas_store(&xas, new);
> > + xas_clear_mark(&xas, XA_FREE_MARK);
> > + } while (__xas_nomem(&xas, GFP_KERNEL));
> > +
>
> Just for my curiosity:
>
> If the xas is in an invalid state, then xas_store() will not store anything.
> Thus the loop will not store "new" multiple times; it will be stored only
> once.

Correct, although we're going to delete this loop entirely.

> @@ -472,7 +487,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
> > idx--;
> > if (idx == -1)
> > break;
> > - } while (!idr_find(&ids->ipcs_idr, idx));
> > + } while (!xa_load(&ids->ipcs, idx));
> > ids->max_idx = idx;
> > }
> > }
>
> Is there an xa_find_last() function?
>
> It is outside of any hot path; I have a patch that does a binary search with
> idr_get_next().
>
> If there is no xa_find_last(), then I would rebase that patch.

There is not currently an xa_find_last(). It shouldn't be too hard
to add; start at the top of the tree and walk backwards in each node
until finding a non-NULL entry. Of course, it'll need documentation
and test cases.

2020-04-21 18:37:32

by Matthew Wilcox

Subject: Re: [PATCH] ipc: Convert ipcs_idr to XArray

On Mon, Apr 20, 2020 at 10:06:03AM -0700, Matthew Wilcox wrote:
> On Mon, Apr 20, 2020 at 05:35:20PM +0200, Manfred Spraul wrote:
> > Would it be possible to use something like
> >
> >     xa_alloc(,entry=NULL,)
> >
> >     new->seq = ...
> >
> >     xa_store(,entry=new,);
>
> Yes, that would absolutely be possible, and some users do exactly that.
> I thought that creating a new message queue / semaphore set / shared
> memory segment would be performance sensitive.

Updated:

From 386556c2b1544463c2a05e0f48da2187960ec544 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <[email protected]>
Date: Sun, 4 Aug 2019 07:20:30 -0400
Subject: [PATCH] ipc: Convert ipcs_idr to XArray

The XArray has better loops than the IDR has, removing the need to
open-code them. We also don't need to call idr_destroy() any more.
Allocating the ID is a little tricky due to needing to get 'seq'
correct. Open-code a variant of __xa_alloc() which lets us set the
ID and the seq before depositing the pointer in the array.

Signed-off-by: Matthew Wilcox <[email protected]>
---
include/linux/ipc_namespace.h | 10 +-
ipc/ipc_sysctl.c | 14 +--
ipc/msg.c | 1 -
ipc/namespace.c | 13 +--
ipc/sem.c | 1 -
ipc/shm.c | 60 +++++-------
ipc/util.c | 171 ++++++++++++++++------------------
ipc/util.h | 4 +-
8 files changed, 122 insertions(+), 152 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c309f43bde45..bdc39cc4d1dc 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -3,13 +3,13 @@
#define __IPC_NAMESPACE_H__

#include <linux/err.h>
-#include <linux/idr.h>
-#include <linux/rwsem.h>
#include <linux/notifier.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
+#include <linux/rwsem.h>
+#include <linux/xarray.h>

struct user_namespace;

@@ -17,11 +17,11 @@ struct ipc_ids {
int in_use;
unsigned short seq;
struct rw_semaphore rwsem;
- struct idr ipcs_idr;
+ struct xarray ipcs;
int max_idx;
- int last_idx; /* For wrap around detection */
+ int next_idx;
#ifdef CONFIG_CHECKPOINT_RESTORE
- int next_id;
+ int restore_id;
#endif
struct rhashtable key_ht;
};
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index affd66537e87..a2df5d0e27f6 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -115,7 +115,7 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,

int ipc_mni = IPCMNI;
int ipc_mni_shift = IPCMNI_SHIFT;
-int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
+int ipc_min_cycle = XA_CHUNK_SIZE;

static struct ctl_table ipc_kern_table[] = {
{
@@ -196,8 +196,8 @@ static struct ctl_table ipc_kern_table[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
{
.procname = "sem_next_id",
- .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SEM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -205,8 +205,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "msg_next_id",
- .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_MSG_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
@@ -214,8 +214,8 @@ static struct ctl_table ipc_kern_table[] = {
},
{
.procname = "shm_next_id",
- .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
- .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
+ .data = &init_ipc_ns.ids[IPC_SHM_IDS].restore_id,
+ .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].restore_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
diff --git a/ipc/msg.c b/ipc/msg.c
index caca67368cb5..0c4fdc2f5c08 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1308,7 +1308,6 @@ void msg_init_ns(struct ipc_namespace *ns)
void msg_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &msg_ids(ns), freeque);
- idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
}
#endif
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca1476ca51..e8567c3d32e9 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -96,22 +96,17 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
void (*free)(struct ipc_namespace *, struct kern_ipc_perm *))
{
struct kern_ipc_perm *perm;
- int next_id;
- int total, in_use;
+ unsigned long index;

down_write(&ids->rwsem);

- in_use = ids->in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- perm = idr_find(&ids->ipcs_idr, next_id);
- if (perm == NULL)
- continue;
+ xa_for_each(&ids->ipcs, index, perm) {
rcu_read_lock();
ipc_lock_object(perm);
free(ns, perm);
- total++;
}
+ BUG_ON(!xa_empty(&ids->ipcs));
+
up_write(&ids->rwsem);
}

diff --git a/ipc/sem.c b/ipc/sem.c
index 3687b71151b3..8d6550ac035a 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -258,7 +258,6 @@ void sem_init_ns(struct ipc_namespace *ns)
void sem_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &sem_ids(ns), freeary);
- idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht);
}
#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index 0ba6add05b35..7922c4f65234 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -129,7 +129,6 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
void shm_exit_ns(struct ipc_namespace *ns)
{
free_ipcs(ns, &shm_ids(ns), do_shm_rmid);
- idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr);
rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht);
}
#endif
@@ -348,34 +347,30 @@ static void shm_close(struct vm_area_struct *vma)
up_write(&shm_ids(ns).rwsem);
}

-/* Called with ns->shm_ids(ns).rwsem locked */
-static int shm_try_destroy_orphaned(int id, void *p, void *data)
+void shm_destroy_orphaned(struct ipc_namespace *ns)
{
- struct ipc_namespace *ns = data;
- struct kern_ipc_perm *ipcp = p;
- struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
+ struct kern_ipc_perm *ipcp;
+ unsigned long index;

- /*
- * We want to destroy segments without users and with already
- * exit'ed originating process.
- *
- * As shp->* are changed under rwsem, it's safe to skip shp locking.
- */
- if (shp->shm_creator != NULL)
- return 0;
+ down_write(&shm_ids(ns).rwsem);
+ xa_for_each(&shm_ids(ns).ipcs, index, ipcp) {
+ struct shmid_kernel *shp;

- if (shm_may_destroy(ns, shp)) {
- shm_lock_by_ptr(shp);
- shm_destroy(ns, shp);
- }
- return 0;
-}
+ shp = container_of(ipcp, struct shmid_kernel, shm_perm);

-void shm_destroy_orphaned(struct ipc_namespace *ns)
-{
- down_write(&shm_ids(ns).rwsem);
- if (shm_ids(ns).in_use)
- idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+ /*
+ * We want to destroy segments without users and with already
+ * exit'ed originating process. As shp->* are changed under
+ * rwsem, it's safe to skip shp locking.
+ */
+ if (shp->shm_creator != NULL)
+ continue;
+
+ if (shm_may_destroy(ns, shp)) {
+ shm_lock_by_ptr(shp);
+ shm_destroy(ns, shp);
+ }
+ }
up_write(&shm_ids(ns).rwsem);
}

@@ -860,26 +855,17 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
unsigned long *swp)
{
- int next_id;
- int total, in_use;
+ struct kern_ipc_perm *ipc;
+ unsigned long index;

*rss = 0;
*swp = 0;

- in_use = shm_ids(ns).in_use;
-
- for (total = 0, next_id = 0; total < in_use; next_id++) {
- struct kern_ipc_perm *ipc;
+ xa_for_each(&shm_ids(ns).ipcs, index, ipc) {
struct shmid_kernel *shp;

- ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
- if (ipc == NULL)
- continue;
shp = container_of(ipc, struct shmid_kernel, shm_perm);
-
shm_add_rss_swap(shp, rss, swp);
-
- total++;
}
}

diff --git a/ipc/util.c b/ipc/util.c
index 7acccfded7cb..0f6b0e0aa17e 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -104,12 +104,20 @@ static const struct rhashtable_params ipc_kht_params = {
.automatic_shrinking = true,
};

+#ifdef CONFIG_CHECKPOINT_RESTORE
+#define set_restore_id(ids, x) ids->restore_id = x
+#define get_restore_id(ids) ids->restore_id
+#else
+#define set_restore_id(ids, x) do { } while (0)
+#define get_restore_id(ids) (-1)
+#endif
+
/**
* ipc_init_ids - initialise ipc identifiers
* @ids: ipc identifier set
*
* Set up the sequence range to use for the ipc identifier range (limited
- * below ipc_mni) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids xarray.
*/
void ipc_init_ids(struct ipc_ids *ids)
{
@@ -117,12 +125,10 @@ void ipc_init_ids(struct ipc_ids *ids)
ids->seq = 0;
init_rwsem(&ids->rwsem);
rhashtable_init(&ids->key_ht, &ipc_kht_params);
- idr_init(&ids->ipcs_idr);
+ xa_init_flags(&ids->ipcs, XA_FLAGS_ALLOC);
ids->max_idx = -1;
- ids->last_idx = -1;
-#ifdef CONFIG_CHECKPOINT_RESTORE
- ids->next_id = -1;
-#endif
+ ids->next_idx = 0;
+ set_restore_id(ids, -1);
}

#ifdef CONFIG_PROC_FS
@@ -183,12 +189,12 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
}

/*
- * Insert new IPC object into idr tree, and set sequence number and id
+ * Insert new IPC object into xarray, and set sequence number and id
* in the correct order.
* Especially:
- * - the sequence number must be set before inserting the object into the idr,
- * because the sequence number is accessed without a lock.
- * - the id can/must be set after inserting the object into the idr.
+ * - the sequence number must be set before inserting the object into the
+ * xarray, because the sequence number is accessed without a lock.
+ * - the id can/must be set after inserting the object into the xarray.
* All accesses must be done after getting kern_ipc_perm.lock.
*
* The caller must own kern_ipc_perm.lock.of the new object.
@@ -198,64 +204,48 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
* the sequence number is incremented only when the returned ID is less than
* the last one.
*/
-static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
+static inline int ipc_id_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
{
- int idx, next_id = -1;
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
- next_id = ids->next_id;
- ids->next_id = -1;
-#endif
-
- /*
- * As soon as a new object is inserted into the idr,
- * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
- * and the lockless preparations for ipc operations can start.
- * This means especially: permission checks, audit calls, allocation
- * of undo structures, ...
- *
- * Thus the object must be fully initialized, and if something fails,
- * then the full tear-down sequence must be followed.
- * (i.e.: set new->deleted, reduce refcount, call_rcu())
- */
+ u32 idx;
+ int err;

- if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
+ if (get_restore_id(ids) < 0) {
int max_idx;

max_idx = max(ids->in_use*3/2, ipc_min_cycle);
- max_idx = min(max_idx, ipc_mni);
-
- /* allocate the idx, with a NULL struct kern_ipc_perm */
- idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
- GFP_NOWAIT);
-
- if (idx >= 0) {
- /*
- * idx got allocated successfully.
- * Now calculate the sequence number and set the
- * pointer for real.
- */
- if (idx <= ids->last_idx) {
- ids->seq++;
- if (ids->seq >= ipcid_seq_max())
- ids->seq = 0;
- }
- ids->last_idx = idx;
+ max_idx = min(max_idx, ipc_mni) - 1;

+ xa_lock(&ids->ipcs);
+
+ err = __xa_alloc_cyclic(&ids->ipcs, &idx, NULL,
+ XA_LIMIT(0, max_idx), &ids->next_idx,
+ GFP_KERNEL);
+ if (err == 1) {
+ ids->seq++;
+ if (ids->seq >= ipcid_seq_max())
+ ids->seq = 0;
+ }
+
+ if (err >= 0) {
new->seq = ids->seq;
- /* no need for smp_wmb(), this is done
- * inside idr_replace, as part of
- * rcu_assign_pointer
- */
- idr_replace(&ids->ipcs_idr, new, idx);
+ new->id = (new->seq << ipcmni_seq_shift()) + idx;
+ /* xa_store contains a write barrier */
+ __xa_store(&ids->ipcs, idx, new, GFP_KERNEL);
}
+
+ xa_unlock(&ids->ipcs);
} else {
- new->seq = ipcid_to_seqx(next_id);
- idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
- 0, GFP_NOWAIT);
+ new->id = get_restore_id(ids);
+ new->seq = ipcid_to_seqx(new->id);
+ idx = ipcid_to_idx(new->id);
+ err = xa_insert(&ids->ipcs, idx, new, GFP_KERNEL);
+ set_restore_id(ids, -1);
}
- if (idx >= 0)
- new->id = (new->seq << ipcmni_seq_shift()) + idx;
+
+ if (err == -EBUSY)
+ return -ENOSPC;
+ if (err < 0)
+ return err;
return idx;
}

@@ -278,7 +268,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
{
kuid_t euid;
kgid_t egid;
- int idx, err;
+ int idx;

/* 1) Initialize the refcount so that ipc_rcu_putref works */
refcount_set(&new->refcount, 1);
@@ -289,29 +279,42 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
if (ids->in_use >= limit)
return -ENOSPC;

- idr_preload(GFP_KERNEL);
-
+ /*
+ * 2) Hold the spinlock so that nobody else can access the object
+ * once they can find it
+ */
spin_lock_init(&new->lock);
- rcu_read_lock();
spin_lock(&new->lock);
-
current_euid_egid(&euid, &egid);
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
-
new->deleted = false;

- idx = ipc_idr_alloc(ids, new);
- idr_preload_end();
+ idx = ipc_id_alloc(ids, new);
+
+ rcu_read_lock();
+
+ /*
+ * As soon as a new object is inserted into the XArray,
+ * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it,
+ * and the lockless preparations for ipc operations can start.
+ * This means especially: permission checks, audit calls, allocation
+ * of undo structures, ...
+ *
+ * Thus the object must be fully initialized, and if something fails,
+ * then the full tear-down sequence must be followed.
+ * (i.e.: set new->deleted, reduce refcount, call_rcu())
+ */

if (idx >= 0 && new->key != IPC_PRIVATE) {
- err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
+ int err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
ipc_kht_params);
if (err < 0) {
- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
idx = err;
}
}
+
if (idx < 0) {
new->deleted = true;
spin_unlock(&new->lock);
@@ -462,7 +465,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
{
int idx = ipcid_to_idx(ipcp->id);

- idr_remove(&ids->ipcs_idr, idx);
+ xa_erase(&ids->ipcs, idx);
ipc_kht_remove(ids, ipcp);
ids->in_use--;
ipcp->deleted = true;
@@ -472,7 +475,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
idx--;
if (idx == -1)
break;
- } while (!idr_find(&ids->ipcs_idr, idx));
+ } while (!xa_load(&ids->ipcs, idx));
ids->max_idx = idx;
}
}
@@ -595,7 +598,7 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
struct kern_ipc_perm *out;
int idx = ipcid_to_idx(id);

- out = idr_find(&ids->ipcs_idr, idx);
+ out = xa_load(&ids->ipcs, idx);
if (!out)
return ERR_PTR(-EINVAL);

@@ -754,31 +757,19 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s)
static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
loff_t *new_pos)
{
+ unsigned long index = pos;
struct kern_ipc_perm *ipc;
- int total, id;
-
- total = 0;
- for (id = 0; id < pos && total < ids->in_use; id++) {
- ipc = idr_find(&ids->ipcs_idr, id);
- if (ipc != NULL)
- total++;
- }

*new_pos = pos + 1;
- if (total >= ids->in_use)
+ rcu_read_lock();
+ ipc = xa_find(&ids->ipcs, &index, ULONG_MAX, XA_PRESENT);
+ if (!ipc) {
+ rcu_read_unlock();
return NULL;
-
- for (; pos < ipc_mni; pos++) {
- ipc = idr_find(&ids->ipcs_idr, pos);
- if (ipc != NULL) {
- rcu_read_lock();
- ipc_lock_object(ipc);
- return ipc;
- }
}

- /* Out of range - return NULL to terminate iteration */
- return NULL;
+ ipc_lock_object(ipc);
+ return ipc;
}

static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos)
diff --git a/ipc/util.h b/ipc/util.h
index 5766c61aed0e..04d49db4cefa 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -27,7 +27,7 @@
*/
#define IPCMNI_SHIFT 15
#define IPCMNI_EXTEND_SHIFT 24
-#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
+#define IPCMNI_EXTEND_MIN_CYCLE (XA_CHUNK_SIZE * XA_CHUNK_SIZE)
#define IPCMNI (1 << IPCMNI_SHIFT)
#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT)

@@ -42,7 +42,7 @@ extern int ipc_min_cycle;
#else /* CONFIG_SYSVIPC_SYSCTL */

#define ipc_mni IPCMNI
-#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE)
+#define ipc_min_cycle ((int)XA_CHUNK_SIZE)
#define ipcmni_seq_shift() IPCMNI_SHIFT
#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1)
#endif /* CONFIG_SYSVIPC_SYSCTL */
--
2.26.1