2022-07-15 12:02:38

by Brian Foster

Subject: [PATCH 0/3] pid: replace idr api with xarray

Hi all,

This series is a few patches to switch struct pid management over from
the idr api to the xarray api. The underlying data structures are
already the same between both apis, but the idr relies on the old and
slightly customized radix-tree implementation to accomplish things like
efficient free id tracking, which xarray already supports directly.
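
To make the "supports directly" bit concrete, here is a minimal sketch
of the two allocation interfaces side by side. This is not code from the
series and the "foo" names are made up; it only illustrates that the idr
keeps its cyclic cursor internal while the xarray tracks free slots
itself and advances a caller-owned cursor:

#include <linux/idr.h>
#include <linux/xarray.h>

static DEFINE_IDR(foo_idr);		/* cursor hidden inside the idr */
static DEFINE_XARRAY_ALLOC(foo_xa);	/* free-slot tracking built in */
static u32 foo_next;			/* cursor owned by the caller */

static int foo_alloc_id_idr(void *foo)
{
	/* returns the new id (>= 0) or a negative errno */
	return idr_alloc_cyclic(&foo_idr, foo, 1, 0, GFP_KERNEL);
}

static int foo_alloc_id_xa(void *foo, u32 *id)
{
	/* returns 0 (or 1 on wraparound), stores the new id via @id */
	return xa_alloc_cyclic(&foo_xa, id, foo, XA_LIMIT(1, INT_MAX),
			       &foo_next, GFP_KERNEL);
}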

This is all based on a prototype patch[1] from Willy that fell out of
discussion on a separate series aimed at improving /proc readdir
performance using radix-tree tags (to be replaced with xarray marks).
I've basically split it up into a few smaller patches, made some minor
tweaks, and ran some tests on the result.

Willy,

Re: the above, I've included your s-o-b on each of the patches. I'm not
sure what your preference or the proper etiquette is here, so let me know
if you'd like me to change the authorship or tags in any way.

Brian

[1] https://lore.kernel.org/linux-fsdevel/[email protected]/

Brian Foster (3):
pid: replace pidmap_lock with xarray lock
pid: split cyclic id allocation cursor from idr
pid: switch pid_namespace from idr to xarray

arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/pid_namespace.h | 9 +-
include/linux/threads.h | 2 +-
init/main.c | 3 +-
kernel/pid.c | 133 +++++++++++-----------
kernel/pid_namespace.c | 23 ++--
7 files changed, 85 insertions(+), 89 deletions(-)

--
2.35.3


2022-07-15 12:27:53

by Brian Foster

Subject: [PATCH 2/3] pid: split cyclic id allocation cursor from idr

As a next step in separating pid allocation from the idr, split the
cyclic pid allocation cursor out of the idr and lift it into struct
pid_namespace. Note that this involves temporarily
open-coding the cursor increment on allocation, but this is cleaned
up in the subsequent patch.
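
For what it's worth, the locking rules for the new field are: pid_next is
advanced under the idr/xarray lock in alloc_pid(), while /proc/loadavg and
the pid_max sysctl handler access it without taking that lock, hence the
READ_ONCE()/WRITE_ONCE() in those paths. A couple of hypothetical helpers
(not part of the patch) that spell this out:

#include <linux/pid_namespace.h>

/* lockless reader, as used by the loadavg and sysctl read paths */
static inline pid_t pidns_last_pid(struct pid_namespace *ns)
{
	return READ_ONCE(ns->pid_next) - 1;
}

/* lockless writer, as used by the sysctl write path */
static inline void pidns_set_last_pid(struct pid_namespace *ns, pid_t last)
{
	WRITE_ONCE(ns->pid_next, last + 1);
}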

Signed-off-by: Matthew Wilcox <[email protected]>
Signed-off-by: Brian Foster <[email protected]>
---
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 6 ++++--
kernel/pid_namespace.c | 4 ++--
5 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 99bd027a7f7c..a2ed928d7658 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1072,7 +1072,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
LOAD_INT(c), LOAD_FRAC(c),
count_active_contexts(),
atomic_read(&nr_spu_contexts),
- idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
+ READ_ONCE(task_active_pid_ns(current)->pid_next) - 1);
return 0;
}
#endif
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index f32878d9a39f..62f89d549582 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -21,7 +21,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- idr_get_cursor(&task_active_pid_ns(current)->idr) - 1);
+ READ_ONCE(task_active_pid_ns(current)->pid_next) - 1);
return 0;
}

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 07481bb87d4e..82c72482019d 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -18,6 +18,7 @@ struct fs_pin;

struct pid_namespace {
struct idr idr;
+ unsigned int pid_next;
struct rcu_head rcu;
unsigned int pid_allocated;
struct task_struct *child_reaper;
diff --git a/kernel/pid.c b/kernel/pid.c
index 72a6e9d0db81..409303ada383 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -75,6 +75,7 @@ int pid_max_max = PID_MAX_LIMIT;
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
+ .pid_next = 0,
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
@@ -208,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+ if (tmp->pid_next > RESERVED_PIDS)
pid_min = RESERVED_PIDS;

/*
@@ -217,6 +218,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
+ tmp->pid_next = nr + 1;
}
xa_unlock_irq(&tmp->idr.idr_rt);
idr_preload_end();
@@ -278,7 +280,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,

/* On failure to allocate the first pid, reset the state */
if (tmp == ns && tmp->pid_allocated == PIDNS_ADDING)
- idr_set_cursor(&ns->idr, 0);
+ ns->pid_next = 0;

idr_remove(&tmp->idr, upid->nr);
xa_unlock_irq(&tmp->idr.idr_rt);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index f4f8cb0435b4..a53d20c5c85e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -272,12 +272,12 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/

- next = idr_get_cursor(&pid_ns->idr) - 1;
+ next = READ_ONCE(pid_ns->pid_next) - 1;

tmp.data = &next;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
- idr_set_cursor(&pid_ns->idr, next + 1);
+ WRITE_ONCE(pid_ns->pid_next, next + 1);

return ret;
}
--
2.35.3

2022-07-15 12:29:29

by Brian Foster

Subject: [PATCH 1/3] pid: replace pidmap_lock with xarray lock

As a first step to changing the struct pid tracking code from the
idr over to the xarray, replace the custom pidmap_lock spinlock with
the internal lock associated with the underlying xarray. This is
effectively equivalent to using idr_lock() and friends, but since
the goal is to disentangle from the idr, move directly to the
underlying xarray api.
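
In case the &ns->idr.idr_rt expressions below look odd: the idr_rt member
is a struct radix_tree_root, which nowadays is just a #define for struct
xarray, so xa_lock_irq() and friends take the spinlock already embedded
in the idr rather than a separate pidmap_lock. A minimal sketch of the
pattern (not a hunk from this patch, the helper name is made up):

#include <linux/idr.h>
#include <linux/xarray.h>

static void example_locked_remove(struct idr *idr, unsigned long nr)
{
	unsigned long flags;

	/* was: spin_lock_irqsave(&pidmap_lock, flags) */
	xa_lock_irqsave(&idr->idr_rt, flags);
	idr_remove(idr, nr);
	xa_unlock_irqrestore(&idr->idr_rt, flags);
}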

Signed-off-by: Matthew Wilcox <[email protected]>
Signed-off-by: Brian Foster <[email protected]>
---
kernel/pid.c | 79 ++++++++++++++++++++++++++--------------------------
1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 2fc0a16ec77b..72a6e9d0db81 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -86,22 +86,6 @@ struct pid_namespace init_pid_ns = {
};
EXPORT_SYMBOL_GPL(init_pid_ns);

-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -129,10 +113,11 @@ void free_pid(struct pid *pid)
int i;
unsigned long flags;

- spin_lock_irqsave(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
+
+ xa_lock_irqsave(&ns->idr.idr_rt, flags);
switch (--ns->pid_allocated) {
case 2:
case 1:
@@ -150,8 +135,8 @@ void free_pid(struct pid *pid)
}

idr_remove(&ns->idr, upid->nr);
+ xa_unlock_irqrestore(&ns->idr.idr_rt, flags);
}
- spin_unlock_irqrestore(&pidmap_lock, flags);

call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -206,7 +191,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}

idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ xa_lock_irq(&tmp->idr.idr_rt);

if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -233,7 +218,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ xa_unlock_irq(&tmp->idr.idr_rt);
idr_preload_end();

if (nr < 0) {
@@ -266,34 +251,38 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);

upid = pid->numbers + ns->level;
- spin_lock_irq(&pidmap_lock);
- if (!(ns->pid_allocated & PIDNS_ADDING))
- goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
+ tmp = upid->ns;
+
+ xa_lock_irq(&tmp->idr.idr_rt);
+ if (tmp == ns && !(tmp->pid_allocated & PIDNS_ADDING)) {
+ xa_unlock_irq(&tmp->idr.idr_rt);
+ put_pid_ns(ns);
+ goto out_free;
+ }
+
/* Make the PID visible to find_pid_ns. */
- idr_replace(&upid->ns->idr, pid, upid->nr);
- upid->ns->pid_allocated++;
+ idr_replace(&tmp->idr, pid, upid->nr);
+ tmp->pid_allocated++;
+ xa_unlock_irq(&tmp->idr.idr_rt);
}
- spin_unlock_irq(&pidmap_lock);

return pid;

-out_unlock:
- spin_unlock_irq(&pidmap_lock);
- put_pid_ns(ns);
-
out_free:
- spin_lock_irq(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
- idr_remove(&upid->ns->idr, upid->nr);
- }
+ tmp = upid->ns;

- /* On failure to allocate the first pid, reset the state */
- if (ns->pid_allocated == PIDNS_ADDING)
- idr_set_cursor(&ns->idr, 0);
+ xa_lock_irq(&tmp->idr.idr_rt);

- spin_unlock_irq(&pidmap_lock);
+ /* On failure to allocate the first pid, reset the state */
+ if (tmp == ns && tmp->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
+ idr_remove(&tmp->idr, upid->nr);
+ xa_unlock_irq(&tmp->idr.idr_rt);
+ }

kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -301,9 +290,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,

void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ xa_lock_irq(&ns->idr.idr_rt);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ xa_unlock_irq(&ns->idr.idr_rt);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -646,6 +635,18 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}

+/*
+ * Note: disable interrupts while the xarray lock is held as an interrupt might
+ * come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does xa_lock() followed by an
+ * interrupt routine that does read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no irq handlers that
+ * take it we can leave the interrupts enabled. For now it is easier to be safe
+ * than to prove it can't happen.
+ */
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
--
2.35.3