2017-09-09 15:21:05

by Gargi Sharma

Subject: [RFC 0/2] Replace PID implementation with IDR API

This patch series replaces the kernel's bitmap implementation of PID
allocation with the IDR API.
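
At its core, the series replaces the per-namespace pidmap pages and the
alloc_pidmap()/free_pidmap() helpers with one struct idr per
pid_namespace. Condensed from patch 2/2 (error handling and the
RESERVED_PIDS wrap-around omitted), allocation and freeing become:

	/* allocation, one namespace level at a time (from alloc_pid()) */
	idr_preload(GFP_KERNEL);
	spin_lock_irq(&pidmap_lock);
	nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min, pid_max, GFP_ATOMIC);
	spin_unlock_irq(&pidmap_lock);
	idr_preload_end();

	/* freeing (from free_pid()) */
	idr_remove(&ns->idr, upid->nr);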

The following are the size stats for the pid and pid_namespace object
files before and after the replacement. There is a noteworthy reduction
in size with the IDR implementation.

Before
   text    data     bss     dec     hex filename
   8447    3894      64   12405    3075 kernel/pid.o
After
   text    data     bss     dec     hex filename
   3602     324       8    3934     f5e kernel/pid.o

Before
   text    data     bss     dec     hex filename
   5692    1842     192    7726    1e2e kernel/pid_namespace.o
After
   text    data     bss     dec     hex filename
   2858     216      16    3090     c12 kernel/pid_namespace.o

There was no considerable difference in the time required to allocate
PIDs to processes; the IDR implementation is slightly faster than the
bitmap implementation.

The next change in the pipeline is replacing the pidhash with an IDR-based implementation.

Gargi Sharma (2):
proc: Return if nothing to unmount
pid: Replace PID bitmap implementation with IDR API

fs/proc/base.c | 4 +
include/linux/pid.h | 1 +
include/linux/pid_namespace.h | 5 +-
init/main.c | 4 +-
kernel/pid.c | 204 ++++++++----------------------------------
kernel/pid_namespace.c | 39 ++++----
6 files changed, 63 insertions(+), 194 deletions(-)

--
2.7.4


2017-09-09 15:21:16

by Gargi Sharma

Subject: [RFC 1/2] proc: Return if nothing to unmount

If a task exits before procfs is mounted, proc_flush_task_mnt will
be called with a NULL mnt parameter. In that case, not only is there
nothing to unhash, but trying to do so will oops the kernel with a
null pointer dereference.

Signed-off-by: Gargi Sharma <[email protected]>
---
fs/proc/base.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e5d89a0..7b83c21 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,10 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
char buf[PROC_NUMBUF];
struct qstr name;

+ /* procfs is not mounted. There is nothing to unhash. */
+ if (!mnt)
+ return;
+
name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", pid);
/* no ->d_hash() rejects on procfs */
--
2.7.4

2017-09-09 15:21:27

by Gargi Sharma

Subject: [RFC 2/2] pid: Replace PID bitmap implementation with IDR API

This patch replaces the current bitmap implementation for
Process ID allocation. Functions that are no longer required,
for example free_pidmap(), alloc_pidmap(), etc., are removed.
The rest of the functions are modified to use the IDR API.
The change makes PID allocation less complex by replacing
custom code with calls to the generic IDR API.

Signed-off-by: Gargi Sharma <[email protected]>
---
include/linux/pid.h | 1 +
include/linux/pid_namespace.h | 5 +-
init/main.c | 4 +-
kernel/pid.c | 204 ++++++++----------------------------------
kernel/pid_namespace.c | 39 ++++----
5 files changed, 59 insertions(+), 194 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 7195827..de85a7b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -99,6 +99,7 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,

struct pid_namespace;
extern struct pid_namespace init_pid_ns;
+extern int pid_max;

/*
* look up a PID in the hash table. Must be called with the tasklist_lock
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b09136f..1ac6e85 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
+#include <linux/idr.h>

struct pidmap {
atomic_t nr_free;
@@ -29,7 +30,7 @@ enum { /* definitions for pid_namespace's hide_pid field */

struct pid_namespace {
struct kref kref;
- struct pidmap pidmap[PIDMAP_ENTRIES];
+ struct idr idr;
struct rcu_head rcu;
int last_pid;
unsigned int nr_hashed;
@@ -105,6 +106,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);

#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index a21a1a8..8821c1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/binfmts.h>
+#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/stackprotector.h>
@@ -667,7 +668,7 @@ asmlinkage __visible void __init start_kernel(void)
if (late_time_init)
late_time_init();
calibrate_delay();
- pidmap_init();
+ pid_idr_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
@@ -989,7 +990,6 @@ static inline void mark_readonly(void)
static int __ref kernel_init(void *unused)
{
int ret;
-
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedb..27e43fd 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>

#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

@@ -70,10 +65,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
.last_pid = 0,
+ .idr = IDR_INIT,
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
@@ -101,138 +94,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -285,10 +146,14 @@ void free_pid(struct pid *pid)
break;
}
}
- spin_unlock_irqrestore(&pidmap_lock, flags);

- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+
+ idr_remove(&ns->idr, upid->nr);
+ }
+ spin_unlock_irqrestore(&pidmap_lock, flags);

call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -309,7 +174,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (tmp->idr.idr_next > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -346,12 +226,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return pid;

out_unlock:
- spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
+ spin_unlock_irq(&pidmap_lock);

out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ spin_unlock_irq(&pidmap_lock);

kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -527,11 +409,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID) {
- if (type == __PIDTYPE_TGID)
- type = PIDTYPE_PID;
+ if (type != PIDTYPE_PID)
task = task->group_leader;
- }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -553,16 +432,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
+ return idr_get_next(&ns->idr, &nr);
}

/*
@@ -572,13 +442,16 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
+ unsigned int pidhash_size;
+
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
+ pidhash_size = 1U << pidhash_shift;
}

-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +463,9 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
+ /* Reserve PID 0. */
+ idr_alloc_cyclic(&init_pid_ns.idr, &init_pid_ns, 0, 0, GFP_KERNEL);

init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a72..d0dc231 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>

struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;

err = -ENOSPC;
@@ -113,17 +113,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;

- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);

ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;

err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;

kref_init(&ns->kref);
@@ -134,17 +132,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);

- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;

-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -164,11 +155,9 @@ static void delayed_free_pidns(struct rcu_head *p)

static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}

@@ -205,10 +194,11 @@ EXPORT_SYMBOL_GPL(put_pid_ns);

void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
- int nr;
+ int nr = 2;
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;

/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -236,8 +226,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
+ pid = idr_get_next(&pid_ns->idr, &nr);
+ while (pid) {
rcu_read_lock();

task = pid_task(find_vpid(nr), PIDTYPE_PID);
@@ -246,7 +236,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)

rcu_read_unlock();

- nr = next_pidmap(pid_ns, nr);
+ nr++;
+ pid = idr_get_next(&pid_ns->idr, &nr);
}
read_unlock(&tasklist_lock);

@@ -307,7 +298,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/

- tmp.data = &pid_ns->last_pid;
+ tmp.data = &pid_max;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}

--
2.7.4

2017-09-09 18:31:43

by Al Viro

Subject: Re: [RFC 1/2] proc: Return if nothing to unmount

On Sat, Sep 09, 2017 at 06:03:16PM +0530, Gargi Sharma wrote:
> If a task exits before procfs is mounted, proc_flush_task_mnt will
> be called with a NULL mnt parameter. In that case, not only is there
> nothing to unhash, but trying to do so will oops the kernel with a
> null pointer dereference.

You are misreading that sucker. It's not about userland mounts, it's about
the internal ones in the pidns, one for each pidns the process belongs to.

IOW, what you are adding is dead code. The very first alloc_pid() in
that pidns should've called pid_ns_prepare_proc(), which creates that
vfsmount.
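
(For reference, the ordering in question looks roughly like this in
kernel/pid.c of that era, condensed:

	/* in alloc_pid(), once the pid numbers are allocated */
	if (unlikely(is_child_reaper(pid))) {
		/* first task in this pidns: set up its internal proc mount */
		if (pid_ns_prepare_proc(ns))
			goto out_free;
	}

is_child_reaper() is true exactly when the new pid is 1 in its own
namespace, i.e. on the very first allocation in that pidns.)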

2017-09-10 20:08:03

by Rik van Riel

Subject: Re: [RFC 1/2] proc: Return if nothing to unmount



On September 9, 2017 2:31:35 PM EDT, Al Viro <[email protected]> wrote:
>On Sat, Sep 09, 2017 at 06:03:16PM +0530, Gargi Sharma wrote:
>> If a task exits before procfs is mounted, proc_flush_task_mnt will
>> be called with a NULL mnt parameter. In that case, not only is there
>> nothing to unhash, but trying to do so will oops the kernel with a
>> null pointer dereference.
>
>You are misreading that sucker. It's not about userland mounts, it's about
>the internal ones in the pidns, one for each pidns the process belongs to.
>
>IOW, what you are adding is dead code. The very first alloc_pid() in
>that pidns should've called pid_ns_prepare_proc(), which creates that
>vfsmount.

Huh, my bad. I wonder why Gargi's code ran into a null pointer dereference on a null mnt pointer, then...
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2017-09-11 00:58:25

by Rik van Riel

Subject: Re: [RFC 1/2] proc: Return if nothing to unmount

On Sat, 2017-09-09 at 19:31 +0100, Al Viro wrote:
> On Sat, Sep 09, 2017 at 06:03:16PM +0530, Gargi Sharma wrote:
> > If a task exits before procfs is mounted, proc_flush_task_mnt will
> > be called with a NULL mnt parameter. In that case, not only is there
> > nothing to unhash, but trying to do so will oops the kernel with a
> > null pointer dereference.
>
> You are misreading that sucker.  It's not about userland mounts, it's about
> the internal ones in the pidns, one for each pidns the process belongs to.
>
> IOW, what you are adding is dead code.  The very first alloc_pid() in
> that pidns should've called pid_ns_prepare_proc(), which creates that
> vfsmount.

Looking at the code (now that I am home, and no longer
reading this email on my phone), I see the cause of this
problem.

A previous version of Gargi's code had RESERVED_PIDS as
the lower bound for idr_alloc_cyclic, even on the very
first PID allocation cycle in a PID namespace.

With the code changed to have 1 as the lower bound during
the first allocation cycle, pid_ns_prepare_proc() should
be called correctly, and things should work as expected.
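
For reference, the lower-bound selection in the posted patch 2/2,
condensed:

	int pid_min = 1;
	/*
	 * init really needs pid 1, but after reaching the maximum
	 * wrap back to RESERVED_PIDS
	 */
	if (tmp->idr.idr_next > RESERVED_PIDS)
		pid_min = RESERVED_PIDS;

	nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min, pid_max, GFP_ATOMIC);

With idr_next still at or below RESERVED_PIDS, the first cycle can hand
out pid 1 to the namespace's init.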

Gargi, can you drop this patch 1/2, and make sure the code
still works fine?

--
All Rights Reversed.

