This patch series replaces kernel bitmap implementation of PID allocation
with IDR API.
The following are the stats for pid and pid_namespace object files
before and after the replacement. There is a noteworthy change between
the IDR and bitmap implementation.
Before
text data bss dec hex filename
8447 3894 64 12405 3075 kernel/pid.o
After
text data bss dec hex filename
3381 304 0 3717 e65 kernel/pid.o
Before
text data bss dec hex filename
5692 1842 192 7726 1e2e kernel/pid_namespace.o
After
text data bss dec hex filename
2870 216 16 3102 c1e kernel/pid_namespace.o
There wasn't a considerable difference between the time required for
allocation of PIDs to the processes. The IDR implementation is a little faster
than bitmap implementation.
Gargi Sharma (4):
pid: Replace pid bitmap implementation with IDR API
idr: Add a function idr_get()
pid.c: Replace pidhash lookup with idr_get()
pid: Remove pidhash
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/idr.h | 1 +
include/linux/init_task.h | 1 -
include/linux/pid.h | 2 -
include/linux/pid_namespace.h | 16 +-
init/main.c | 3 +-
kernel/fork.c | 2 +-
kernel/pid.c | 241 +++++-------------------------
kernel/pid_namespace.c | 45 +++---
lib/idr.c | 11 ++
11 files changed, 78 insertions(+), 248 deletions(-)
--
2.7.4
This patch replaces the current bitmap implemetation for
Process ID allocation. Functions that are no longer required,
for example, free_pidmap(), alloc_pidmap(), etc. are removed.
The rest of the functions are modified to use the IDR API.
The change was made to make the PID allocation less complex by
replacing custom code with calls to generic API.
Signed-off-by: Gargi Sharma <[email protected]>
---
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/pid_namespace.h | 14 +--
init/main.c | 2 +-
kernel/pid.c | 198 ++++++------------------------
kernel/pid_namespace.c | 39 +++---
6 files changed, 56 insertions(+), 201 deletions(-)
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 1fbb5da..d285aef 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
LOAD_INT(c), LOAD_FRAC(c),
count_active_contexts(),
atomic_read(&nr_spu_contexts),
- task_active_pid_ns(current)->last_pid);
+ task_active_pid_ns(current)->idr.idr_next-1);
return 0;
}
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 983fce5..9355f4d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- task_active_pid_ns(current)->last_pid);
+ task_active_pid_ns(current)->idr.idr_next-1);
return 0;
}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b09136f..f4db4a7 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,15 +9,8 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
+#include <linux/idr.h>
-struct pidmap {
- atomic_t nr_free;
- void *page;
-};
-
-#define BITS_PER_PAGE (PAGE_SIZE * 8)
-#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
struct fs_pin;
@@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
struct pid_namespace {
struct kref kref;
- struct pidmap pidmap[PIDMAP_ENTRIES];
+ struct idr idr;
struct rcu_head rcu;
- int last_pid;
unsigned int nr_hashed;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
@@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);
#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index 0ee9c686..9f4db20 100644
--- a/init/main.c
+++ b/init/main.c
@@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
if (late_time_init)
late_time_init();
calibrate_delay();
- pidmap_init();
+ pid_idr_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedb..ea22e89 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
@@ -70,10 +65,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
+ .idr = IDR_INIT,
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
@@ -101,138 +93,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -285,10 +145,14 @@ void free_pid(struct pid *pid)
break;
}
}
- spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ for (i = 0; i <= pid->level; i++) {
+ struct upid *upid = pid->numbers + i;
+ struct pid_namespace *ns = upid->ns;
+
+ idr_remove(&ns->idr, upid->nr);
+ }
+ spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -309,7 +173,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (tmp->idr.idr_next > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -346,12 +225,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
+ spin_unlock_irq(&pidmap_lock);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -553,16 +434,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
+ return idr_get_next(&ns->idr, &nr);
}
/*
@@ -572,13 +444,16 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
+ unsigned int pidhash_size;
+
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
+ pidhash_size = 1U << pidhash_shift;
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +465,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314..0df4e76 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -209,10 +198,11 @@ EXPORT_SYMBOL_GPL(put_pid_ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
- int nr;
+ int nr = 2;
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -240,8 +230,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
+ pid = idr_get_next(&pid_ns->idr, &nr);
+ while (pid) {
rcu_read_lock();
task = pid_task(find_vpid(nr), PIDTYPE_PID);
@@ -250,7 +240,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
rcu_read_unlock();
- nr = next_pidmap(pid_ns, nr);
+ nr++;
+ pid = idr_get_next(&pid_ns->idr, &nr);
}
read_unlock(&tasklist_lock);
@@ -311,7 +302,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
+ tmp.data = &pid_ns->idr.idr_next-1;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
--
2.7.4
idr_get(namespace, id) returns a NULL if id is not present
in the idr tree or returns the pointer to the struct if id is
present in the idr tree. With this function in the idr library,
code for pid allocation can be simplified by calling this function
instead of looking through the pidhash.
Signed-off-by: Gargi Sharma <[email protected]>
---
include/linux/idr.h | 1 +
lib/idr.c | 11 +++++++++++
2 files changed, 12 insertions(+)
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 7c3a365..e12b174 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -135,6 +135,7 @@ int idr_for_each(const struct idr *,
int (*fn)(int id, void *p, void *data), void *data);
void *idr_get_next(struct idr *, int *nextid);
void *idr_get_next_ext(struct idr *idr, unsigned long *nextid);
+void *idr_get(struct idr *idr, int *id);
void *idr_replace(struct idr *, void *, int id);
void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id);
void idr_destroy(struct idr *);
diff --git a/lib/idr.c b/lib/idr.c
index f9adf48..bb76400 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -135,6 +135,17 @@ void *idr_get_next_ext(struct idr *idr, unsigned long *nextid)
}
EXPORT_SYMBOL(idr_get_next_ext);
+void * idr_get(struct idr *idr, int *id)
+{
+ struct radix_tree_node *node;
+ void __rcu **slot = NULL;
+
+ __radix_tree_lookup(&idr->idr_rt, *id, &node, &slot);
+ if (!slot)
+ return NULL;
+ return node;
+}
+
/**
* idr_replace - replace pointer for given id
* @idr: idr handle
--
2.7.4
pidhash is no longer required as all the functionalities
are present in the idr tree associated with the namespace.
nr can be looked up in the namespace by idr_get().
Signed-off-by: Gargi Sharma <[email protected]>
---
kernel/pid.c | 10 +---------
1 file changed, 1 insertion(+), 9 deletions(-)
diff --git a/kernel/pid.c b/kernel/pid.c
index ea22e89..761a0c2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -247,15 +247,7 @@ void disable_pid_allocation(struct pid_namespace *ns)
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_get(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
--
2.7.4
pidhash is no longer required as all the information
can be looked up from idr tree. Also, nr_hashed represented
the number of pids that had been hashed. Since, nr_hashed is
no longer relevant, it has been renamed to pid_allocated.
Signed-off-by: Gargi Sharma <[email protected]>
---
include/linux/init_task.h | 1 -
include/linux/pid.h | 2 --
include/linux/pid_namespace.h | 2 +-
init/main.c | 1 -
kernel/fork.c | 2 +-
kernel/pid.c | 39 +++++++--------------------------------
kernel/pid_namespace.c | 6 +++---
7 files changed, 12 insertions(+), 41 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3c07ace..cc45798 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -104,7 +104,6 @@ extern struct group_info init_groups;
.numbers = { { \
.nr = 0, \
.ns = &init_pid_ns, \
- .pid_chain = { .next = NULL, .pprev = NULL }, \
}, } \
}
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 7195827..3915664 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -50,10 +50,8 @@ enum pid_type
*/
struct upid {
- /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
int nr;
struct pid_namespace *ns;
- struct hlist_node pid_chain;
};
struct pid
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f4db4a7..762e173 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -24,7 +24,7 @@ struct pid_namespace {
struct kref kref;
struct idr idr;
struct rcu_head rcu;
- unsigned int nr_hashed;
+ unsigned int pid_allocated;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/init/main.c b/init/main.c
index 9f4db20..c17a21b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void)
* kmem_cache_init()
*/
setup_log_buf(0);
- pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 1064618..662f80e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1853,7 +1853,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_HASH_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 761a0c2..b3b33a0 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,10 +41,6 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -54,9 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -66,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
.idr = IDR_INIT,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .pid_allocated = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -125,8 +118,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch(--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -138,7 +130,7 @@ void free_pid(struct pid *pid)
case PIDNS_HASH_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
@@ -213,12 +205,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_HASH_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -241,7 +231,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_HASH_ADDING;
spin_unlock_irq(&pidmap_lock);
}
@@ -403,6 +393,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
+
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -429,22 +420,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- unsigned int pidhash_size;
-
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
- pidhash_size = 1U << pidhash_shift;
-}
-
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 0df4e76..c0873ac 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
return ns;
@@ -259,7 +259,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -273,7 +273,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
--
2.7.4
On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
> This patch replaces the current bitmap implemetation for
> Process ID allocation. Functions that are no longer required,
> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
> The rest of the functions are modified to use the IDR API.
> The change was made to make the PID allocation less complex by
> replacing custom code with calls to generic API.
Nice work, Gargi!
> Signed-off-by: Gargi Sharma <[email protected]>
> --- a/arch/powerpc/platforms/cell/spufs/sched.c
> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s,
> void *private)
> LOAD_INT(c), LOAD_FRAC(c),
> count_active_contexts(),
> atomic_read(&nr_spu_contexts),
> - task_active_pid_ns(current)->last_pid);
> + task_active_pid_ns(current)->idr.idr_next-1);
> return 0;
> }
The repeated use of "idr.idr_next - 1" suggests that maybe this could
be hidden behind a new IDR API, but that could be something for a
follow-up patch.
There already is the idr_get_cursor function, but you would still
have to subtract 1 from the value returned by idr_get_cursor...
Other than that nitpick, this patch looks good to me.
Reviewed-by: Rik van Riel <[email protected]>
kind regards,
Rik van Riel
--
All Rights Reversed.
On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
> idr_get(namespace, id) returns a NULL if id is not present
> in the idr tree or returns the pointer to the struct if id is
> present in the idr tree. With this function in the idr library,
> code for pid allocation can be simplified by calling this function
> instead of looking through the pidhash.
> +++ b/lib/idr.c
> @@ -135,6 +135,17 @@ void *idr_get_next_ext(struct idr *idr, unsigned
> long *nextid)
> }
> EXPORT_SYMBOL(idr_get_next_ext);
>
> +void * idr_get(struct idr *idr, int *id)
> +{
> + struct radix_tree_node *node;
> + void __rcu **slot = NULL;
> +
> + __radix_tree_lookup(&idr->idr_rt, *id, &node, &slot);
> + if (!slot)
> + return NULL;
> + return node;
> +}
I should have noticed this (much) earlier, but doesn't idr_get do
essentially the same thing as idr_find?
Also, wouldn't you want to return the pid pointer from slot,
rather than a pointer to the entire radix tree node?
--
All Rights Reversed.
On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
> pidhash is no longer required as all the functionalities
> are present in the idr tree associated with the namespace.
> nr can be looked up in the namespace by idr_get().
>
> Signed-off-by: Gargi Sharma <[email protected]>
> ---
> kernel/pid.c | 10 +---------
> 1 file changed, 1 insertion(+), 9 deletions(-)
>
> diff --git a/kernel/pid.c b/kernel/pid.c
> index ea22e89..761a0c2 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -247,15 +247,7 @@ void disable_pid_allocation(struct pid_namespace
> *ns)
>
> struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
> {
> - struct upid *pnr;
> -
> - hlist_for_each_entry_rcu(pnr,
> - &pid_hash[pid_hashfn(nr, ns)], pid_chain)
> - if (pnr->nr == nr && pnr->ns == ns)
> - return container_of(pnr, struct pid,
> - numbers[ns->level]);
> -
> - return NULL;
> + return idr_get(&ns->idr, &nr);
> }
> EXPORT_SYMBOL_GPL(find_pid_ns);
Does this work if you call idr_find instead of idr_get?
Then patch 2/4 would not be needed.
--
All Rights Reversed.
On Mon, Sep 25, 2017 at 09:20:07AM -0400, Rik van Riel wrote:
> > +++ b/lib/idr.c
> > @@ -135,6 +135,17 @@ void *idr_get_next_ext(struct idr *idr, unsigned
> > long *nextid)
> > ?}
> > ?EXPORT_SYMBOL(idr_get_next_ext);
> > ?
> > +void * idr_get(struct idr *idr, int *id)
> > +{
> > + struct radix_tree_node *node;
> > + void __rcu **slot = NULL;
> > +
> > + __radix_tree_lookup(&idr->idr_rt, *id, &node, &slot);
> > + if (!slot)
> > + return NULL;
> > + return node;
> > +}
>
> I should have noticed this (much) earlier, but doesn't idr_get do
> essentially the same thing as idr_find?
>
> Also, wouldn't you want to return the pid pointer from slot,
> rather than a pointer to the entire radix tree node?
It also seems rather odd to pass id by reference here just to
dereference it a little later.
On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
> pidhash is no longer required as all the information
> can be looked up from idr tree. Also, nr_hashed represented
> the number of pids that had been hashed. Since, nr_hashed is
> no longer relevant, it has been renamed to pid_allocated.
>
> Signed-off-by: Gargi Sharma <[email protected]>
> @@ -66,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
> struct pid_namespace init_pid_ns = {
> .kref = KREF_INIT(2),
> .idr = IDR_INIT,
> - .nr_hashed = PIDNS_HASH_ADDING,
> + .pid_allocated = PIDNS_HASH_ADDING,
> .level = 0,
> .child_reaper = &init_task,
> .user_ns = &init_user_ns,
>
Should PIDNS_HASH_ADDING be renamed too, now that the pidhash is
gone?
Not that I have any ideas for a better name. This patch looks good
to me. Feel free to add this line the next time you submit it:
Reviewed-by: Rik van Riel <[email protected]>
--
All Rights Reversed.
On 09/25, Gargi Sharma wrote:
>
> idr_get(namespace, id) returns a NULL if id is not present
> in the idr tree or returns the pointer to the struct if id is
> present in the idr tree. With this function in the idr library,
> code for pid allocation can be simplified by calling this function
> instead of looking through the pidhash.
Could you explain why find_pid_ns() can't use idr_find() ?
> +void * idr_get(struct idr *idr, int *id)
> +{
> + struct radix_tree_node *node;
> + void __rcu **slot = NULL;
> +
> + __radix_tree_lookup(&idr->idr_rt, *id, &node, &slot);
so why it takes "int *", not just "int" ?
Oleg.
On 09/25, Gargi Sharma wrote:
>
> void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> {
> - int nr;
> + int nr = 2;
> int rc;
> struct task_struct *task, *me = current;
> int init_pids = thread_group_leader(me) ? 1 : 2;
> + struct pid *pid;
>
> /* Don't allow any more processes into the pid namespace */
> disable_pid_allocation(pid_ns);
> @@ -240,8 +230,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> *
> */
> read_lock(&tasklist_lock);
> - nr = next_pidmap(pid_ns, 1);
> - while (nr > 0) {
> + pid = idr_get_next(&pid_ns->idr, &nr);
> + while (pid) {
> rcu_read_lock();
>
> task = pid_task(find_vpid(nr), PIDTYPE_PID);
> @@ -250,7 +240,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>
> rcu_read_unlock();
>
> - nr = next_pidmap(pid_ns, nr);
> + nr++;
> + pid = idr_get_next(&pid_ns->idr, &nr);
> }
> read_unlock(&tasklist_lock);
Then you should probably rewrite this code using idr_for_each_entry_continue() ?
And why do you need find_vpid(nr) if you already have "pid" ?
Oleg.
On 09/25, Gargi Sharma wrote:
>
> @@ -285,10 +145,14 @@ void free_pid(struct pid *pid)
> break;
> }
> }
> - spin_unlock_irqrestore(&pidmap_lock, flags);
>
> - for (i = 0; i <= pid->level; i++)
> - free_pidmap(pid->numbers + i);
> + for (i = 0; i <= pid->level; i++) {
> + struct upid *upid = pid->numbers + i;
> + struct pid_namespace *ns = upid->ns;
> +
> + idr_remove(&ns->idr, upid->nr);
> + }
> + spin_unlock_irqrestore(&pidmap_lock, flags);
Now that you moved the "free pidmap" code under pidmap_lock, we do not
need 2 "for (i = 0; i <= pid->level; i++)" loops, you could simply add
a single
idr_remove(&ns->idr, upid->nr);
line into the 1st loop ?
Oleg.
On Mon, Sep 25, 2017 at 8:29 PM, Oleg Nesterov <[email protected]> wrote:
> On 09/25, Gargi Sharma wrote:
>>
>> void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>> {
>> - int nr;
>> + int nr = 2;
>> int rc;
>> struct task_struct *task, *me = current;
>> int init_pids = thread_group_leader(me) ? 1 : 2;
>> + struct pid *pid;
>>
>> /* Don't allow any more processes into the pid namespace */
>> disable_pid_allocation(pid_ns);
>> @@ -240,8 +230,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>> *
>> */
>> read_lock(&tasklist_lock);
>> - nr = next_pidmap(pid_ns, 1);
>> - while (nr > 0) {
>> + pid = idr_get_next(&pid_ns->idr, &nr);
>> + while (pid) {
>> rcu_read_lock();
>>
>> task = pid_task(find_vpid(nr), PIDTYPE_PID);
>> @@ -250,7 +240,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>>
>> rcu_read_unlock();
>>
>> - nr = next_pidmap(pid_ns, nr);
>> + nr++;
>> + pid = idr_get_next(&pid_ns->idr, &nr);
>> }
>> read_unlock(&tasklist_lock);
>
> Then you should probably rewrite this code using idr_for_each_entry_continue() ?
Yes, I missed this macro in the idr library.
>
> And why do you need find_vpid(nr) if you already have "pid" ?
Thanks for the feedback! Yes, it is not needed anymore and can be removed.
Best,
Gargi
>
> Oleg.
>
On Mon, Sep 25, 2017 at 8:50 PM, Oleg Nesterov <[email protected]> wrote:
> On 09/25, Gargi Sharma wrote:
>>
>> @@ -285,10 +145,14 @@ void free_pid(struct pid *pid)
>> break;
>> }
>> }
>> - spin_unlock_irqrestore(&pidmap_lock, flags);
>>
>> - for (i = 0; i <= pid->level; i++)
>> - free_pidmap(pid->numbers + i);
>> + for (i = 0; i <= pid->level; i++) {
>> + struct upid *upid = pid->numbers + i;
>> + struct pid_namespace *ns = upid->ns;
>> +
>> + idr_remove(&ns->idr, upid->nr);
>> + }
>> + spin_unlock_irqrestore(&pidmap_lock, flags);
>
> Now that you moved the "free pidmap" code under pidmap_lock, we do not
> need 2 "for (i = 0; i <= pid->level; i++)" loops, you could simply add
> a single
>
> idr_remove(&ns->idr, upid->nr);
>
> line into the 1st loop ?
Yes, I can do that. Will make this change.
Thanks!
Gargi
>
> Oleg.
>
On Mon, Sep 25, 2017 at 7:42 PM, Oleg Nesterov <[email protected]> wrote:
> On 09/25, Gargi Sharma wrote:
>>
>> idr_get(namespace, id) returns a NULL if id is not present
>> in the idr tree or returns the pointer to the struct if id is
>> present in the idr tree. With this function in the idr library,
>> code for pid allocation can be simplified by calling this function
>> instead of looking through the pidhash.
>
> Could you explain why find_pid_ns() can't use idr_find() ?
It can. I missed this macro from the IDR library. Will change this
and drop this patch in the next version.
Thanks!
Gargi
>
>> +void * idr_get(struct idr *idr, int *id)
>> +{
>> + struct radix_tree_node *node;
>> + void __rcu **slot = NULL;
>> +
>> + __radix_tree_lookup(&idr->idr_rt, *id, &node, &slot);
>
> so why it takes "int *", not just "int" ?
>
> Oleg.
>
On Mon, Sep 25, 2017 at 6:50 PM, Rik van Riel <[email protected]> wrote:
> On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
>> pidhash is no longer required as all the functionalities
>> are present in the idr tree associated with the namespace.
>> nr can be looked up in the namespace by idr_get().
>>
>> Signed-off-by: Gargi Sharma <[email protected]>
>> ---
>> kernel/pid.c | 10 +---------
>> 1 file changed, 1 insertion(+), 9 deletions(-)
>>
>> diff --git a/kernel/pid.c b/kernel/pid.c
>> index ea22e89..761a0c2 100644
>> --- a/kernel/pid.c
>> +++ b/kernel/pid.c
>> @@ -247,15 +247,7 @@ void disable_pid_allocation(struct pid_namespace
>> *ns)
>>
>> struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
>> {
>> - struct upid *pnr;
>> -
>> - hlist_for_each_entry_rcu(pnr,
>> - &pid_hash[pid_hashfn(nr, ns)], pid_chain)
>> - if (pnr->nr == nr && pnr->ns == ns)
>> - return container_of(pnr, struct pid,
>> - numbers[ns->level]);
>> -
>> - return NULL;
>> + return idr_get(&ns->idr, &nr);
>> }
>> EXPORT_SYMBOL_GPL(find_pid_ns);
>
> Does this work if you call idr_find instead of idr_get?
>
> Then patch 2/4 would not be needed.
Yes, patch 2/4 can be dropped. I think this patch can be merged with
patch 4/4 where I remove pidhash, since this function is more or less
aimed at that too?
Thanks!
Gargi
>
> --
> All Rights Reversed.
On Mon, 2017-09-25 at 23:14 +0530, Gargi Sharma wrote:
> On Mon, Sep 25, 2017 at 6:50 PM, Rik van Riel <[email protected]>
> wrote:
> > On Mon, 2017-09-25 at 08:56 -0400, Gargi Sharma wrote:
> > > pidhash is no longer required as all the functionalities
> > > are present in the idr tree associated with the namespace.
> > > nr can be looked up in the namespace by idr_get().
> > >
> > > Signed-off-by: Gargi Sharma <[email protected]>
> > > ---
> > > kernel/pid.c | 10 +---------
> > > 1 file changed, 1 insertion(+), 9 deletions(-)
> > >
> > > diff --git a/kernel/pid.c b/kernel/pid.c
> > > index ea22e89..761a0c2 100644
> > > --- a/kernel/pid.c
> > > +++ b/kernel/pid.c
> > > @@ -247,15 +247,7 @@ void disable_pid_allocation(struct
> > > pid_namespace
> > > *ns)
> > >
> > > struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
> > > {
> > > - struct upid *pnr;
> > > -
> > > - hlist_for_each_entry_rcu(pnr,
> > > - &pid_hash[pid_hashfn(nr, ns)], pid_chain)
> > > - if (pnr->nr == nr && pnr->ns == ns)
> > > - return container_of(pnr, struct pid,
> > > - numbers[ns->level]);
> > > -
> > > - return NULL;
> > > + return idr_get(&ns->idr, &nr);
> > > }
> > > EXPORT_SYMBOL_GPL(find_pid_ns);
> >
> > Does this work if you call idr_find instead of idr_get?
> >
> > Then patch 2/4 would not be needed.
>
> Yes, patch 2/4 can be dropped. I think this patch can be merged with
> patch 4/4 where I remove pidhash, since this function is more or less
> aimed at that too?
Yes, that would simplify your patch series a little bit.
--
All Rights Reversed.