This patch series replaces kernel bitmap implementation of PID allocation
with IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code.
The following are the stats for pid and pid_namespace object files
before and after the replacement. There is a noteworthy reduction in
object size with the IDR implementation compared to the bitmap one.
Before
text data bss dec hex filename
8447 3894 64 12405 3075 kernel/pid.o
After
text data bss dec hex filename
3301 304 0 3605 e15 kernel/pid.o
Before
text data bss dec hex filename
5692 1842 192 7726 1e2e kernel/pid_namespace.o
After
text data bss dec hex filename
2870 216 16 3102 c1e kernel/pid_namespace.o
There was no considerable difference in the time required to
allocate PIDs to processes.
---
Changes in v2:
- Removed redundant IDR function that was introduced
in the previous patchset.
- Renamed PIDNS_HASH_ADDING
- Used idr_for_each_entry_continue()
- Used idr_find() to lookup pids
Gargi Sharma (2):
pid: Replace pid bitmap implementation with IDR API
pid: Remove pidhash
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/init_task.h | 1 -
include/linux/pid.h | 2 -
include/linux/pid_namespace.h | 18 +--
init/main.c | 3 +-
kernel/fork.c | 2 +-
kernel/pid.c | 239 +++++-------------------------
kernel/pid_namespace.c | 54 +++----
9 files changed, 68 insertions(+), 255 deletions(-)
--
2.7.4
This patch replaces the current bitmap implementation for
Process ID allocation. Functions that are no longer required,
for example, free_pidmap(), alloc_pidmap(), etc. are removed.
The rest of the functions are modified to use the IDR API.
The change was made to make the PID allocation less complex by
replacing custom code with calls to generic API.
Signed-off-by: Gargi Sharma <[email protected]>
---
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
fs/proc/loadavg.c | 2 +-
include/linux/pid_namespace.h | 14 +--
init/main.c | 2 +-
kernel/pid.c | 193 +++++-------------------------
kernel/pid_namespace.c | 47 +++-----
6 files changed, 55 insertions(+), 205 deletions(-)
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 1fbb5da..d285aef 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
LOAD_INT(c), LOAD_FRAC(c),
count_active_contexts(),
atomic_read(&nr_spu_contexts),
- task_active_pid_ns(current)->last_pid);
+ task_active_pid_ns(current)->idr.idr_next-1);
return 0;
}
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 983fce5..9355f4d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
nr_running(), nr_threads,
- task_active_pid_ns(current)->last_pid);
+ task_active_pid_ns(current)->idr.idr_next-1);
return 0;
}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b09136f..f4db4a7 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,15 +9,8 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
+#include <linux/idr.h>
-struct pidmap {
- atomic_t nr_free;
- void *page;
-};
-
-#define BITS_PER_PAGE (PAGE_SIZE * 8)
-#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
struct fs_pin;
@@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
struct pid_namespace {
struct kref kref;
- struct pidmap pidmap[PIDMAP_ENTRIES];
+ struct idr idr;
struct rcu_head rcu;
- int last_pid;
unsigned int nr_hashed;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
@@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);
#endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index 0ee9c686..9f4db20 100644
--- a/init/main.c
+++ b/init/main.c
@@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void)
if (late_time_init)
late_time_init();
calibrate_delay();
- pidmap_init();
+ pid_idr_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedb..207c49a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
+#include <linux/idr.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-static inline int mk_pid(struct pid_namespace *pid_ns,
- struct pidmap *map, int off)
-{
- return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
@@ -70,10 +65,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
*/
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
- .pidmap = {
- [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
- },
- .last_pid = 0,
+ .idr = IDR_INIT,
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
@@ -101,138 +93,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static void free_pidmap(struct upid *upid)
-{
- int nr = upid->nr;
- struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
- int offset = nr & BITS_PER_PAGE_MASK;
-
- clear_bit(offset, map->page);
- atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
- /*
- * This is the same as saying
- *
- * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
- * and that mapping orders 'a' and 'b' with respect to 'base'.
- */
- return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value. We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
- int prev;
- int last_write = base;
- do {
- prev = last_write;
- last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
- } while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
- int i, offset, max_scan, pid, last = pid_ns->last_pid;
- struct pidmap *map;
-
- pid = last + 1;
- if (pid >= pid_max)
- pid = RESERVED_PIDS;
- offset = pid & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- /*
- * If last_pid points into the middle of the map->page we
- * want to scan this bitmap block twice, the second time
- * we start with offset == 0 (or RESERVED_PIDS).
- */
- max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
- for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- return -ENOMEM;
- }
- if (likely(atomic_read(&map->nr_free))) {
- for ( ; ; ) {
- if (!test_and_set_bit(offset, map->page)) {
- atomic_dec(&map->nr_free);
- set_last_pid(pid_ns, last, pid);
- return pid;
- }
- offset = find_next_offset(map, offset);
- if (offset >= BITS_PER_PAGE)
- break;
- pid = mk_pid(pid_ns, map, offset);
- if (pid >= pid_max)
- break;
- }
- }
- if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
- ++map;
- offset = 0;
- } else {
- map = &pid_ns->pidmap[0];
- offset = RESERVED_PIDS;
- if (unlikely(last == offset))
- break;
- }
- pid = mk_pid(pid_ns, map, offset);
- }
- return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
- int offset;
- struct pidmap *map, *end;
-
- if (last >= PID_MAX_LIMIT)
- return -1;
-
- offset = (last + 1) & BITS_PER_PAGE_MASK;
- map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
- end = &pid_ns->pidmap[PIDMAP_ENTRIES];
- for (; map < end; map++, offset = 0) {
- if (unlikely(!map->page))
- continue;
- offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
- if (offset < BITS_PER_PAGE)
- return mk_pid(pid_ns, map, offset);
- }
- return -1;
-}
-
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -284,12 +144,11 @@ void free_pid(struct pid *pid)
schedule_work(&ns->proc_work);
break;
}
+
+ idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
-
call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -309,7 +168,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ int pid_min = 1;
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&pidmap_lock);
+
+ /*
+ * init really needs pid 1, but after reaching the maximum
+ * wrap back to RESERVED_PIDS
+ */
+ if (tmp->idr.idr_next > RESERVED_PIDS)
+ pid_min = RESERVED_PIDS;
+
+ nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min,
+ pid_max, GFP_ATOMIC);
+ spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
+
if (nr < 0) {
retval = nr;
goto out_free;
@@ -346,12 +220,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
+ spin_unlock_irq(&pidmap_lock);
out_free:
+ spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
- free_pidmap(pid->numbers + i);
+ idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -553,16 +429,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
- struct pid *pid;
-
- do {
- pid = find_pid_ns(nr, ns);
- if (pid)
- break;
- nr = next_pidmap(ns, nr);
- } while (nr > 0);
-
- return pid;
+ return idr_get_next(&ns->idr, &nr);
}
/*
@@ -572,13 +439,16 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
+ unsigned int pidhash_size;
+
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
+ pidhash_size = 1U << pidhash_shift;
}
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +460,7 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
- init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /* Reserve PID 0. We never call free_pidmap(0) */
- set_bit(0, init_pid_ns.pidmap[0].page);
- atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+ idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314..9af3625 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
+#include <linux/idr.h>
struct pid_cache {
int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
struct ucounts *ucounts;
- int i;
int err;
err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns == NULL)
goto out_dec;
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
+ idr_init(&ns->idr);
ns->pid_cachep = create_pid_cachep(level + 1);
if (ns->pid_cachep == NULL)
- goto out_free_map;
+ goto out_free_idr;
err = ns_alloc_inum(&ns->ns);
if (err)
- goto out_free_map;
+ goto out_free_idr;
ns->ns.ops = &pidns_operations;
kref_init(&ns->kref);
@@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++)
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
return ns;
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+ idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
out_dec:
dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
- int i;
-
ns_free_inum(&ns->ns);
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
+
+ idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
@@ -209,10 +198,11 @@ EXPORT_SYMBOL_GPL(put_pid_ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
- int nr;
+ int nr = 2;
int rc;
struct task_struct *task, *me = current;
int init_pids = thread_group_leader(me) ? 1 : 2;
+ struct pid *pid;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -240,17 +230,18 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
+ pid = idr_get_next(&pid_ns->idr, &nr);
+ while (pid) {
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
- if (task && !__fatal_signal_pending(task))
- send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
+ rcu_read_lock();
+ idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+ task = pid_task(pid, PIDTYPE_PID);
+ if (task && !__fatal_signal_pending(task))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
+ }
rcu_read_unlock();
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);
@@ -311,7 +302,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
* it should synchronize its usage with external means.
*/
- tmp.data = &pid_ns->last_pid;
+ tmp.data = &pid_ns->idr.idr_next-1;
return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
--
2.7.4
pidhash is no longer required as all the information
can be looked up from idr tree. nr_hashed represented
the number of pids that had been hashed. Since nr_hashed and
PIDNS_HASH_ADDING are no longer relevant names, they have been renamed
to pid_allocated and PIDNS_ADDING respectively.
Signed-off-by: Gargi Sharma <[email protected]>
Reviewed-by: Rik van Riel <[email protected]>
---
include/linux/init_task.h | 1 -
include/linux/pid.h | 2 --
include/linux/pid_namespace.h | 4 ++--
init/main.c | 1 -
kernel/fork.c | 2 +-
kernel/pid.c | 52 ++++++++-----------------------------------
kernel/pid_namespace.c | 7 +++---
7 files changed, 16 insertions(+), 53 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3c07ace..cc45798 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -104,7 +104,6 @@ extern struct group_info init_groups;
.numbers = { { \
.nr = 0, \
.ns = &init_pid_ns, \
- .pid_chain = { .next = NULL, .pprev = NULL }, \
}, } \
}
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 7195827..3915664 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -50,10 +50,8 @@ enum pid_type
*/
struct upid {
- /* Try to keep pid_chain in the same cacheline as nr for find_vpid */
int nr;
struct pid_namespace *ns;
- struct hlist_node pid_chain;
};
struct pid
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f4db4a7..7911b58 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -24,7 +24,7 @@ struct pid_namespace {
struct kref kref;
struct idr idr;
struct rcu_head rcu;
- unsigned int nr_hashed;
+ unsigned int pid_allocated;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
@@ -48,7 +48,7 @@ struct pid_namespace {
extern struct pid_namespace init_pid_ns;
-#define PIDNS_HASH_ADDING (1U << 31)
+#define PIDNS_ADDING (1U << 31)
#ifdef CONFIG_PID_NS
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
diff --git a/init/main.c b/init/main.c
index 9f4db20..c17a21b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void)
* kmem_cache_init()
*/
setup_log_buf(0);
- pidhash_init();
vfs_caches_init_early();
sort_main_extable();
trap_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 1064618..c3518b8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1853,7 +1853,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
- if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 207c49a..ae31f7d1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,10 +41,6 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
-#define pid_hashfn(nr, ns) \
- hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;
int pid_max = PID_MAX_DEFAULT;
@@ -54,9 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define find_next_offset(map, off) \
- find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
@@ -66,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
.idr = IDR_INIT,
- .nr_hashed = PIDNS_HASH_ADDING,
+ .pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
@@ -125,8 +118,7 @@ void free_pid(struct pid *pid)
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
- hlist_del_rcu(&upid->pid_chain);
- switch(--ns->nr_hashed) {
+ switch(--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
@@ -135,10 +127,10 @@ void free_pid(struct pid *pid)
*/
wake_up_process(ns->child_reaper);
break;
- case PIDNS_HASH_ADDING:
+ case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
- ns->nr_hashed = 0;
+ ns->pid_allocated = 0;
/* fall through */
case 0:
schedule_work(&ns->proc_work);
@@ -208,12 +200,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+ if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
- hlist_add_head_rcu(&upid->pid_chain,
- &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
- upid->ns->nr_hashed++;
+ upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
@@ -236,21 +226,13 @@ struct pid *alloc_pid(struct pid_namespace *ns)
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
- ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct upid *pnr;
-
- hlist_for_each_entry_rcu(pnr,
- &pid_hash[pid_hashfn(nr, ns)], pid_chain)
- if (pnr->nr == nr && pnr->ns == ns)
- return container_of(pnr, struct pid,
- numbers[ns->level]);
-
- return NULL;
+ return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
@@ -432,26 +414,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
- unsigned int pidhash_size;
-
- pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL | HASH_ZERO,
- &pidhash_shift, NULL,
- 0, 4096);
- pidhash_size = 1U << pidhash_shift;
-}
-
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
- BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 9af3625..dcfcd99 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -133,7 +133,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
- ns->nr_hashed = PIDNS_HASH_ADDING;
+ ns->pid_allocated = PIDNS_ADDING;
+
INIT_WORK(&ns->proc_work, proc_cleanup_work);
return ns;
@@ -259,7 +260,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* sys_wait4() above can't reap the EXIT_DEAD children but we do not
* really care, we could reparent them to the global init. We could
* exit and reap ->child_reaper even if it is not the last thread in
- * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
* pid_ns can not go away until proc_kill_sb() drops the reference.
*
* But this ns can also have other tasks injected by setns()+fork().
@@ -273,7 +274,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (pid_ns->nr_hashed == init_pids)
+ if (pid_ns->pid_allocated == init_pids)
break;
schedule();
}
--
2.7.4
On Wed, 2017-09-27 at 01:06 -0400, Gargi Sharma wrote:
> This patch replaces the current bitmap implementation for
> Process ID allocation. Functions that are no longer required,
> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
> The rest of the functions are modified to use the IDR API.
> The change was made to make the PID allocation less complex by
> replacing custom code with calls to generic API.
I like where this is going. Just a few last comments from me :)
> --- a/arch/powerpc/platforms/cell/spufs/sched.c
> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s,
> void *private)
> LOAD_INT(c), LOAD_FRAC(c),
> count_active_contexts(),
> atomic_read(&nr_spu_contexts),
> - task_active_pid_ns(current)->last_pid);
> + task_active_pid_ns(current)->idr.idr_next-1);
> return 0;
> }
Everywhere you use idr.idr_next or idr->idr_next directly,
you could use idr_get_cursor(idr) instead, exposing less
of the IDR internals to the rest of the code.
> @@ -240,17 +230,18 @@ void zap_pid_ns_processes(struct pid_namespace
> *pid_ns)
> *
> */
> read_lock(&tasklist_lock);
> - nr = next_pidmap(pid_ns, 1);
> - while (nr > 0) {
> - rcu_read_lock();
> + pid = idr_get_next(&pid_ns->idr, &nr);
> + while (pid) {
>
> - task = pid_task(find_vpid(nr), PIDTYPE_PID);
> - if (task && !__fatal_signal_pending(task))
> - send_sig_info(SIGKILL, SEND_SIG_FORCED,
> task);
> + rcu_read_lock();
>
> + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
> + task = pid_task(pid, PIDTYPE_PID);
> + if (task && !__fatal_signal_pending(task))
> + send_sig_info(SIGKILL,
> SEND_SIG_FORCED, task);
> + }
> rcu_read_unlock();
>
> - nr = next_pidmap(pid_ns, nr);
> }
> read_unlock(&tasklist_lock);
>
I believe we should be fine with just the idr_for_each_entry_continue()
surrounding the loop, and not need the while (pid) around that.
That should still iterate over all the pids in the namespace, and
simplify the code even more.
You have done a great job understanding some complicated code, and
simplifying it during your Outreachy internship.
--
All Rights Reversed.
On 09/27, Rik van Riel wrote:
>
> > @@ -240,17 +230,18 @@ void zap_pid_ns_processes(struct pid_namespace
> > *pid_ns)
> >    *
> >    */
> >   read_lock(&tasklist_lock);
> > - nr = next_pidmap(pid_ns, 1);
> > - while (nr > 0) {
> > - rcu_read_lock();
> > + pid = idr_get_next(&pid_ns->idr, &nr);
> > + while (pid) {
> >
> > - task = pid_task(find_vpid(nr), PIDTYPE_PID);
> > - if (task && !__fatal_signal_pending(task))
> > - send_sig_info(SIGKILL, SEND_SIG_FORCED,
> > task);
> > + rcu_read_lock();
> >
> > + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
> > + task = pid_task(pid, PIDTYPE_PID);
> > + if (task && !__fatal_signal_pending(task))
> > + send_sig_info(SIGKILL,
> > SEND_SIG_FORCED, task);
> > + }
> >   rcu_read_unlock();
> >
> > - nr = next_pidmap(pid_ns, nr);
> >   }
> >   read_unlock(&tasklist_lock);
> >
>
> I believe we should be fine with just the idr_for_each_entry_continue()
> surrounding the loop,
Yes, and please move "nr = 2" close to idr_for_each_entry_continue() to
make it clear why do we use _continue (to skip nr == 1).
> and not need the while (pid) around that.
>
> That should still iterate over all the pids in the namespace, and
> simplify the code even more.
And make this patch correct ;)
because currently it is wrong, zap_pid_ns_processes() won't kill the pid
returned by the first idr_get_next().
Oleg.
On Wed, Sep 27, 2017 at 6:39 PM, Rik van Riel <[email protected]> wrote:
> On Wed, 2017-09-27 at 01:06 -0400, Gargi Sharma wrote:
>> This patch replaces the current bitmap implementation for
>> Process ID allocation. Functions that are no longer required,
>> for example, free_pidmap(), alloc_pidmap(), etc. are removed.
>> The rest of the functions are modified to use the IDR API.
>> The change was made to make the PID allocation less complex by
>> replacing custom code with calls to generic API.
>
> I like where this is going. Just a few last comments from me :)
>
>> --- a/arch/powerpc/platforms/cell/spufs/sched.c
>> +++ b/arch/powerpc/platforms/cell/spufs/sched.c
>> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s,
>> void *private)
>> LOAD_INT(c), LOAD_FRAC(c),
>> count_active_contexts(),
>> atomic_read(&nr_spu_contexts),
>> - task_active_pid_ns(current)->last_pid);
>> + task_active_pid_ns(current)->idr.idr_next-1);
>> return 0;
>> }
>
> Everywhere you use idr.idr_next or idr->idr_next directly,
> you could use idr_get_cursor(idr) instead, exposing less
> of the IDR internals to the rest of the code.
Yes, will change this in the next version.
>
>> @@ -240,17 +230,18 @@ void zap_pid_ns_processes(struct pid_namespace
>> *pid_ns)
>> *
>> */
>> read_lock(&tasklist_lock);
>> - nr = next_pidmap(pid_ns, 1);
>> - while (nr > 0) {
>> - rcu_read_lock();
>> + pid = idr_get_next(&pid_ns->idr, &nr);
>> + while (pid) {
>>
>> - task = pid_task(find_vpid(nr), PIDTYPE_PID);
>> - if (task && !__fatal_signal_pending(task))
>> - send_sig_info(SIGKILL, SEND_SIG_FORCED,
>> task);
>> + rcu_read_lock();
>>
>> + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
>> + task = pid_task(pid, PIDTYPE_PID);
>> + if (task && !__fatal_signal_pending(task))
>> + send_sig_info(SIGKILL,
>> SEND_SIG_FORCED, task);
>> + }
>> rcu_read_unlock();
>>
>> - nr = next_pidmap(pid_ns, nr);
>> }
>> read_unlock(&tasklist_lock);
>>
>
> I believe we should be fine with just the idr_for_each_entry_continue()
> surrounding the loop, and not need the while (pid) around that.
>
> That should still iterate over all the pids in the namespace, and
> simplify the code even more.
Yes, one loop suffices. I will fix this in the next version.
>
> You have done a great job understanding some complicated code, and
> simplifying it during your Outreachy internship.
Thanks!
Gargi
>
> --
> All Rights Reversed.
On 09/27, Gargi Sharma wrote:
>
> @@ -309,7 +168,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> tmp = ns;
> pid->level = ns->level;
> for (i = ns->level; i >= 0; i--) {
> - nr = alloc_pidmap(tmp);
> + int pid_min = 1;
> + idr_preload(GFP_KERNEL);
> + spin_lock_irq(&pidmap_lock);
> +
> + /*
> + * init really needs pid 1, but after reaching the maximum
> + * wrap back to RESERVED_PIDS
> + */
> + if (tmp->idr.idr_next > RESERVED_PIDS)
> + pid_min = RESERVED_PIDS;
> +
> + nr = idr_alloc_cyclic(&tmp->idr, pid, pid_min,
> + pid_max, GFP_ATOMIC);
> + spin_unlock_irq(&pidmap_lock);
> + idr_preload_end();
> +
> if (nr < 0) {
> retval = nr;
> goto out_free;
> @@ -346,12 +220,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> return pid;
>
> out_unlock:
> - spin_unlock_irq(&pidmap_lock);
> put_pid_ns(ns);
> + spin_unlock_irq(&pidmap_lock);
Why? No need to move put_pid_ns() under pidmap_lock, please remove this change...
> void __init pidhash_init(void)
> {
> + unsigned int pidhash_size;
> +
> pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
> HASH_EARLY | HASH_SMALL | HASH_ZERO,
> &pidhash_shift, NULL,
> 0, 4096);
> + pidhash_size = 1U << pidhash_shift;
> }
Hmm, this change makes no sense. And the next patch kills pidhash_init()
altogether.
Oleg.
On Wed, Sep 27, 2017 at 7:36 PM, Oleg Nesterov <[email protected]> wrote:
> On 09/27, Rik van Riel wrote:
>>
>> > @@ -240,17 +230,18 @@ void zap_pid_ns_processes(struct pid_namespace
>> > *pid_ns)
>> > *
>> > */
>> > read_lock(&tasklist_lock);
>> > - nr = next_pidmap(pid_ns, 1);
>> > - while (nr > 0) {
>> > - rcu_read_lock();
>> > + pid = idr_get_next(&pid_ns->idr, &nr);
>> > + while (pid) {
>> >
>> > - task = pid_task(find_vpid(nr), PIDTYPE_PID);
>> > - if (task && !__fatal_signal_pending(task))
>> > - send_sig_info(SIGKILL, SEND_SIG_FORCED,
>> > task);
>> > + rcu_read_lock();
>> >
>> > + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
>> > + task = pid_task(pid, PIDTYPE_PID);
>> > + if (task && !__fatal_signal_pending(task))
>> > + send_sig_info(SIGKILL,
>> > SEND_SIG_FORCED, task);
>> > + }
>> > rcu_read_unlock();
>> >
>> > - nr = next_pidmap(pid_ns, nr);
>> > }
>> > read_unlock(&tasklist_lock);
>> >
>>
>> I believe we should be fine with just the idr_for_each_entry_continue()
>> surrounding the loop,
>
> Yes, and please move "nr = 2" close to idr_for_each_entry_continue() to
> make it clear why do we use _continue (to skip nr == 1).
>
>> and not need the while (pid) around that.
>>
>> That should still iterate over all the pids in the namespace, and
>> simplify the code even more.
>
> And make this patch correct ;)
>
> because currently it is wrong, zap_pid_ns_processes() won't kill the pid
> returned by the first idr_get_next().
Yes, I missed this. I can simply remove the idr_get_next() before the
idr_for_each_continue and that will take care of it.
Thanks!
Gargi
>
> Oleg.
>
On 09/27, Gargi Sharma wrote:
>
> >
> > And make this patch correct ;)
> >
> > because currently it is wrong, zap_pid_ns_processes() won't kill the pid
> > returned by the first idr_get_next().
> Yes, I missed this. I can simply remove the idr_get_next() before the
> idr_for_each_continue and that will take care of it.
Yes, and also remove the while(pid) loop as Rik suggests, just do
read_lock(&tasklist_lock);
nr = 2;
idr_for_each_entry_continue(..., nr) {
kill pid_task()
}
read_unlock(&tasklist_lock);
Oleg.
On 09/27, Gargi Sharma wrote:
>
> -#define find_next_offset(map, off) \
> - find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
> -
this should go into the previous patch, but this is minor...
> @@ -208,12 +200,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>
> upid = pid->numbers + ns->level;
> spin_lock_irq(&pidmap_lock);
> - if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
> + if (!(ns->pid_allocated & PIDNS_ADDING))
> goto out_unlock;
> for ( ; upid >= pid->numbers; --upid) {
> - hlist_add_head_rcu(&upid->pid_chain,
> - &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
> - upid->ns->nr_hashed++;
> + upid->ns->pid_allocated++;
No, this is wrong.
It is too late to check PIDNS_HASH_ADDING/PIDNS_ADDING and increment pid_allocated,
once we call idr_alloc_cyclic() this pid is already "hashed" in that it can be found
by find_pid_ns() with this patch applied.
And of course, it is too late to do atomic_set(&pid->count, 1) and initialize
pid->tasks[type] lists by the same reason.
Oleg.
If I was not clear...
in short, after this patch the very first idr_alloc_cyclic() is already
wrong. Because, once again, the new not-fully-initialized pid can be found
by find_pid_ns().
perhaps you should change the previous patch to do
idr_alloc_cyclic(ptr = NULL) and use idr_replace() in this patch after
the PIDNS_HASH_ADDING check.
And I just noticed you didn't cc Eric Biederman <[email protected]>,
please do next time.
On 09/27, Oleg Nesterov wrote:
>
> On 09/27, Gargi Sharma wrote:
> >
> > -#define find_next_offset(map, off) \
> > - find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
> > -
>
> this should go into the previous patch, but this is minor...
>
> > @@ -208,12 +200,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> >
> > upid = pid->numbers + ns->level;
> > spin_lock_irq(&pidmap_lock);
> > - if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
> > + if (!(ns->pid_allocated & PIDNS_ADDING))
> > goto out_unlock;
> > for ( ; upid >= pid->numbers; --upid) {
> > - hlist_add_head_rcu(&upid->pid_chain,
> > - &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
> > - upid->ns->nr_hashed++;
> > + upid->ns->pid_allocated++;
>
> No, this is wrong.
>
> It is too late to check PIDNS_HASH_ADDING/PIDNS_ADDING and increment pid_allocated,
> once we call idr_alloc_cyclic() this pid is already "hashed" in that it can be found
> by find_pid_ns() with this patch applied.
>
> And of course, it is too late to do atomic_set(&pid->count, 1) and initialize
> pid->tasks[type] lists by the same reason.
>
> Oleg.
Gargi Sharma <[email protected]> writes:
> This patch series replaces kernel bitmap implementation of PID allocation
> with IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code.
>
> The following are the stats for pid and pid_namespace object files
> before and after the replacement. There is a noteworthy change between
> the IDR and bitmap implementation.
>
> Before
> text data bss dec hex filename
> 8447 3894 64 12405 3075 kernel/pid.o
> After
> text data bss dec hex filename
> 3301 304 0 3605 e15 kernel/pid.o
>
> Before
> text data bss dec hex filename
> 5692 1842 192 7726 1e2e kernel/pid_namespace.o
> After
> text data bss dec hex filename
> 2870 216 16 3102 c1e kernel/pid_namespace.o
>
> There wasn't a considerable difference between the time required for
> allocation of PIDs to the processes.
How about the time to call readdir on /proc?
How many processes were you testing with?
Why just the bitmap? Why not update the hash table as well.
An rbtree or an rhashtable might be better.
Hmm. Oh look there is patch 2/2 and you do replace the hashtable with
the idr as well. Now I am very interested in a comparison of data
structures.
How does the runtime memory footprint of your new pid hash
implementation compare to the old memory footprint?
There are a lot of options in this space and it does not sound like you
have looked at the options very thoroughly.
I am a little worried that in the quest to reuse code you may have made the
total amount of code executed larger and more susceptible to more cache line
misses.
From what Oleg has pointed out and from the holes in your
description I am generally leery of this patchset, as the attention to
detail appears lower than necessary for this to be more than a proof
of concept.
Eric
> ---
> Changes in v2:
> - Removed redundant IDR function that was introduced
> in the previous patchset.
> - Renamed PIDNS_HASH_ADDING
> - Used idr_for_each_entry_continue()
> - Used idr_find() to lookup pids
>
> Gargi Sharma (2):
> pid: Replace pid bitmap implementation with IDR API
> pid: Remove pidhash
>
> arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
> fs/proc/loadavg.c | 2 +-
> include/linux/init_task.h | 1 -
> include/linux/pid.h | 2 -
> include/linux/pid_namespace.h | 18 +--
> init/main.c | 3 +-
> kernel/fork.c | 2 +-
> kernel/pid.c | 239 +++++-------------------------
> kernel/pid_namespace.c | 54 +++----
> 9 files changed, 68 insertions(+), 255 deletions(-)
On Fri, 2017-09-29 at 01:09 +0530, Gargi Sharma wrote:
> 1000 processes that just sleep and sit around without doing
> anything(100 second sleep and then exit).
>
> pstree with 10,000 processes
> real 0m0.859s
> user 0m0.536s
> sys 0m0.172s
>
> ps with 10,000 processes
> real 0m0.918s
> user 0m0.100s
> sys 0m0.172s
>
> Stats for calling readdir on /proc with 10,000 processes
> real 0m0.092s
> user 0m0.000s
> sys 0m0.020s
Is that with or without your patches?
How does it compare to a kernel with(out) your patches?
--
All Rights Reversed.
On Thu, Sep 28, 2017 at 3:46 PM, Rik van Riel <[email protected]> wrote:
> On Fri, 2017-09-29 at 01:09 +0530, Gargi Sharma wrote:
>
>> 1000 processes that just sleep and sit around without doing
>> anything(100 second sleep and then exit).
>>
>> pstree with 10,000 processes
>> real 0m0.859s
>> user 0m0.536s
>> sys 0m0.172s
>>
>> ps with 10,000 processes
>> real 0m0.918s
>> user 0m0.100s
>> sys 0m0.172s
>>
>> Stats for calling readdir on /proc with 10,000 processes
>> real 0m0.092s
>> user 0m0.000s
>> sys 0m0.020s
>
> Is that with or without your patches?
>
> How does it compare to a kernel with(out) your patches?
Ah thanks for pointing this out. Those were without the patches.
Here are the stats for easier comparison.
With Patches Without patches
pstree
real 0m0.542s real 0m0.859s
user 0m0.335s user 0m0.536s
sys 0m0.150s sys 0m0.172s
ps
real 0m0.722s real 0m0.918s
user 0m0.064s user 0m0.100s
sys 0m0.162s sys 0m0.172s
readdir
real 0m0.080s real 0m0.092s
user 0m0.000s user 0m0.000s
sys 0m0.021s sys 0m0.020s
Thanks!
Gargi
>
> --
> All Rights Reversed.
On Fri, Sep 29, 2017 at 1:16 AM, Rik van Riel <[email protected]> wrote:
> On Fri, 2017-09-29 at 01:09 +0530, Gargi Sharma wrote:
>
>> 1000 processes that just sleep and sit around without doing
>> anything(100 second sleep and then exit).
>>
>> pstree with 10,000 processes
>> real 0m0.859s
>> user 0m0.536s
>> sys 0m0.172s
>>
>> ps with 10,000 processes
>> real 0m0.918s
>> user 0m0.100s
>> sys 0m0.172s
>>
>> Stats for calling readdir on /proc with 10,000 processes
>> real 0m0.092s
>> user 0m0.000s
>> sys 0m0.020s
>
> Is that with or without your patches?
>
> How does it compare to a kernel with(out) your patches?
>
> --
> All Rights Reversed.
On Fri, 2017-09-29 at 01:35 +0530, Gargi Sharma wrote:
> On Thu, Sep 28, 2017 at 3:46 PM, Rik van Riel <[email protected]>
> wrote:
> > On Fri, 2017-09-29 at 01:09 +0530, Gargi Sharma wrote:
> >
> > > 1000 processes that just sleep and sit around without doing
> > > anything(100 second sleep and then exit).
> >
> > Is that with or without your patches?
> >
> > How does it compare to a kernel with(out) your patches?
>
> Ah thanks for pointing this out. Those were without the patches.
> Here are the stats for easier comparison.
>
> With Patches Without patches
> pstree
> real 0m0.542s real 0m0.859s
> user 0m0.335s user 0m0.536s
> sys 0m0.150s sys 0m0.172s
>
> ps
> real 0m0.722s real 0m0.918s
> user 0m0.064s user 0m0.100s
> sys 0m0.162s sys 0m0.172s
>
> readdir
> real 0m0.080s real 0m0.092s
> user 0m0.000s user 0m0.000s
> sys 0m0.021s sys 0m0.020s
So your patches speed up the use of /proc?
I suspect pstree and ps benefit from the simplification
and speedup of find_pid_ns, which is called from
find_task_by_pid_ns.
That is great news.
--
All Rights Reversed.
On Wed, Sep 27, 2017 at 9:58 PM, Oleg Nesterov <[email protected]> wrote:
> If I was not clear...
>
> in short, after this patch the very first idr_alloc_cyclic() is already
> wrong. Because, once again, the new not-fully-initialized pid can be found
> by find_pid_ns().
If the PIDNS_ADDING check fails, I jump to the label that performs
this
while (++i <= ns->level)
idr_remove(&ns->idr, (pid->numbers + i)->nr);
So when find_pid_ns() is called, it will not find this pid.
>
> perhaps you should chane the previous patch to do
> idr_alloc_cyclic(ptr = NULL) and use idr_replace() in this patch after
> the PIDNS_HASH_ADDING check.
I'm not sure if I understand this. Do we want to do this to make sure
the pid namespace is
initialized before the first process enters into
the namespace? If yes, how does idr_alloc_cyclic(ptr = NULL) help?
>
> And I just noticed you didn't cc Eric Biederman <[email protected]>,
> please do next time.
Sorry for missing out on this. Will do with the next version.
Thanks!
Gargi
>
>
> On 09/27, Oleg Nesterov wrote:
>>
>> On 09/27, Gargi Sharma wrote:
>> >
>> > -#define find_next_offset(map, off) \
>> > - find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
>> > -
>>
>> this should go into the previous patch, but this is minor...
>>
>> > @@ -208,12 +200,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>> >
>> > upid = pid->numbers + ns->level;
>> > spin_lock_irq(&pidmap_lock);
>> > - if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
>> > + if (!(ns->pid_allocated & PIDNS_ADDING))
>> > goto out_unlock;
>> > for ( ; upid >= pid->numbers; --upid) {
>> > - hlist_add_head_rcu(&upid->pid_chain,
>> > - &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
>> > - upid->ns->nr_hashed++;
>> > + upid->ns->pid_allocated++;
>>
>> No, this is wrong.
>>
>> It is too late to check PIDNS_HASH_ADDING/PIDNS_ADDING and increment pid_allocated,
>> once we call idr_alloc_cyclic() this pid is already "hashed" in that it can be found
>> by find_pid_ns() with this patch applied.
>>
>> And of course, it is too late to do atomic_set(&pid->count, 1) and initialize
>> pid->tasks[type] lists by the same reason.
>>
>> Oleg.
>