2002-08-07 22:05:49

by Paul Larson

[permalink] [raw]
Subject: [PATCH] Linux-2.5 fix/improve get_pid()


This patch provides an improved version of get_pid() while also taking
care of the bug that causes the machine to hang when you hit PID_MAX.
It is based on both solutions to the problem provided by Hubertus Franke
and Andrea Arcangeli. This uses a bitmap to find an available pid and
uses Hubertus's adaptive implementation to only use this when it is more
beneficial than the old mechanism. The getpid_mutex from AA's patch is
also carried over to avoid the race where another cpu could get the same
pid before SET_LINKS was called.

This should patch cleanly against 2.5.30 or bk current.
Please apply.

--- a/kernel/fork.c Wed Aug 7 16:37:38 2002
+++ b/kernel/fork.c Wed Aug 7 16:05:22 2002
@@ -50,6 +50,12 @@

rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */

+/*
+ * Protectes next_safe, last_pid and it avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
unsigned long flags;
@@ -129,27 +135,107 @@
kmem_cache_free(task_struct_cachep,tsk);
}

-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
+/* this should be provided in every architecture */
+#ifndef SHIFT_PER_LONG
+#if BITS_PER_LONG == 64
+# define SHIFT_PER_LONG 6
+#elif BITS_PER_LONG == 32
+# define SHIFT_PER_LONG 5
+#else
+#error "SHIFT_PER_LONG"
+#endif
+#endif
+
+#define RESERVED_PIDS (300)
+#define GETPID_THRESHOLD (22000) /* when to switch to a mapped algo */
+#define PID_MAP_SIZE (PID_MAX >> SHIFT_PER_LONG)
+static unsigned long pid_map[PID_MAP_SIZE];
+static int next_safe = PID_MAX;
+
+static inline void mark_pid(int pid)
+{
+ __set_bit(pid,pid_map);
+}
+
+static int get_pid_by_map(int lastpid)
+{
+ static int mark_and_sweep = 0;
+
+ int round = 0;
+ struct task_struct *p;
+ int i;
+ unsigned long mask;
+ int fpos;
+
+
+ if (mark_and_sweep) {
+repeat:
+ mark_and_sweep = 0;
+ memset(pid_map, 0, PID_MAP_SIZE * sizeof(unsigned long));
+ lastpid = RESERVED_PIDS;
+ }
+ for_each_task(p) {
+ mark_pid(p->pid);
+ mark_pid(p->pgrp);
+ mark_pid(p->tgid);
+ mark_pid(p->session);
+ }
+
+ /* find next free pid */
+ i = (lastpid >> SHIFT_PER_LONG);
+ mask = pid_map[i] | ((1 << ((lastpid & (BITS_PER_LONG-1)))) - 1);
+
+ while ((mask == ~0) && (++i < PID_MAP_SIZE))
+ mask = pid_map[i];
+
+ if (i == PID_MAP_SIZE) {
+ if (round == 0) {
+ round = 1;
+ goto repeat;
+ }
+ next_safe = RESERVED_PIDS;
+ mark_and_sweep = 1; /* mark next time */
+ return 0;
+ }
+
+ fpos = ffz(mask);
+ i &= (PID_MAX-1);
+ lastpid = (i << SHIFT_PER_LONG) + fpos;
+
+ /* find next save pid */
+ mask &= ~((1 << fpos) - 1);
+
+ while ((mask == 0) && (++i < PID_MAP_SIZE))
+ mask = pid_map[i];
+
+ if (i==PID_MAP_SIZE)
+ next_safe = PID_MAX;
+ else
+ next_safe = (i << SHIFT_PER_LONG) + ffs(mask) - 1;
+ return lastpid;
+}

static int get_pid(unsigned long flags)
{
- static int next_safe = PID_MAX;
struct task_struct *p;
int pid;

- if (flags & CLONE_IDLETASK)
- return 0;
-
- spin_lock(&lastpid_lock);
if((++last_pid) & 0xffff8000) {
- last_pid = 300; /* Skip daemons etc. */
+ last_pid = RESERVED_PIDS; /* Skip daemons etc. */
goto inside;
}
if(last_pid >= next_safe) {
inside:
next_safe = PID_MAX;
read_lock(&tasklist_lock);
+ if (nr_threads > GETPID_THRESHOLD) {
+ last_pid = get_pid_by_map(last_pid);
+ if (last_pid == 0) {
+ last_pid = PID_MAX;
+ goto nomorepids;
+ }
+ } else {
+ int beginpid = last_pid;
repeat:
for_each_task(p) {
if(p->pid == last_pid ||
@@ -158,24 +244,33 @@
p->session == last_pid) {
if(++last_pid >= next_safe) {
if(last_pid & 0xffff8000)
- last_pid = 300;
+ last_pid = RESERVED_PIDS;
next_safe = PID_MAX;
}
+ if(last_pid == beginpid)
+ goto nomorepids;
goto repeat;
}
if(p->pid > last_pid && next_safe > p->pid)
next_safe = p->pid;
if(p->pgrp > last_pid && next_safe > p->pgrp)
next_safe = p->pgrp;
+ if(p->tgid > last_pid && next_safe > p->tgid)
+ next_safe = p->tgid;
if(p->session > last_pid && next_safe > p->session)
next_safe = p->session;
}
+ }
read_unlock(&tasklist_lock);
}
pid = last_pid;
- spin_unlock(&lastpid_lock);

return pid;
+
+nomorepids:
+ next_safe = last_pid = PID_MAX;
+ read_unlock(&tasklist_lock);
+ return 0;
}

static inline int dup_mmap(struct mm_struct * mm)
@@ -669,7 +764,14 @@
p->state = TASK_UNINTERRUPTIBLE;

copy_flags(clone_flags, p);
- p->pid = get_pid(clone_flags);
+ down(&getpid_mutex);
+ if (clone_flags & CLONE_IDLETASK)
+ p->pid = 0;
+ else {
+ p->pid = get_pid(clone_flags);
+ if (p->pid == 0)
+ goto bad_fork_cleanup;
+ }
p->proc_dentry = NULL;

INIT_LIST_HEAD(&p->run_list);
@@ -793,10 +895,20 @@
list_add(&p->thread_group, &current->thread_group);
}

+ /*
+ * We must do the SET_LINKS() under the getpid_mutex, to avoid
+ * another CPU to get our same PID between the release of of the
+ * getpid_mutex and the SET_LINKS().
+ *
+ * In short to avoid SMP races the new child->pid must be just visible
+ * in the tasklist by the time we drop the getpid_mutex.
+ */
SET_LINKS(p);
+
hash_pid(p);
nr_threads++;
write_unlock_irq(&tasklist_lock);
+ up(&getpid_mutex);
retval = 0;

fork_out:
@@ -819,6 +931,7 @@
bad_fork_cleanup_security:
security_ops->task_free_security(p);
bad_fork_cleanup:
+ up(&getpid_mutex);
put_exec_domain(p->thread_info->exec_domain);
if (p->binfmt && p->binfmt->module)
__MOD_DEC_USE_COUNT(p->binfmt->module);


2002-08-07 23:04:45

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Paul Larson wrote:
>
> This patch provides an improved version of get_pid() while also taking
> care of the bug that causes the machine to hang when you hit PID_MAX.

Has this been evaluated against Bill Irwin's constant-time
allocator? Bill says it has slightly worse normal-case and
vastly improved worst-case performance over the stock allocator.
Not sure how it stacks up against this one. Plus it's much nicer
looking code.

He's shy, so.....



#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/init.h>

/*
* pid allocator
* (C) 2002 William Irwin, IBM
*
* The strategy is to maintain a tower of bitmaps where a bitmap above
* another in each bit accounts whether any pid's are available in the
* space tracked by BITS_PER_LONG bits of the level below. The bitmaps
* must be marked on allocation and also release, hence some
* infrastructure for detecting when the last user of a pid releases it
* must be in place.
*
* This general strategy is simple in concept and enforces highly
* deterministic bounds on the search time for the next pid.
*/

#define PID_MAX 0x8000
#define RESERVED_PIDS 300

#define MAP0_SIZE (PID_MAX >> BITS_PER_LONG_SHIFT)
#define MAP1_SIZE (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
#define MAP2_SIZE (MAP1_SIZE >> BITS_PER_LONG_SHIFT)

#define MAP0_SHIFT BITS_PER_LONG_SHIFT
#define MAP1_SHIFT (2*BITS_PER_LONG_SHIFT)
#define MAP2_SHIFT (3*BITS_PER_LONG_SHIFT)

#define PID_MAP_MASK (BITS_PER_LONG - 1)

#define PID_MAP_DEPTH (ARRAY_SIZE(pid_map) - 1)

static unsigned long pid_map0[MAP0_SIZE];
static unsigned long pid_map1[MAP1_SIZE];
static unsigned long pid_map2[MAP2_SIZE];

static unsigned long * pid_map[] = { pid_map0, pid_map1, pid_map2, NULL, };

unsigned long last_pid = 0;
unsigned long npids = 0;

static const int map_shifts[] =
{ 0,
BITS_PER_LONG_SHIFT,
BITS_PER_LONG_SHIFT*2,
BITS_PER_LONG_SHIFT*3,
BITS_PER_LONG_SHIFT*4,
};

static inline int pid_map_shift(int depth)
{
return map_shifts[depth+1];
}

static spinlock_t pid_lock = SPIN_LOCK_UNLOCKED;

void free_pid(unsigned long pid)
{
unsigned long **map = pid_map;

spin_lock(&pid_lock);
while (*map) {
int bit = pid & PID_MAP_MASK;
pid >>= BITS_PER_LONG_SHIFT;
__clear_bit(bit, &(*map)[pid]);
++map;
}
--npids;
spin_unlock(&pid_lock);
}

static inline int whole_block_used(int level, unsigned long pid)
{
return pid_map[level][pid >> pid_map_shift(level)] == ~0UL;
}

static inline void mark_pid(unsigned long pid)
{
int level;
for (level = 0; level < PID_MAP_DEPTH; ++level) {
int shift, bit;
unsigned long entry;

shift = pid_map_shift(level);
entry = pid >> shift;
bit = (pid >> (shift - BITS_PER_LONG_SHIFT)) & PID_MAP_MASK;
if (level == 0 || whole_block_used(level - 1, pid))
__set_bit(bit, &pid_map[level][entry]);
else
break;
}
++npids;
}

static inline int pid_map_limit(int depth)
{
return PID_MAX >> pid_map_shift(depth);
}

#ifdef PID_ALLOC_EXAMPLE
/*
* the pid allocation traverses the bitmaps by recursively ffz'ing
* through down the tower of maps. Some additional logic is required
* to enforce lower limits, but the following example of how one
* would perform this search without the lower limit may well prove
* enlightening for those interested in the mechanics of the algorithm.
*/
static long alloc_pid_from_zero(void)
{
unsigned long pid = 0;
int level;

for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
unsigned long entry = pid_map[level][pid];

if (unlikely(entry == ~0UL))
return ~0UL;

pid = (pid << BITS_PER_LONG_SHIFT) + ffz(pid_map[level][pid]);
}
return pid;
}
#endif /* PID_ALLOC_EXAMPLE */


static const unsigned long pid_max_levels[] =
{ PID_MAX >> BITS_PER_LONG_SHIFT,
PID_MAX >> (BITS_PER_LONG_SHIFT*2),
PID_MAX >> (BITS_PER_LONG_SHIFT*3),
PID_MAX >> (BITS_PER_LONG_SHIFT*4),
};

static inline unsigned long pid_map_digit(int level, unsigned long limit)
{
return (limit >> pid_map_shift(level-1)) & PID_MAP_MASK;
}

/*
* Scratch space for storing the digits of the limit, all accesses
* protected by the pid_lock.
*/
static unsigned long limit_digits[4];

/*
* This is not a high-performance implementation. alloc_pid_after()
* can be made highly compact with some effort, but this is instead
* meant to be clear. As the cost of fork() is dominated by much
* more expensive operations and the cost of this is constant-bounded
* by a very low constant, the gains from manual optimization here
* are marginal.
*/
static long alloc_pid_after(unsigned long limit)
{
unsigned long pid = 0;
int level;

/*
* The limit passed to us is a strict lower limit. It is more
* convenient to work with <= constraints.
*/
++limit;
if (unlikely(limit == PID_MAX))
return ~0UL;

for (level = 0; level < PID_MAP_DEPTH; ++level) {
limit_digits[level] = limit & PID_MAP_MASK;
limit >>= BITS_PER_LONG_SHIFT;
}

/*
* Now the lowest available pid number above limit is
* reconstructed by ffz'ing down the bitmap and checking
* each digit against the digits of the limit for
* dictionary ordering. If the check should fail, it's
* fixed up by using the maximum of the two digits. The
* dictionary ordering on digits also means that a
* greater significant digit found in the bitmap
* invalidates all further comparisons, which requires
* fallback to the pure recursive ffz algorithm outlined
* above in order to be handled.
*/
for (level = PID_MAP_DEPTH - 1; level >= 0; --level) {
unsigned long bit, digit;

if (unlikely(pid >= pid_max_levels[level]))
return ~0UL;

bit = ffz(pid_map[level][pid]);
digit = limit_digits[level];

if (unlikely(bit < digit))
bit = digit;

pid = (pid << BITS_PER_LONG_SHIFT) + bit;

/*
* This is not an optimization; if this check
* should succeed the digit comparisons above
* are no longer valid and (pessimistically)
* incorrect first available pid's are found.
*
*/
if (likely(bit > digit)) {
--level;
goto finish_just_ffz;
}
}
out:
if (pid < PID_MAX)
return pid;
else
return ~0UL;

finish_just_ffz:
/*
* Now revert to the pure recursive ffz algorithm with
* the slight variation of not beginning at a fixed level,
* because it is no longer valid to perform comparisons
* of the digit obtained by ffz'ing the bitmap against the
* digits of the limit.
*/
while (level >= 0) {
unsigned long bit;

if (unlikely(pid >= pid_max_levels[level]))
return ~0UL;

bit = ffz(pid_map[level][pid]);
pid = (pid << BITS_PER_LONG_SHIFT) + bit;
--level;
}

goto out;
}

int alloc_pid(void)
{
unsigned long pid;

spin_lock(&pid_lock);
BUG_ON(last_pid >= PID_MAX);
pid = alloc_pid_after(last_pid);
if (unlikely(pid == ~0UL)) {
pid = alloc_pid_after(RESERVED_PIDS);
if (unlikely(pid == ~0UL))
goto out;
BUG_ON(pid < RESERVED_PIDS);
} else
BUG_ON(pid <= last_pid);
last_pid = pid;
mark_pid(pid);
out:
spin_unlock(&pid_lock);
return (int)pid;
}

void __init pid_init(void)
{
mark_pid(0);
}

2002-08-08 00:20:50

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator? Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one. Plus it's much nicer
> looking code.

> #define PID_MAX 0x8000
> #define RESERVED_PIDS 300
>
> #define MAP0_SIZE (PID_MAX >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE (MAP1_SIZE >> BITS_PER_LONG_SHIFT)
>
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

static pid_t last_pid;

pid_t get_next_pid() {
return ++last_pid;
}

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries

2002-08-08 19:02:22

by Hubertus Franke

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






Which one sounds like the best one ?

Assuming that for now we have to stick to 16-bit pid_t ....
There are the following patches out there ....:

(a) Vanilla version which breaks down after 22K pids and really sucks above
32.5K
(b) Bill Irwin's, which keeps track of which PID is free and which one is
not ?
(c) Andrea's patch which searches in the bitmap when we are running out
(d) Paul's patch, which I believe was based on one of my earlier submission
(03/02) that
essentially switches between (a)+(c) at
the break even point.

Assuming that Paul's patch still resembles somewhat my earlier intentions:
My observation of (d) back in february was that the current approach with
using next_pid = ++last_pid in the general case is pretty good and that
only the determination of a guaranteed free range sucks due to O(N**2)
algorithm sucks and using a bitmap to determine the next safe range
through a O(N) algorithm is good. I determined the breakeven point with
random pid deletion to be ~22K where the current algorithm worked darn
well.
One difference to Andrea's patch (c) is (if I remember his code correctly)
that
I used was that I always look forward in the bitmap for the next safe range
in the
bitmap when I exhausted the previous one without having to traverse the
task list.
Only if non is found I traverse the task list and try the bit search one
more time.

I feel (c) or (d) is a better solution over (a) and (b)... Open for
discussion.
I have a test program that does the random pid deletion and pid allocation
in user space. All what's required is to copy the get_pid() code from the
kernel into there... Can make that available ...

I don't know what Paul has done to the patch since then ....

-- Hubertus Franke ([email protected])






Andries Brouwer
<[email protected]> To: Andrew Morton <[email protected]>
cc: Paul Larson <[email protected]>, Linus Torvalds <torvalds@transmeta.
08/07/2002 08:24 com>, lkml <[email protected]>, [email protected], Hubertus
PM Franke/Watson/IBM@IBMUS, [email protected]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






On Wed, Aug 07, 2002 at 04:06:05PM -0700, Andrew Morton wrote:

> Has this been evaluated against Bill Irwin's constant-time
> allocator? Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one. Plus it's much nicer
> looking code.

> #define PID_MAX 0x8000
> #define RESERVED_PIDS 300
>
> #define MAP0_SIZE (PID_MAX >> BITS_PER_LONG_SHIFT)
> #define MAP1_SIZE (MAP0_SIZE >> BITS_PER_LONG_SHIFT)
> #define MAP2_SIZE (MAP1_SIZE >> BITS_PER_LONG_SHIFT)
>
> static unsigned long pid_map0[MAP0_SIZE];
> static unsigned long pid_map1[MAP1_SIZE];
> static unsigned long pid_map2[MAP2_SIZE];

Here it is of interest how large a pid is.
With a 64-bit pid_t it is just

static pid_t last_pid;

pid_t get_next_pid() {
return ++last_pid;
}

since 2^64 is a really large number.
Unfortunately glibc does not support this (on x86).

With a 32-bit pid_t wraparounds will occur, but very infrequently.
Thus, finding the next pid will be very fast on average, much faster
than the above algorithm, and no arrays are required.
One only loses the guaranteed constant time property.
Unless hard real time is required, this sounds like the best version.

Andries



2002-08-08 19:17:05

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-08 19:20:53

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, 2002-08-08 at 13:56, Hubertus Franke wrote:
> (a) Vanilla version which breaks down after 22K pids and
> really sucks above 32.5K
> (b) Bill Irwin's, which keeps track of which PID is free and
> which one is not ?
> (c) Andrea's patch which searches in the bitmap when we are
> running out
> (d) Paul's patch, which I believe was based on one of my earlier
> submission (03/02) that essentially switches between (a)+(c) at
> the break even point.

> I feel (c) or (d) is a better solution over (a) and (b)... Open for
> discussion.
> I have a test program that does the random pid deletion and pid allocation
> in user space. All what's required is to copy the get_pid() code from the
> kernel into there... Can make that available ...
>
> I don't know what Paul has done to the patch since then ....

It's pretty much the same, with a couple of small changes. After
forward porting your patch and the one from AA's patches (hch says that
one was actually done by Ihno Krumreich, thanks for the info), I added
the fix from (c) for the race, and moved the check for if(flags &
CLONE_IDLETASK) outside of get_pid into copy_process() since there's no
need to call get_pid and return just to find out that we need to use 0.

-Paul Larson

2002-08-08 19:39:40

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, Aug 08, 2002 at 02:24:19AM +0200, Andries Brouwer wrote:
> Here it is of interest how large a pid is.
> With a 64-bit pid_t it is just
> static pid_t last_pid;
> pid_t get_next_pid() {
> return ++last_pid;
> }
> since 2^64 is a really large number.
> Unfortunately glibc does not support this (on x86).
> With a 32-bit pid_t wraparounds will occur, but very infrequently.
> Thus, finding the next pid will be very fast on average, much faster
> than the above algorithm, and no arrays are required.
> One only loses the guaranteed constant time property.
> Unless hard real time is required, this sounds like the best version.

The goal of the work that produced this was to remove the global
tasklist. Changing ABI's and/or breaking userspace was not "within the
rules" of that. My allocator relies on that other infrastructure for
notification of release of pid's, and is really only meant to remove
the reliance of fork() on the tasklist. That work is probably more
relevant to heavy tty usage than forkbombs, despite the O(1) get_pid().
I am glad people happen to like various tidbits of it, though. =)


Cheers,
Bill

2002-08-08 20:20:09

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator? Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one. Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do.

I don't see why we would need the "pid" until the very last moment, at
which point we already have the tasklist lock, in fact.

And I hate overengineering.

Linus

2002-08-08 20:43:27

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, Aug 08, 2002 at 12:42:38PM -0700, William Lee Irwin III wrote:

> The goal of the work that produced this was to remove the global
> tasklist. Changing ABI's and/or breaking userspace was not "within the
> rules" of that.

It feels wrong to add such complexity and simultaneously keep
such a small pid_t.

Very soon 30000 processes will not be enough.

Using a 32-bit pid_t does not break userspace. Indeed, user space uses
a 32-bit pid_t today. The only complaint I have heard was from
Albert Cahalan who maintains ps and was afraid that the ps output
would become uglier if pids would get more digits.

It is a real pity that going to a 64-bit pid is impossible (on x64).

Many algorithms can be really efficient if you have a large space
to work in. For example, I do not know what your motivation was
for wanting to remove the global tasklist. It is certainly needed
for sending signals. But if you want to avoid access to global stuff
in a MP situation, then it is easy to partition the pid space
so that each processor only gives out pids in its own region.
(So that simultaneous forks do not interfere.)

Andries

2002-08-08 21:27:15

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Followup to: <[email protected]>
By author: Linus Torvalds <[email protected]>
In newsgroup: linux.dev.kernel
>
> Guys, this discussion is getting ridiculous.
>
> Doing a bit allocator should be trivial, but it's hard to know when a bit
> is to be free'd. You can't just do it at "exit()" time, because even if
> pid X exits, that doesn't mean that X can be re-used: it may still be used
> as a pgid or a tid by some other process Y.
>
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
>

Even so, we need to maintain Not Recently Used semantic. A discussion
on #kernel seems to have ended up with recommending a design target of
"no pid reuse within 30 seconds", with 1 second being an absolute
requirement.

-hpa
--
<[email protected]> at work, <[email protected]> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt <[email protected]>

2002-08-08 21:41:48

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

I implemented the rest of it, based on maintaining hashtables for the
tgid, pgid, and sid as well as the pid itself. get_pid() was not the
focus of it.


Cheers,
Bill

2002-08-08 21:46:11

by Hubertus Franke

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






Agreed ....
The mark-and-sweep is the correct method for 16-bits ! For 32-bits its not
possible !
Two level is over-engineered (as I told Paul).

If you want I can post the data that I collected comparing
(a) stock kernel
(b) my mark and sweep
(c) my double mark and sweep (try looking forward and then scan task list)
?

Let me know if you are interested.
That should clear up the issue quickly giving you some average allocation
times etc.....
as a function of random used pids.



Hubertus Franke
Enterprise Linux Group (Mgr), Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: [email protected]
(w) 914-945-2003 (fax) 914-945-4425 TL: 862-2003





Linus Torvalds
<torvalds@transme To: Andrew Morton <[email protected]>
ta.com> cc: Paul Larson <[email protected]>, lkml <[email protected]>,
<[email protected]>, Hubertus Franke/Watson/IBM@IBMUS, <[email protected]>
08/08/2002 04:24 Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()
PM






On Wed, 7 Aug 2002, Andrew Morton wrote:
>
> Has this been evaluated against Bill Irwin's constant-time
> allocator? Bill says it has slightly worse normal-case and
> vastly improved worst-case performance over the stock allocator.
> Not sure how it stacks up against this one. Plus it's much nicer
> looking code.

Guys, this discussion is getting ridiculous.

Doing a bit allocator should be trivial, but it's hard to know when a bit
is to be free'd. You can't just do it at "exit()" time, because even if
pid X exits, that doesn't mean that X can be re-used: it may still be used
as a pgid or a tid by some other process Y.

So if you really want to take this approach, you need to count the uses of
"pid X", and free the bitmap entry only when that count goes to zero. I
see no such logic in Bill Irwin's code, only a comment about last use
(which doesn't explain how to notice that last use).

Without that per-pid-count thing clarified, I don't think the (otherwise
fairly straightforward) approach of Bills really flies.

For that reason I think the mark-and-sweep thing is the right thing to do,
but I think the two-level algorithm is just over-engineering and not worth
it. And I do hate that getpid_mutex thing. Having a blocking lock for
something as simple as pid allocation just smells horribly wrong to me.

Moving the pid allocation to later (so that it doesn't need to handle
operations that block in between allocation and "we're done" and the pid
allocation can be a spinlock) sounds like a fairly obvious thing to do.

I don't see why we would need the "pid" until the very last moment, at
which point we already have the tasklist lock, in fact.

And I hate overengineering.

Linus




2002-08-08 21:47:26

by Hubertus Franke

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






That is true. All was done under the 16-bit assumption
My hunch is that the current algorithm might actually work quite well
for a sparsely populated pid-space (32-bits).
A bitmap-ed based solution is not possible at that point due to space
requirements.

Should be easy to figure out.

Hubertus Franke
Enterprise Linux Group (Mgr), Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: [email protected]
(w) 914-945-2003 (fax) 914-945-4425 TL: 862-2003





Rik van Riel
<riel@conectiva. To: Hubertus Franke/Watson/IBM@IBMUS
com.br> cc: Andries Brouwer <[email protected]>, Andrew Morton <[email protected]>,
<[email protected]>, <[email protected]>, lkml <[email protected]>, Paul Larson
08/08/2002 04:15 <[email protected]>, Linus Torvalds <[email protected]>
PM Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






On Thu, 8 Aug 2002, Hubertus Franke wrote:

> Which one sounds like the best one ?
>
> Assuming that for now we have to stick to 16-bit pid_t ....

That assumption is pretty central to the debate.

I don't see the standard get_pid nor the bitmap based
get_pid scale to something larger than a 16-bit pid_t.

If we're not sure yet whether we want to keep pid_t 16
bits it might be worth putting in an algorithm that does
scale to larger numbers, if only so the switch to a larger
pid_t will be more straightforward.

kind regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/




2002-08-08 21:59:12

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On Thu, 8 Aug 2002, Hubertus Franke wrote:
>
> That is true. All was done under the 16-bit assumption
> My hunch is that the current algorithm might actually work quite well
> for a sparsely populated pid-space (32-bits).

I agree.

So let's just try Andries approach, suggested patch as follows..

(yeah, silly mask and MAX_PID, but since even the kernel uses signed
integers for some of it, this way it never gets close to being an issue).

Linus

----
--- 1.2/include/linux/threads.h Tue Feb 5 07:23:04 2002
+++ edited/include/linux/threads.h Thu Aug 8 14:58:28 2002
@@ -19,6 +19,7 @@
/*
* This controls the maximum pid allocated to a process
*/
-#define PID_MAX 0x8000
+#define PID_MASK 0x3fffffff
+#define PID_MAX (PID_MASK+1)

#endif
===== kernel/fork.c 1.57 vs edited =====
--- 1.57/kernel/fork.c Tue Jul 30 15:49:20 2002
+++ edited/kernel/fork.c Thu Aug 8 15:00:16 2002
@@ -142,7 +142,7 @@
return 0;

spin_lock(&lastpid_lock);
- if((++last_pid) & 0xffff8000) {
+ if((++last_pid) & ~PID_MASK) {
last_pid = 300; /* Skip daemons etc. */
goto inside;
}
@@ -157,7 +157,7 @@
p->tgid == last_pid ||
p->session == last_pid) {
if(++last_pid >= next_safe) {
- if(last_pid & 0xffff8000)
+ if(last_pid & ~PID_MASK)
last_pid = 300;
next_safe = PID_MAX;
}

2002-08-08 22:21:27

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On Thu, 8 Aug 2002, Linus Torvalds wrote:
>
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million.
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
int i;
for (i = 1; i < 250000; i++) {
if (!vfork())
exit(1);
if (waitpid(-1, NULL, WNOHANG) < 0)
perror("waitpid");
}
return 0;
}


2002-08-08 22:33:39

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, 8 Aug 2002, Linus Torvalds wrote:
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> >
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
>
> I agree.
>
> So let's just try Andries approach, suggested patch as follows..

Hmmm, I wonder how badly the system will behave when
we need to reset last_pid and next_safe with 30000
pids in use ...

regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-08 22:37:12

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
>
> On Thu, 8 Aug 2002, Hubertus Franke wrote:
> >
> > That is true. All was done under the 16-bit assumption
> > My hunch is that the current algorithm might actually work quite well
> > for a sparsely populated pid-space (32-bits).
>
> I agree.
The original issue that I had with all of this is the fact that if the
current algorithm can't find an available pid, it just sits there
churning forever and hangs the machine. My original patch was really
just a very basic fix for that (see the 2.4 tree). This makes it far
more unlikely for us to max out, but if we do aren't we just going to
have the same trouble all over?

Thanks,
Paul Larson

2002-08-08 22:40:47

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On 8 Aug 2002, Paul Larson wrote:

> The original issue that I had with all of this is the fact that if the
> current algorithm can't find an available pid, it just sits there
> churning forever and hangs the machine. My original patch was really
> just a very basic fix for that (see the 2.4 tree). This makes it far
> more unlikely for us to max out, but if we do aren't we just going to
> have the same trouble all over?

You'll need at least 330 million tasks to run out.

At a minimum kernel memory allocation of about 8 kB per
task, that's about 2600 GB of kernel data structures.

I'm not sure we'll hit that limit, ever. Not because
we won't have a TB of kernel data space at some point
in the future, but because 330 million tasks is a lot
more than we'd want to manage with just a few CPUs ;)

kind regards,

Rik
--
http://www.linuxsymposium.org/2002/
"You're one of those condescending OLS attendants"
"Here's a nickle kid. Go buy yourself a real t-shirt"

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-08 23:06:56

by Albert D. Cahalan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Linus Torvalds writes:

> "ps" seems to do ok from a visual standpoint at least up to 99 million.
> Maybe it won't look that good after that, I'm too lazy to test.

Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
The standard tty is 80x24 BTW, and we already have serious
problems due to ever-expanding tty names.

How about a default limit of 9999, to be adjusted by
sysctl as needed?

2002-08-09 00:49:33

by Hubertus Franke

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()






I package my stuff up that allows to pull this into user space and run much
more efficient
pid allocation test for performance....
I'll also include the comparision numbers. Currently remote, so I don't
have access to
that info.
Based on that though it seems with random pid deletion (we surely could
argue about
that one) there still seems reasonable benefits to "consider" a twolevel
algo.
More tomorrow.

Hubertus Franke
Enterprise Linux Group (Mgr), Linux Technology Center (Member Scalability)
, OS-PIC (Chair)
email: [email protected]
(w) 914-945-2003 (fax) 914-945-4425 TL: 862-2003





Linus Torvalds
<torvalds@transme To: Hubertus Franke/Watson/IBM@IBMUS
ta.com> cc: Rik van Riel <[email protected]>, Andries Brouwer <[email protected]>,
Andrew Morton <[email protected]>, <[email protected]>, <[email protected]>, lkml <linux-
08/08/2002 06:26 [email protected]>, Paul Larson <[email protected]>
PM Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()







On Thu, 8 Aug 2002, Linus Torvalds wrote:
>
> So let's just try Andries approach, suggested patch as follows..

"ps" seems to do ok from a visual standpoint at least up to 99 million.
Maybe it won't look that good after that, I'm too lazy to test.

The following trivial program is useful for efficiently allocating pid
numbers without blowing chunks on the VM subsystem and spending all the
time on page table updates - for people who want to test (look out: I've
got dual 2.4GHz CPU's with HT, so getting up to 10+ million was easy, your
milage may wary and at some point you should just compile a kernel that
starts higher ;).

Linus

---
#include <sys/types.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>

int main()
{
int i;
for (i = 1; i < 250000; i++) {
if (!vfork())
exit(1);
if (waitpid(-1, NULL, WNOHANG) < 0)
perror("waitpid");
}
return 0;
}





2002-08-09 01:45:32

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, Aug 08, 2002 at 07:37:08PM -0300, Rik van Riel wrote:

> Hmmm, I wonder how badly the system will behave when
> we need to reset last_pid and next_safe with 30000
> pids in use ...

On average very well, that is, the time spent in finding
the next pid is very small on average, but once in a while
there is a small hiccup.
On a hard real time system it may be reasonable to choose
for an average that is ten times higher and avoid the hiccup.

For systems that really have lots of very short-lived
processes, one might do what I suggested a moment ago
for a MP context: have the processes live in several lists,
that each only use part of the pid-space.

(One wants for_each_task to take a limited amount of time.
With N tasks, and sqrt(N) list heads one first picks the
shortest list, then loops over the list, all in time of order
sqrt(N). That works well, certainly up to a million processes.)

Andries

2002-08-09 03:23:00

by Chris Adams

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Once upon a time, Albert D. Cahalan <[email protected]> said:
>Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>The standard tty is 80x24 BTW, and we already have serious
>problems due to ever-expanding tty names.
>
>How about a default limit of 9999, to be adjusted by
>sysctl as needed?

I hope you meant 99999 (since we already have 5 digit PIDs). It would
also seem to me that if it is adjustable, then "ps" would have to handle
it anyway, and making "ps" deal with adjustable size PIDs would be more
complex and error-prone.

Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
rest of the 32 bits go for cluster node, node sequence, and an unused
sign bit) without significant problems. Their "ps" args aren't an exact
match, but they're close (lots of processes snipped):

$ ps -fj | head -2
UID PID PPID C STIME TTY TIME CMD
cmadams 272363 301021 0.0 20:47:46 pts/1 0:00.09 -bash (bash)
$ ps -lf | head -2
F S UID PID PPID %CPU PRI NI RSS WCHAN STARTED TIME COMMAND
80c08001 I 200 272363 301021 0.0 44 0 520K wait 20:47:46 0:00.09 -bash (bash)
$ ps j | head -2
USER PID PPID PGID SESS JOBC S TTY TIME COMMAND
cmadams 272363 301021 272363 272363 0 I pts/1 0:00.09 -bash (bash)
$

I had exactly one problem moving to 5.x with larger PIDs: the free "top"
program (Tru64 doesn't include "top") assumed 5 digit PIDs so the
columns didn't line up until I changed the output format by one column.

I routinely have 1000+ processes running on this server (many of them
short-lived things like POP checks, short CGIs, sendmail background
delivery, etc.), so having a larger PID space is important (having the
same PID reused within 15-30 seconds would be annoying).

I would like to see Linux running on this server (or at least this class
of server), and limiting the number of PIDs because of "ps" formatting
is not the way to go.

--
Chris Adams <[email protected]>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

2002-08-09 04:39:09

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Thu, Aug 08, 2002 at 01:24:35PM -0700, Linus Torvalds wrote:
> So if you really want to take this approach, you need to count the uses of
> "pid X", and free the bitmap entry only when that count goes to zero. I
> see no such logic in Bill Irwin's code, only a comment about last use
> (which doesn't explain how to notice that last use).
> Without that per-pid-count thing clarified, I don't think the (otherwise
> fairly straightforward) approach of Bills really flies.

One big thing to bear in mind is that it is actually part of a much
larger work, one which is not centered around get_pid(), and which is
not yet ready for inclusion, or even widespread review. So please give
me time to finish it, and defer judgment until it is complete.

(1) akpm did not post the full patch, only the "after" picture of one file.
(2) The per-id accounting is properly implemented, with caveats
unrelated to the general accounting method. Yes, I am well
aware of the need to be notified on release at points other
than exit(), and I have implemented that notification.
(3) The patch as it is intended to be is largely a tty and job control
cleanup. get_pid() changes are required as the central feature
is the removal of the list of all tasks, upon which the current
get_pid() relies.
(4) pid hashing actually creates idtag objects for something guaranteed
to be unique. This is so stupid I consider it a bug.
(5) The patch is not yet finished.

Please defer judgment until I am ready to present as a finished work what
is now a work in progress and barely if even out of the "debug" phase.

The last fully-ported version of the patch, which was originally put
on-line only to facilitate communication with reviewers and
contributors, prior to the initial release (and by a very large margin)
is available from the following URL:

ftp://ftp.kernel.org/pub/linux/kernel/people/wli/task_mgmt/for_each_task-2.5.23-1

This patch does not contain a complete implementation of what I would
like to present when I feel ready to submit it.

While I thought I came up with something "nifty" in the way of a
get_pid() as a result of this work, its primary focus is really to
clean up tty and job control code. As it stands now, it does very
little in the way of cleaning it up, only converting it to use the new
infrastructure as a replacement for for_each_task() in the most
straightforward and braindead ways imaginable. Several bugs are known
to exist, but the full patch with all fixes has not yet been ported to
current mainline, and I won't have time to devote to it for some time.
This patch needs much further work, and that work is not yet finished.
Please defer judgment until I can actually finish it. This will
probably have to wait until 2.7 or even later.


Thanks,
Bill

2002-08-09 07:00:57

by Albert D. Cahalan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <[email protected]> said:

>> Mind sharing what "ps -fj", "ps -lf", and "ps j" look like?
>> The standard tty is 80x24 BTW, and we already have serious
>> problems due to ever-expanding tty names.
>>
>> How about a default limit of 9999, to be adjusted by
>> sysctl as needed?
>
> I hope you meant 99999 (since we already have 5 digit PIDs). It would
> also seem to me that if it is adjustable, then "ps" would have to handle
> it anyway, and making "ps" deal with adjustable size PIDs would be more
> complex and error-prone.

BTW, let's start with: not only "ps" is affected.
Off the top of my head: netstat, fuser, top, pstree...

I almost put 99999, but then I realized that that's silly.
For years Linux had a hard limit of about 4000 processes,
and not many people complained. It sure would be nice to
gain back a few of the columns lost to other stuff, so
that people could once again see command arguments.

The two real-word usage examples close at hand:

a. My full GNOME desktop has 48 processes.

b. The main UNIX shell server for CS students
at this university has 79 processes.
(Tru64, several hundred CS students, 2:25 am)

For "ps", adjustable width is a trivial addition.
Notice the UID ("ps -l", "ps -lf) handling, and
notice what signals ("ps s") do on a wide screen.
I could also just let the output be ugly, since
no normal system will have so many processes.

The problem is screen space, pure and simple. If the
default limit goes to over 1 billion, then "ps" output
must wrap lines. There is no alternative, unless you
think "System going down to reset PID numbers!" is OK.

> Tru64 Unix 5.x uses 19 bit pids (up to 524288, so up to six digits - the
> rest of the 32 bits go for cluster node, node sequence, and an unused
> sign bit) without significant problems. Their "ps" args aren't an exact
> match, but they're close (lots of processes snipped):
>
> $ ps -fj | head -2
> UID PID PPID C STIME TTY TIME CMD
> cmadams 272363 301021 0.0 20:47:46 pts/1 0:00.09 -bash (bash)

Linux (and all SysV if I remember right) has 4 columns
of PID info that would need to expand:

UID PID PPID PGID SID C STIME TTY TIME CMD
albert 12975 12966 12975 12975 0 Aug02 pts/1 00:00:00 bash

> $ ps -lf | head -2
> F S UID PID PPID %CPU PRI NI RSS WCHAN STARTED TIME COMMAND
> 80c08001 I 200 272363 301021 0.0 44 0 520K wait 20:47:46 0:00.09 -bash (bash)

Eeew. That's broken; it won't display right on a normal
80x24 terminal. Linux "ps -lf" will just barely fit.

> $ ps j | head -2
> USER PID PPID PGID SESS JOBC S TTY TIME COMMAND
> cmadams 272363 301021 272363 272363 0 I pts/1 0:00.09 -bash (bash)

For historic reasons, Linux has a whopping 5 columns
of PID info for this format:

PPID PID PGID SID TTY TPGID STAT UID TIME COMMAND
1 770 770 770 tty1 12893 S 1000 0:00 -bash
770 12893 12893 770 tty1 12893 S 1000 0:00 /bin/sh /usr/bin/X11/st

> I routinely have 1000+ processes running on this server (many of them
> short-lived things like POP checks, short CGIs, sendmail background
> delivery, etc.), so having a larger PID space is important (having the
> same PID reused within 15-30 seconds would be annoying).

Increase the limit on this server if you wish. Problem?
I only suggest 9999 as the default. (which would actually
be just enough for you)

> I would like to see Linux running on this server (or at least this class
> of server), and limiting the number of PIDs because of "ps" formatting
> is not the way to go.

It's not just "ps", and I'm not saying you couldn't
adjust your system for a higher limit. I do think that
the out-of-box default shouldn't screw up formatting.
If you need to go past 9999, then you'll want to tweak
a few other settings as well. Low process counts are
the common case, and shouldn't be hurt by the fact that
a few people wish to run a billion processes.

2002-08-09 08:44:27

by Helge Hafting

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

"Albert D. Cahalan" wrote:

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.
>

There is an alternative.
Use 32-bit PID's, but with an additional rule for wraparound.
Simply wrap the PID when
(nextPID > 2*number_of_processes && nextPID > 30000)
The latter one just to avoid wrapping at 10 when there are
5 processes.

This simple approach supports 32-bit PIDs for those
that need them, while "ps" and friends always looks nice
except for those who actually run large amounts of processes.
You won't get a very large PID unless you need to.

Helge Hafting

2002-08-09 09:08:41

by Alan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> Use 32-bit PID's, but with an additional rule for wraparound.
> Simply wrap the PID when
> (nextPID > 2*number_of_processes && nextPID > 30000)
> The latter one just to avoid wrapping at 10 when there are
> 5 processes.

Its much simpler to put max_pid into /proc/sys/kernel. That way we can
boot with 32000 process ids, which will ensure ancient stuff which has
16bit pid_t (old old libc) and also any old kernel interfaces which
expose it - does ipc ?

2002-08-09 09:22:16

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Alan Cox wrote:
>
> On Fri, 2002-08-09 at 09:48, Helge Hafting wrote:
> > Use 32-bit PID's, but with an additional rule for wraparound.
> > Simply wrap the PID when
> > (nextPID > 2*number_of_processes && nextPID > 30000)
> > The latter one just to avoid wrapping at 10 when there are
> > 5 processes.
>
> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ?

procfs oopses with >65535 processes, btw. Related to the i_ino
encoding:

#define fake_ino(pid,ino) (((pid)<<16)|(ino))
#define PROC_INODE_PROPER(inode) ((inode)->i_ino & ~0xffff)

it ruined Anton's evening ;)

2002-08-09 10:33:43

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, Aug 09, 2002 at 11:32:33AM +0100, Alan Cox wrote:

> Its much simpler to put max_pid into /proc/sys/kernel. That way we can
> boot with 32000 process ids, which will ensure ancient stuff which has
> 16bit pid_t (old old libc) and also any old kernel interfaces which
> expose it - does ipc ?

I don't think it is a good idea to add lots of almost meaningless knobs.
This is not something one would like to adapt dynamically.
Someone who needs 16bit pid_t can compile her own kernel with a
suitable value of PID_MAX.

There is no old old libc with 16bit pid_t, I think.
I have libc 4.4.1 here with
typedef int pid_t;
I have Linux 0.01 here with
typedef int pid_t;

Yes, ipc exposed pid in a 16-bit field, namely in the fields
msg_lspid, msg_lrpid, shm_cpid, shm_lpid.
Two years ago I found all occurrences in all Linux sources,
but I forgot the details of the result; I do not recall
significant occurrences. Maybe someone will repeat this big grep.

Andries

2002-08-09 12:56:49

by Chris Adams

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Once upon a time, Albert D. Cahalan <[email protected]> said:
> BTW, let's start with: not only "ps" is affected.
> Off the top of my head: netstat, fuser, top, pstree...

netstat: already wraps around and loses formatting when PIDs are shown
(so no changes)
fuser: already has enough columns (because it uses "kernel" some places)
(so no changes)
top: two line change
pstree: PIDs are displayed in "()" after process name (not formatted
columns) (so no changes)

How many people actually _use_ PIDs on a regular basis? When I did our
upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
anyone else even noticed. Almost always when I'm doing something to
processes, I'm scripting it, not typing numbers.

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,
> and not many people complained. It sure would be nice to
> gain back a few of the columns lost to other stuff, so
> that people could once again see command arguments.

We're talking one or two more columns. That's not going to magically
make it possible to see the command arguments. If someone wants to see
the arguments, they're going to have to use "w" anyway (also, "normal"
ps with no arguments still has lots of space).

> The two real-word usage examples close at hand:
>
> a. My full GNOME desktop has 48 processes.
>
> b. The main UNIX shell server for CS students
> at this university has 79 processes.
> (Tru64, several hundred CS students, 2:25 am)

My GNOME desktop has 72 processes, and I've only got Mozilla and a
couple of xterms running right now. My Tru64 server, with only 9 users
at 7:40 am, has 443 processes (during the business day we typically get
a lot more processes - we'll have 50-75 users logged in, lots of POP
checks running, and the usually at least once per day spam attack adding
a couple of hundred extra sendmail processes - that's when we hit 1000+
processes).

> The problem is screen space, pure and simple. If the
> default limit goes to over 1 billion, then "ps" output
> must wrap lines. There is no alternative, unless you
> think "System going down to reset PID numbers!" is OK.

Adding just one column (and going to 19 bits) allows 16 times as many
PIDs (or a sparse space 16 times as large).

> > $ ps -lf | head -2
> > F S UID PID PPID %CPU PRI NI RSS WCHAN STARTED TIME COMMAND
> > 80c08001 I 200 272363 301021 0.0 44 0 520K wait 20:47:46 0:00.09 -bash (bash)
>
> Eeew. That's broken; it won't display right on a normal
> 80x24 terminal. Linux "ps -lf" will just barely fit.

Why does every possible output format of ps have to fit each process on
one line? How often are these output formats used by normal people?

> > $ ps j | head -2
> > USER PID PPID PGID SESS JOBC S TTY TIME COMMAND
> > cmadams 272363 301021 272363 272363 0 I pts/1 0:00.09 -bash (bash)

Note: one reason this one wrapped (instead of chopping the command) is
that Tru64 ps automatically displays everything (like lots of "w"
options) when output is not a TTY.

> It's not just "ps", and I'm not saying you couldn't
> adjust your system for a higher limit. I do think that
> the out-of-box default shouldn't screw up formatting.
> If you need to go past 9999, then you'll want to tweak
> a few other settings as well. Low process counts are
> the common case, and shouldn't be hurt by the fact that
> a few people wish to run a billion processes.

I'm not talking about a billion processes. Also, are you going to make
all those other programs (well, "top" is the only one named above that
has to be changed) handle variable width PIDs now too? That argument
cuts both ways.

I think setting things based on formatting of some modes of a couple of
programs that not that many users actually use is not the right way to
go. We don't limit usernames to 8 characters anymore just because "ls"
truncates the name (not a kernel issue but the same concept). If a
larger PID space works better for PID allocation, then go with a larger
PID space.

--
Chris Adams <[email protected]>
Systems and Network Administrator - HiWAAY Internet Services
I don't speak for anybody but myself - that's enough trouble.

2002-08-09 13:55:32

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, 9 Aug 2002, Albert D. Cahalan wrote:

> I almost put 99999, but then I realized that that's silly.
> For years Linux had a hard limit of about 4000 processes,

That limit was removed some 2 years ago.

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-09 14:26:04

by Hubertus Franke

[permalink] [raw]
Subject: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches



Folks, below is the analysis that I promised yesterday.
Attached is also the harness program that brings this into userspace and
computes basic overhead for pid allocation in a random setting.
The tar file contains the following stuff and represents the
status last time I gave this consideration. I had posted it to
lkml but other than from Andrea I had not received any feedback
and dropped the issue.

total 52
4 -rw-rw-r-- 1 frankeh frankeh 141 Mar 28 11:25 Makefile
12 -rw-rw-r-- 1 frankeh frankeh 8306 Mar 22 16:59 res-2
16 -rw-rw-r-- 1 frankeh frankeh 13268 Mar 20 10:11 getpid1.c
16 -rw-rw-r-- 1 frankeh frankeh 15124 Mar 14 18:19 res-1
4 -rwxrw-r-- 1 frankeh frankeh 316 Mar 13 12:01 bm

getpid1 is the harness. bm is the batch driver.
res-1 and res-2 are two result files that each were mangled together from
the outputs of <bm> executed on different machines.
I attach res-1 here , which I posted earlier in March, so you can read through
it and draw your own conclusions with respect on where we should go with
this.
It might be worthwile to independenly redo the test and also include
Andrea's <getpid> there, allthough it resembles <algo-1>.

Volunteers ?

Note that based on the results I got, I still favor a version of the
mark and sweep that continues to go forward to find the next range
rather than always start from the beginning again.
I am not really sure whether Bill's version can be easily integrated.

The second part of the message (res-1) also experiments with a partial
maximum safe pid-range, i.e., once a range of 256 free pids has been
established we stop the mark and sweep. That looks very competitive
as well. I gives too simple improvements
(a) bitmask can be located on the stack
(b) we could potentially deal with 32-bit pid numbers as we can limit
the bitmask to partial space of the pid range.

-- Hubertus

----------------<previous post>-----------------------------------------------

I implemented an alternative version of getpid, that for large thread counts
( > 210000), provides "significantly" better performance as shown in attached
performance analysis. This is particulary viable for PID_MAX=32768.

-- Hubertus Franke <[email protected]>

---------------------------------------------------------------------------------

Currently the getpid algorithm works as follows:
At any given time an interval of [ last_pid .. next_safe ) is known to hold
unused pids. Initially the interval is set to [0 .. 32767]

Hence to allocate a new pid the following is sufficient:

if (++last_pid < next_safe) return last_pid;

However, if we move out of the safe interval, the next safe interval needs
to be established first.
This is currently done by a repetive search

repeat:
foralltasks(p) {
if (p uses lastpid) { last_pid++; goto repeat; }
/* narrow [ last_pid .. next_safe ) */
if (p->pids in [ last_pid .. next_safe ) ) next_safe = p->pid
}

Particulary for large number of tasks, this can lead to frequent exercise of
the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.

Instead I have provided an alternative mechanism that at the time
of determining the next interval marks a bitmask by walking the tasklist
once [ O(N) ] and then finding the proper bit offsets to mark the next free
interval starting from last_pid. The bitmap requires 4096 bytes.
This is <algo-1>.

An optimization to this to keep the last bitmap instead of clearing it
with every search. Only if we fail to obtain a free range, then we have
to go back and clear the bitmap and redo the search one more time.
This is <algo-2>.

I dragged the various algorithms into a userlevel test program to figure
out where the cut off points are with PID_MAX=32768. In this testprogram
I maintain A tasks, and for 10 rounds (delete D random tasks and
reallocate D tasks again) resulting in T=10*D total measured allocations.

Si states how many interval searches where needed for algo-i.
Gi states the average overhead per get_pid for algo-i in usecs.

Based on that one should use the current algorithm until ~ 22K tasks and
beyond that use algo-2. Only the last 15 tasks are a bit faster under algo-1.
We can safely ignore that case.

Based on that providing an adaptive implementation seems the right choice.
The patch for 2.5.7-pre1 is attached.


executed program example: getpid -c 2 -s 10 -e 100 -d 10 -t <0,1>
0 is old 1 is new algo 2.

A D T | S0 G0 | S1 G1 | S2 G2
----------------------------------------------------------------------------
10 10 80 | 1 0.34 | 1 0.59 | 1 0.81
20 10 100 | 1 0.30 | 1 0.49 | 1 0.64
30 10 100 | 1 0.29 | 1 0.55 | 1 0.65
40 10 100 | 1 0.35 | 1 0.51 | 1 0.65
50 10 100 | 1 0.35 | 1 0.54 | 1 0.67
60 10 100 | 2 0.38 | 21 1.95 | 2 0.79
70 10 100 | 1 0.39 | 1 0.59 | 1 0.76
80 10 100 | 1 0.41 | 1 0.62 | 1 0.76
100 50 500 | 2 0.22 | 63 1.26 | 2 0.30
150 50 500 | 3 0.24 | 12 0.56 | 4 0.36
200 50 500 | 3 0.27 | 56 2.26 | 5 0.46
250 50 500 | 2 0.26 | 119 5.63 | 6 0.54
300 50 500 | 3 0.32 | 148 8.73 | 9 0.76
350 50 500 | 5 0.45 | 168 11.51 | 6 0.76
400 50 500 | 4 0.44 | 90 7.28 | 10 1.10
450 50 500 | 6 0.61 | 143 13.08 | 7 0.97
500 50 500 | 6 0.65 | 100 10.47 | 7 1.06
550 50 500 | 5 0.63 | 71 8.10 | 9 1.34
600 50 500 | 7 0.86 | 115 14.32 | 14 2.04
650 50 500 | 8 1.00 | 112 15.08 | 13 2.07
700 50 500 | 8 1.06 | 127 18.12 | 10 1.79
750 50 500 | 8 1.26 | 62 9.73 | 15 2.73
800 50 500 | 11 1.68 | 92 15.14 | 12 2.42
850 50 500 | 14 2.03 | 78 13.73 | 13 2.67
900 50 500 | 21 3.17 | 102 18.74 | 27 5.18
1000 1000 9980 | 1 0.18 | 4 0.19 | 1 0.18
2000 1000 10000 | 76 1.22 | 3604 53.03 | 322 4.81
3000 1000 10000 | 161 3.84 | 4502 112.24 | 606 15.49
4000 1000 10000 | 359 11.17 | 4912 183.37 | 901 33.76
5000 1000 10000 | 539 23.33 | 4949 257.35 | 1165 59.91
6000 1000 10000 | 724 43.42 | 4918 349.59 | 1498 104.36
7000 1000 10000 | 1026 85.38 | 4886 447.58 | 1835 165.08
8000 1000 10000 | 1228 126.45 | 4870 565.29 | 2084 234.73
9000 1000 10000 | 1516 193.62 | 4826 699.85 | 2489 354.27
10000 1000 10000 | 1818 289.32 | 4910 843.32 | 2763 472.47
11000 1000 10000 | 2093 389.33 | 5005 1023.08 | 3095 629.70
12000 1000 10000 | 2305 506.23 | 5095 1194.71 | 3277 773.06
13000 1000 10000 | 2680 683.66 | 5289 1424.81 | 3711 1003.67
14000 1000 10000 | 2959 853.10 | 5358 1602.05 | 3878 1172.70
15000 1000 10000 | 3167 1037.79 | 5539 1835.64 | 4301 1436.40
16000 1000 10000 | 3466 1272.80 | 5669 2087.03 | 4485 1635.48
17000 1000 10000 | 3743 1539.06 | 5932 2338.50 | 4844 1924.27
18000 1000 10000 | 4069 1869.63 | 6097 2613.60 | 5218 2232.52
19000 1000 10000 | 4293 2183.98 | 6242 2866.34 | 5501 2519.60
20000 1000 10000 | 4616 2607.10 | 6508 3175.90 | 5770 2823.98
21000 1000 10000 | 4974 3119.34 | 6700 3460.95 | 6161 3183.73
22000 1000 10000 | 5177 3609.28 | 6944 3788.19 | 6389 3492.97 =
23000 1000 10000 | 5483 4214.03 | 7183 4136.25 | 6665 3823.38
24000 1000 10000 | 5838 4971.60 | 7404 4460.62 | 6982 4199.61
25000 1000 10000 | 6183 5880.92 | 7736 4891.80 | 7209 4546.18
26000 1000 10000 | 6413 6829.07 | 7890 5210.85 | 7533 4939.12
27000 1000 10000 | 6748 8132.96 | 8148 5598.19 | 7959 5442.25
28000 1000 10000 | 7139 10065.52 | 8445 6047.42 | 8140 5767.13
29000 1000 10000 | 7638 12967.20 | 8736 6475.23 | 8501 6250.86
30000 1000 10000 | 8178 16991.05 | 8994 6907.40 | 8911 6791.97
32000 50 500 | 482 26446.69 | 488 7405.63 | 487 7494.39
32050 50 500 | 488 34769.89 | 488 7463.11 | 486 7541.61
32100 50 500 | 489 44564.86 | 493 7593.99 | 486 7589.02
32150 50 500 | 486 58150.58 | 487 7549.96 | 492 7731.18
32200 50 500 | 490 64875.38 | 495 7721.82 | 497 7854.59
32250 50 500 | 491 81790.21 | 491 7697.57 | 490 7795.12
32300 50 500 | 489 88975.62 | 493 7763.04 | 495 7909.77
32350 50 500 | 489 115797.38 | 492 7782.34 | 495 7967.86
32400 50 500 | 490 120958.50 | 497 7898.45 | 496 8018.98
32450 50 500 | 492 147541.84 | 493 7874.27 | 492 7982.34
32500 50 500 | 493 175498.39 | 495 7940.18 | 495 8060.97
32550 50 500 | 492 207229.29 | 496 7973.88 | 498 8134.02
32600 50 500 | 495 267057.05 | 498 8028.86 | 498 8171.97
32650 50 500 | 492 375722.28 | 500 8088.30 | 498 8213.85
32700 50 500 | 497 528321.07 | 500 8110.51 | 499 8267.67
32751 1 10 | 10 259785.80 | 10 7851.50 | 10 8549.30
32752 1 10 | 10 1121285.60 | 10 7846.30 | 10 8556.10
32753 1 10 | 10 383729.50 | 10 7848.60 | 10 8562.20
32754 1 10 | 10 1061467.50 | 10 7849.80 | 10 8564.40
32755 1 10 | 10 612726.50 | 10 7853.00 | 10 8553.90
32756 1 10 | 10 1725559.90 | 10 7851.90 | 10 8553.00
32757 1 10 | 10 1679818.50 | 10 7847.80 | 10 8552.10
32758 1 10 | 10 2991838.60 | 10 7865.70 | 10 8557.20
32759 1 10 | 10 883388.90 | 10 7859.40 | 10 8562.00
32760 1 10 | 10 4830405.90 | 10 7850.50 | 10 9336.60
32761 1 10 | 10 7105809.20 | 10 7863.90 | 10 9337.20
32762 1 10 | 10 7919703.40 | 10 7867.10 | 10 9340.70
32763 1 10 | 10 1537522.50 | 10 7869.40 | 10 9340.70
32764 1 10 | 10 6173019.20 | 10 7866.60 | 10 9340.00
32765 1 10 | 10 8104105.00 | 10 7876.20 | 10 10112.80
32766 1 10 | 10 16145415.40 | 10 7880.80 | 10 10893.50
32767 1 10 | 10 16135267.10 | 10 7878.60 | 10 11674.40


Other variants are possible, for instance if 4096 bytes is too much
(hell I don't know how that could be), one can break it up into smaller
search chunks (e.g. 256 bytes).

Another alternative is to allocate the page on the first occasion of
getting into get_pid_my_map....

In the following I give a comparative result between algo-2 and
algo-2 with a max interval size of 256. The times are very comparative.
Also the search count values are identical, but in 2 cases suggesting
that a interval size particular for large thread counts of 256 is certainly
sufficient, but it brings some small overhead. Question to answer is
wether setting aside an extra page is such a crime.....

A D T | S2 G2 | S2-256 G2-256
-------------------------------------------------------
10 10 80 | 1 0.81 | 1 0.84
20 10 100 | 1 0.64 | 1 0.67
30 10 100 | 1 0.65 | 1 0.68
40 10 100 | 1 0.65 | 1 0.69
50 10 100 | 1 0.67 | 1 0.71
60 10 100 | 2 0.79 | 2 0.82
70 10 100 | 1 0.76 | 1 0.76
80 10 100 | 1 0.76 | 1 0.79
100 50 500 | 2 0.30 | 2 0.31
150 50 500 | 4 0.36 | 5 0.39 <=
200 50 500 | 5 0.46 | 5 0.46
250 50 500 | 6 0.54 | 6 0.55
300 50 500 | 9 0.76 | 9 0.76
350 50 500 | 6 0.76 | 6 0.75
400 50 500 | 10 1.10 | 10 1.10
450 50 500 | 7 0.97 | 7 0.97
500 50 500 | 7 1.06 | 7 1.06
550 50 500 | 9 1.34 | 9 1.35
600 50 500 | 14 2.04 | 14 2.06
650 50 500 | 13 2.07 | 13 2.09
700 50 500 | 10 1.79 | 10 1.82
750 50 500 | 15 2.73 | 15 2.69
800 50 500 | 12 2.42 | 12 2.38
850 50 500 | 13 2.67 | 13 2.66
900 50 500 | 27 5.18 | 27 5.25
1000 1000 9980 | 1 0.18 | 3 0.19 <=
2000 1000 10000 | 322 4.81 | 322 4.84
3000 1000 10000 | 606 15.49 | 606 15.55
4000 1000 10000 | 901 33.76 | 901 34.42
5000 1000 10000 | 1165 59.91 | 1165 62.35
6000 1000 10000 | 1498 104.36 | 1498 105.55
7000 1000 10000 | 1835 165.08 | 1835 174.82
8000 1000 10000 | 2084 234.73 | 2084 244.18
9000 1000 10000 | 2489 354.27 | 2489 372.11
10000 1000 10000 | 2763 472.47 | 2763 486.73
11000 1000 10000 | 3095 629.70 | 3095 648.31
12000 1000 10000 | 3277 773.06 | 3277 784.75
13000 1000 10000 | 3711 1003.67 | 3711 1006.94
14000 1000 10000 | 3878 1172.70 | 3878 1168.71
15000 1000 10000 | 4301 1436.40 | 4301 1429.89
16000 1000 10000 | 4485 1635.48 | 4485 1620.90
17000 1000 10000 | 4844 1924.27 | 4844 1904.92
18000 1000 10000 | 5218 2232.52 | 5218 2218.80
19000 1000 10000 | 5501 2519.60 | 5501 2508.83
20000 1000 10000 | 5770 2823.98 | 5770 2895.66
21000 1000 10000 | 6161 3183.73 | 6161 3307.54
22000 1000 10000 | 6389 3492.97 | 6389 3620.53
23000 1000 10000 | 6665 3823.38 | 6665 3995.63
24000 1000 10000 | 6982 4199.61 | 6982 4347.95
25000 1000 10000 | 7209 4546.18 | 7209 4701.95
26000 1000 10000 | 7533 4939.12 | 7533 5088.13
27000 1000 10000 | 7959 5442.25 | 7959 5599.85
28000 1000 10000 | 8140 5767.13 | 8140 5817.86
29000 1000 10000 | 8501 6250.86 | 8501 6250.30
30000 1000 10000 | 8911 6791.97 | 8911 6788.51
32000 50 500 | 487 7494.39 | 487 7493.47
32050 50 500 | 486 7541.61 | 486 7541.05
32100 50 500 | 486 7589.02 | 486 7586.12
32150 50 500 | 492 7731.18 | 492 7728.76
32200 50 500 | 497 7854.59 | 497 7854.94
32250 50 500 | 490 7795.12 | 490 7783.10
32300 50 500 | 495 7909.77 | 495 7902.70
32350 50 500 | 495 7967.86 | 495 7946.20
32400 50 500 | 496 8018.98 | 496 7999.34
32450 50 500 | 492 7982.34 | 492 7962.93
32500 50 500 | 495 8060.97 | 495 8048.18
32550 50 500 | 498 8134.02 | 498 8122.08
32600 50 500 | 498 8171.97 | 498 8169.34
32650 50 500 | 498 8213.85 | 498 8209.95
32700 50 500 | 499 8267.67 | 499 8266.13
32751 1 10 | 10 8549.30 | 10 8629.00
32752 1 10 | 10 8556.10 | 10 8636.30
32753 1 10 | 10 8562.20 | 10 8632.00
32754 1 10 | 10 8564.40 | 10 8633.40
32755 1 10 | 10 8553.90 | 10 8635.40
32756 1 10 | 10 8553.00 | 10 8637.60
32757 1 10 | 10 8552.10 | 10 8640.90
32758 1 10 | 10 8557.20 | 10 8644.90
32759 1 10 | 10 8562.00 | 10 8644.10
32760 1 10 | 10 9336.60 | 10 9436.10
32761 1 10 | 10 9337.20 | 10 9435.60
32762 1 10 | 10 9340.70 | 10 9439.10
32763 1 10 | 10 9340.70 | 10 9433.60
32764 1 10 | 10 9340.00 | 10 9440.60
32765 1 10 | 10 10112.80 | 10 10228.40
32766 1 10 | 10 10893.50 | 10 11023.50
32767 1 10 | 10 11674.40 | 10 11813.70


Attachments:
gp.tar.gz (10.37 kB)

2002-08-09 14:36:15

by Albert D. Cahalan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Chris Adams writes:
> Once upon a time, Albert D. Cahalan <[email protected]> said:

>> BTW, let's start with: not only "ps" is affected.
>> Off the top of my head: netstat, fuser, top, pstree...
>
> netstat: already wraps around and loses formatting when PIDs are shown
> (so no changes)

Sure, but no need to make it worse.

> fuser: already has enough columns (because it uses "kernel" some places)
> (so no changes)
> top: two line change

Still this is screwing the common case for the truly unusual.

> pstree: PIDs are displayed in "()" after process name (not formatted
> columns) (so no changes)
>
> How many people actually _use_ PIDs on a regular basis? When I did our
> upgrade from Tru64 4.x to 5.x and got bigger PIDs, it took awhile before
> anyone else even noticed. Almost always when I'm doing something to
> processes, I'm scripting it, not typing numbers.

I'm not counting Aunt Tillie.

Anybody on a terminal without cut-and-paste will suffer.

>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>> and not many people complained. It sure would be nice to
>> gain back a few of the columns lost to other stuff, so
>> that people could once again see command arguments.
>
> We're talking one or two more columns.

It's as many as 25 character-columns. Linus was using
0x3fffffff as a mask. There are 5 field-columns with
PID data in the "ps j" format.

> That's not going to magically
> make it possible to see the command arguments. If someone wants to see
> the arguments, they're going to have to use "w" anyway (also, "normal"
> ps with no arguments still has lots of space).

Huh? Try "ps -jf", which BTW has 4 PID columns. Also try "ps j".
I'm seeing arguments, and won't if PIDs get too wide.

>> The two real-word usage examples close at hand:
>>
>> a. My full GNOME desktop has 48 processes.
>>
>> b. The main UNIX shell server for CS students
>> at this university has 79 processes.
>> (Tru64, several hundred CS students, 2:25 am)
>
> My GNOME desktop has 72 processes, and I've only got Mozilla and a
> couple of xterms running right now.

Same as me, but I run Debian. Still, 72 is well below 9999.

> My Tru64 server, with only 9 users
> at 7:40 am, has 443 processes (during the business day we typically get
> a lot more processes - we'll have 50-75 users logged in, lots of POP
> checks running, and the usually at least once per day spam attack adding
> a couple of hundred extra sendmail processes - that's when we hit 1000+
> processes).

While I think this would fit under 9999, you're welcome
to adjust your system for any limit you like. I just don't
think the default should cater to systems that are both
rare and likely to have a real paid admin. It's your job to
increase the limit if you feel a need for more PID space.

>> The problem is screen space, pure and simple. If the
>> default limit goes to over 1 billion, then "ps" output
>> must wrap lines. There is no alternative, unless you
>> think "System going down to reset PID numbers!" is OK.
>
> Adding just one column (and going to 19 bits) allows 16
> times as many PIDs (or a sparse space 16 times as large).

Yes, and it trashes many common formats.

>>> $ ps -lf | head -2
>>> F S UID PID PPID %CPU PRI NI RSS WCHAN STARTED TIME COMMAND
>>> 80c08001 I 200 272363 301021 0.0 44 0 520K wait 20:47:46 0:00.09 -bash (bash)
>>
>> Eeew. That's broken; it won't display right on a normal
>> 80x24 terminal. Linux "ps -lf" will just barely fit.
>
> Why does every possible output format of ps have to fit each process on
> one line? How often are these output formats used by normal people?

How often are thousands of PIDs used by normal people?

>>> $ ps j | head -2
>>> USER PID PPID PGID SESS JOBC S TTY TIME COMMAND
>>> cmadams 272363 301021 272363 272363 0 I pts/1 0:00.09 -bash (bash)
>
> Note: one reason this one wrapped (instead of chopping the command) is
> that Tru64 ps automatically displays everything (like lots of "w"
> options) when output is not a TTY.

It's either that or assume 80, unless you have a way to
get info from the controlling tty. I haven't looked into
that yet.

>> It's not just "ps", and I'm not saying you couldn't
>> adjust your system for a higher limit. I do think that
>> the out-of-box default shouldn't screw up formatting.
>> If you need to go past 9999, then you'll want to tweak
>> a few other settings as well. Low process counts are
>> the common case, and shouldn't be hurt by the fact that
>> a few people wish to run a billion processes.
>
> I'm not talking about a billion processes.

That's what Linus was testing with.

> Also, are you going to make
> all those other programs (well, "top" is the only one named above that
> has to be changed) handle variable width PIDs now too? That argument
> cuts both ways.

Perhaps. You, with the odd case, can do without those programs.

> I think setting things based on formatting of some modes of a couple of
> programs that not that many users actually use is not the right way to
> go. We don't limit usernames to 8 characters anymore just because "ls"
> truncates the name (not a kernel issue but the same concept). If a
> larger PID space works better for PID allocation, then go with a larger
> PID space.

Arrrgh! That's a bug in ls, and yes you should limit usernames.
Think about:

Albert_Roberts
Albert_Robertson
Albert_Robbens

I think this is a much more popular problem than PID issues.
Lots of people with an NT background don't see why a long
username might cause problems. (names may be assigned by
corporate policy, and Linux admins must adapt) Maybe I could
do something about this if the default PID limit was 9999.

2002-08-09 14:54:00

by Albert D. Cahalan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Rik van Riel writes:
> On Fri, 9 Aug 2002, Albert D. Cahalan wrote:
>
>> I almost put 99999, but then I realized that that's silly.
>> For years Linux had a hard limit of about 4000 processes,
>
> That limit was removed some 2 years ago.

Sure. Linux is 11 years old now. Also remember that that
was the _hard_ limit. We had a soft limit near 1000,
with a kernel recompile needed to increase it. Almost
everybody was satisfied with this.

Personally I'd change my adjustable PID limit to 999,
but I wouldn't suggest that as a default. 9999 should
cover almost all systems.

Self-adjusting could be nice. Then everybody starts
with 9, and gains a digit whenever space is 50% full.

2002-08-09 15:32:49

by Andries Brouwer

[permalink] [raw]
Subject: Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches

On Fri, Aug 09, 2002 at 07:22:08AM -0400, Hubertus Franke wrote:

> Particulary for large number of tasks, this can lead to frequent exercise of
> the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.

Your math is flawed. The O(N^2) happens only when the name space for pid's
has the same order of magnitude as the number N of processes.
Now consider N=100000 with 31-bit name space. In a series of
2.10^9 forks you have to do the loop fewer than N times and
N^2 / 2.10^9 = 5. You see that on average for each fork there
are 5 comparisons.
For N=1000000 you rearrange the task list as I described yesterday
so that each loop takes time sqrt(N), and altogether N.sqrt(N)
comparisons are needed in a series of 2.10^9 forks.
That is 0.5 comparisons per fork.

You see that thanks to the large pid space things get really
efficient. Ugly constructions are only needed when a large fraction
of all possible pids is actually in use, or when you need hard
real time guarantees.

Andries

2002-08-09 16:00:24

by Linus Torvalds

[permalink] [raw]
Subject: Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches


On Fri, 9 Aug 2002, Hubertus Franke wrote:
>
> I dragged the various algorithms into a userlevel test program to figure
> out where the cut off points are with PID_MAX=32768. In this testprogram
> I maintain A tasks, and for 10 rounds (delete D random tasks and
> reallocate D tasks again) resulting in T=10*D total measured allocations.

Mind re-doing that with PID_MAX=999999 or similar? The whole point of the
current simple algorithm is that the common case (nay, done right, the
_only_ case) is where the number of threads << PID_MAX.

That certainly used to be true with PID_MAX=32768 (not many people may
realize it, but in 1991 the maximum number of tasks in the system was
limited to 63, simply because of how the VM carved out the 4GB address
space).

Things have changed, but considering that some people think that 32k
threads are a limitation already, and that the current code should work
fine (and be pretty much optimal) with a larger PID_MAX, I really think
it's unfair to not even benchmark that case..

Linus

2002-08-09 18:12:48

by Hubertus Franke

[permalink] [raw]
Subject: Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches

On Friday 09 August 2002 11:36 am, Andries Brouwer wrote:

!!!!!!!!!!! You are in a different space !!!!!!!!
All work was done under the assumption of 16-bit pid_t.
I stated yesterday already that for NumTasks substantially smaller
than the pid_t supported size, this won't be a problem
as your analysis states and my data also states.

You have two choices
(a) move Linux up to 32-bit pid_t
(b) stick within the current 16-bit discussion.

> On Fri, Aug 09, 2002 at 07:22:08AM -0400, Hubertus Franke wrote:
> > Particulary for large number of tasks, this can lead to frequent exercise
> > of the repeat resulting in a O(N^2) algorithm. We call this : <algo-0>.
>
> Your math is flawed. The O(N^2) happens only when the name space for pid's
> has the same order of magnitude as the number N of processes.
> Now consider N=100000 with 31-bit name space. In a series of
> 2.10^9 forks you have to do the loop fewer than N times and
> N^2 / 2.10^9 = 5. You see that on average for each fork there
> are 5 comparisons.
> For N=1000000 you rearrange the task list as I described yesterday
> so that each loop takes time sqrt(N), and altogether N.sqrt(N)
> comparisons are needed in a series of 2.10^9 forks.
> That is 0.5 comparisons per fork.
>
> You see that thanks to the large pid space things get really
> efficient. Ugly constructions are only needed when a large fraction
> of all possible pids is actually in use, or when you need hard
> real time guarantees.
>
> Andries


--
-- Hubertus Franke ([email protected])

2002-08-09 18:15:27

by Hubertus Franke

[permalink] [raw]
Subject: Re: Analysis for Linux-2.5 fix/improve get_pid(), comparing various approaches

On Friday 09 August 2002 12:05 pm, Linus Torvalds wrote:
> On Fri, 9 Aug 2002, Hubertus Franke wrote:
> > I dragged the various algorithms into a userlevel test program to figure
> > out where the cut off points are with PID_MAX=32768. In this testprogram
> > I maintain A tasks, and for 10 rounds (delete D random tasks and
> > reallocate D tasks again) resulting in T=10*D total measured allocations.
>
> Mind re-doing that with PID_MAX=999999 or similar? The whole point of the
> current simple algorithm is that the common case (nay, done right, the
> _only_ case) is where the number of threads << PID_MAX.
>

Don't have time right now...
Simply look at the numbers for the ratio you are expected.
I would be very surprise if the relative curves would change
when moving to 132K tasks and also populate the pid space only by
let's say 25%.

Otherwise, Paul can you run this....

> That certainly used to be true with PID_MAX=32768 (not many people may
> realize it, but in 1991 the maximum number of tasks in the system was
> limited to 63, simply because of how the VM carved out the 4GB address
> space).
>
> Things have changed, but considering that some people think that 32k
> threads are a limitation already, and that the current code should work
> fine (and be pretty much optimal) with a larger PID_MAX, I really think
> it's unfair to not even benchmark that case..
>
> Linus

--
-- Hubertus Franke ([email protected])

2002-08-09 19:37:10

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

I suspect that it would actually require more than just this. I tried
this with the same test I've been using and had several failed attepmts
at low numbers by getting wierd unexpected signals (like 28), and then
one that ran for a much longer time and produced an oops with random
garbage to the console (trying to extract this now).

-Paul Larson

On Thu, 2002-08-08 at 17:02, Linus Torvalds wrote:
> ----
> --- 1.2/include/linux/threads.h Tue Feb 5 07:23:04 2002
> +++ edited/include/linux/threads.h Thu Aug 8 14:58:28 2002
> @@ -19,6 +19,7 @@
> /*
> * This controls the maximum pid allocated to a process
> */
> -#define PID_MAX 0x8000
> +#define PID_MASK 0x3fffffff
> +#define PID_MAX (PID_MASK+1)
>
> #endif
> ===== kernel/fork.c 1.57 vs edited =====
> --- 1.57/kernel/fork.c Tue Jul 30 15:49:20 2002
> +++ edited/kernel/fork.c Thu Aug 8 15:00:16 2002
> @@ -142,7 +142,7 @@
> return 0;
>
> spin_lock(&lastpid_lock);
> - if((++last_pid) & 0xffff8000) {
> + if((++last_pid) & ~PID_MASK) {
> last_pid = 300; /* Skip daemons etc. */
> goto inside;
> }
> @@ -157,7 +157,7 @@
> p->tgid == last_pid ||
> p->session == last_pid) {
> if(++last_pid >= next_safe) {
> - if(last_pid & 0xffff8000)
> + if(last_pid & ~PID_MASK)
> last_pid = 300;
> next_safe = PID_MAX;
> }
>
>


2002-08-09 20:15:40

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, 2002-08-09 at 14:34, Paul Larson wrote:
> I suspect that it would actually require more than just this. I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).
>
> -Paul Larson
Here's the ksymoops output minus the unprintable chars that got sent
after it:

Unable to handle kernel paging request at virtual address 32313256
c015cb38
*pde = 37184001
Oops: 0000
CPU: 4
0010:[<c015cb38>] Not tainted
Using defaults from ksymoops -t elf32-i386 -a i386
EFLAGS: 00010202
eax: d9768090 ebx: d9768090 ecx: 00000020 edx: 32313232
esi: c015cb10 edi: d9768090 ebp: 00000008 esp: f6cb9e34
ds: 0018 es: 0018 ss: 0018
Stack: c0151403 d9768090 d9768090 f6dabaa0 c01515ad d9768090 f6dabab8
c014f49b
d9768090 f6dabaa0 c0139fa3 00000020 00000001 000001d0 c013a21a c013a170
000001f8 f6cb9e7c 00000020 00000001 000001d0 000041e6 c014f8f0 00000010
Call Trace: [<c0151403>] [<c01515ad>] [<c014f49b>] [<c0139fa3>]
[<c013a21a>]
[<c013a170>] [<c014f8f0>] [<c0131f9e>] [<c0131fec>] [<c0132c64>]
[<c0132fa2>]
[<c0133010>] [<c0116717>] [<c0117186>] [<c011793e>] [<c0105bc5>]
[<c0107173>]
Code: 8b 42 24 85 c0 74 0b f0 ff 48 10 8b 42 24 83 48 14 08 52 e8

>>EIP; c015cb38 <proc_delete_inode+28/50> <=====
Trace; c0151403 <generic_delete_inode+83/b0>
Trace; c01515ad <iput+5d/60>
Trace; c014f49b <prune_dcache+eb/180>
Trace; c0139fa3 <pdflush_operation+83/90>
Trace; c013a21a <wakeup_bdflush+1a/20>
Trace; c013a170 <background_writeout+0/90>
Trace; c014f8f0 <shrink_dcache_memory+20/40>
Trace; c0131f9e <shrink_caches+7e/a0>
Trace; c0131fec <try_to_free_pages+2c/50>
Trace; c0132c64 <balance_classzone+44/1f0>
Trace; c0132fa2 <__alloc_pages+192/1f0>
Trace; c0133010 <__get_free_pages+10/20>
Trace; c0116717 <dup_task_struct+17/70>
Trace; c0117186 <copy_process+56/7f0>
Trace; c011793e <do_fork+1e/a0>
Trace; c0105bc5 <sys_fork+15/30>
Trace; c0107173 <syscall_call+7/b>
Code; c015cb38 <proc_delete_inode+28/50>
00000000 <_EIP>:
Code; c015cb38 <proc_delete_inode+28/50> <=====
0: 8b 42 24 mov 0x24(%edx),%eax <=====
Code; c015cb3b <proc_delete_inode+2b/50>
3: 85 c0 test %eax,%eax
Code; c015cb3d <proc_delete_inode+2d/50>
5: 74 0b je 12 <_EIP+0x12> c015cb4a
<proc_delete_inode+3a/50>
Code; c015cb3f <proc_delete_inode+2f/50>
7: f0 ff 48 10 lock decl 0x10(%eax)
Code; c015cb43 <proc_delete_inode+33/50>
b: 8b 42 24 mov 0x24(%edx),%eax
Code; c015cb46 <proc_delete_inode+36/50>
e: 83 48 14 08 orl $0x8,0x14(%eax)
Code; c015cb4a <proc_delete_inode+3a/50>
12: 52 push %edx
Code; c015cb4b <proc_delete_inode+3b/50>
13: e8 00 00 00 00 call 18 <_EIP+0x18> c015cb50
<proc_delete_inode+40/50>

2002-08-09 20:36:33

by Andries Brouwer

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, Aug 09, 2002 at 02:34:17PM -0500, Paul Larson wrote:

> I suspect that it would actually require more than just this. I tried
> this with the same test I've been using and had several failed attepmts
> at low numbers by getting wierd unexpected signals (like 28), and then
> one that ran for a much longer time and produced an oops with random
> garbage to the console (trying to extract this now).

Not much more. Around 2.3.40 I have run with a large PID_MAX for a long time.
The patch that I submitted is still visible on the net various places
(I just tried Andries pid_max and found a few).

At that time the only other change (other than <linux/threads.h> and
kernel/fork.c) was in proc/base.c, namely

-#define fake_ino(pid,ino) (((pid)<<16)|(ino))
+#define fake_ino(pid,ino) (((1)<<16)|(ino))

and

- if (!pid)
- goto out;
- if (pid & 0xffff0000)
+ if (pid <= 0 || pid >= PID_MAX)
goto out;

(plucked from google output).

I have not checked precisely what change is appropriate in procfs today.


Andries

2002-08-09 21:19:04

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On 9 Aug 2002, Paul Larson wrote:
>
> I suspect that it would actually require more than just this.

There are various /proc paths, Andries had a full patch at some point a
long time ago.

My point was not so much that this is sufficient, but that the approach is
valid. I'd rather go with the simple approach (that solves two problems)
than with a much more complex approach (that solves only one).

And yes, I doubt I actually want to give all 30 bits to pid space in the
long run (the per-node pid space argument is still valid), but it's better
to start with 30 bits and actively trying to break things and then phasing
it down later than to be timid and never even see the stuff that breaks.

Linus

2002-08-09 21:39:34

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()



On Fri, 9 Aug 2002, Linus Torvalds wrote:
>
> There are various /proc paths, Andries had a full patch at some point a
> long time ago.

Hmm.. Giving them a quick glance-over, the /proc issues look like they
shouldn't be there in 2.5.x anyway, since the inode number really is
largely just a random number in 2.5 and all the real information is
squirelled away at path open time.

There's certainly a possibility for some cleanups, though.

Linus

2002-08-09 21:48:42

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:

> Hmm.. Giving them a quick glance-over, the /proc issues look like they
> shouldn't be there in 2.5.x anyway, since the inode number really is
> largely just a random number in 2.5 and all the real information is
> squirelled away at path open time.
>
> There's certainly a possibility for some cleanups, though.
So for now then, should I dig out my original (minimal) patch that
*just* fixed the "loop forever even if we're out of pids" problem? Even
if we increase PID_MAX to something obscenely high, I think we should
still be checking for this.

-Paul Larson

2002-08-09 21:59:39

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On 9 Aug 2002, Paul Larson wrote:
> On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
> > Hmm.. Giving them a quick glance-over, the /proc issues look like they
> > shouldn't be there in 2.5.x anyway, since the inode number really is
> > largely just a random number in 2.5 and all the real information is
> > squirelled away at path open time.

It looks like the biggest impact on /proc would be that the /proc/<pid>
inodes wouldn't be 10%% unique any more, which in turn means that an
old-style /bin/pwd that actually walks the tree backwards and checks the
inode number would occasionally fail.

That in turn makes me suspect that we'd better off just biting the bullet
and makign the inode numbers almost completely static, and forcing that
particular failure mode early rather than hit it randomly due to unlucky
timing.

Doing a simple strace shows that all the systems I have regular access to
use the "getcwd()" system call anyway, which gets this right on /proc (and
other filesystems that do not guarantee unique inode numbers)

> So for now then, should I dig out my original (minimal) patch that
> *just* fixed the "loop forever even if we're out of pids" problem? Even
> if we increase PID_MAX to something obscenely high, I think we should
> still be checking for this.

Ehh, considering that especially with a 30-bit pid, there's _no_ way we'd
run out without some other serious problems hitting us long before (out of
memory being the obvious one), I don't think even that is an issue.

With a minimum of 8kB / pid for process overhead, you need to have at
least 43 bits of physical address space completely populated just to cover
the memory uses of that many pid's.

Linus

2002-08-10 17:21:11

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Linus Torvalds wrote:
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Oh dear -- what of programs that assume duplicate inode numbers are hard
links, and therefore assume the same contents will be found in each
duplicate?

-- Jamie

2002-08-10 18:28:39

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()


On Sat, 10 Aug 2002, Jamie Lokier wrote:
>
> Linus Torvalds wrote:
> > Doing a simple strace shows that all the systems I have regular access to
> > use the "getcwd()" system call anyway, which gets this right on /proc (and
> > other filesystems that do not guarantee unique inode numbers)
>
> Oh dear -- what of programs that assume duplicate inode numbers are hard
> links, and therefore assume the same contents will be found in each
> duplicate?

Well, anybody who tries to back up /proc with "tar" is in for some
surprises anyway ;)

Linus

2002-08-10 18:45:27

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Linus Torvalds wrote:
> > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > links, and therefore assume the same contents will be found in each
> > duplicate?
>
> Well, anybody who tries to back up /proc with "tar" is in for some
> surprises anyway ;)

I was thinking of an over-intelligent `find'. But hey, as long as this
is only for the weird and wonderful /proc :-)

-- Jamie

2002-08-11 19:16:40

by Alan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Sat, 2002-08-10 at 19:48, Jamie Lokier wrote:
> Linus Torvalds wrote:
> > > Oh dear -- what of programs that assume duplicate inode numbers are hard
> > > links, and therefore assume the same contents will be found in each
> > > duplicate?
> >
> > Well, anybody who tries to back up /proc with "tar" is in for some
> > surprises anyway ;)
>
> I was thinking of an over-intelligent `find'. But hey, as long as this
> is only for the weird and wonderful /proc :-)

If they hold both handles open and stat them and find the same inode
number then yes its a bug. We have lots of room for inode numbers to
handle 2^30 processes and allow for 2^30 other files

2002-08-11 19:57:09

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Alan Cox wrote:
> If they hold both handles open and stat them and find the same inode
> number then yes its a bug. We have lots of room for inode numbers to
> handle 2^30 processes and allow for 2^30 other files

So, in general, the way to detect hard links requires both objects to be
open at the same time? I was sure it was enough to stat(), and check
(st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).

Admittedly, one of the object could be renamed or deleted in that time
so it's not 100% reliable on changing filesystems.

Ah well.

-- Jamie

2002-08-11 19:59:12

by Alan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Sun, 2002-08-11 at 20:58, Jamie Lokier wrote:
> Alan Cox wrote:
> > If they hold both handles open and stat them and find the same inode
> > number then yes its a bug. We have lots of room for inode numbers to
> > handle 2^30 processes and allow for 2^30 other files
>
> So, in general, the way to detect hard links requires both objects to be
> open at the same time? I was sure it was enough to stat(), and check
> (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
>
> Admittedly, one of the object could be renamed or deleted in that time
> so it's not 100% reliable on changing filesystems.

Hence you need both open at the same time.

2002-08-11 20:08:32

by Jamie Lokier

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Alan Cox wrote:
> > So, in general, the way to detect hard links requires both objects to be
> > open at the same time? I was sure it was enough to stat(), and check
> > (st1.st_ino == st2.st_ino && st1.st_dev == st2.st_dev).
> >
> > Admittedly, one of the object could be renamed or deleted in that time
> > so it's not 100% reliable on changing filesystems.
>
> Hence you need both open at the same time.

... unless you know that what you're looking at isn't changing, or you
only guarantee correct results for parts of the filesystem which don't
change (`tar', `find' etc.)

Again, /proc is exempt! :)

-- Jamie

2002-08-12 08:52:39

by Padraig Brady

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Linus Torvalds wrote:
> On 9 Aug 2002, Paul Larson wrote:
>
>>On Fri, 2002-08-09 at 16:42, Linus Torvalds wrote:
>>
>>>Hmm.. Giving them a quick glance-over, the /proc issues look like they
>>>shouldn't be there in 2.5.x anyway, since the inode number really is
>>>largely just a random number in 2.5 and all the real information is
>>>squirelled away at path open time.
>>
>
> It looks like the biggest impact on /proc would be that the /proc/<pid>
> inodes wouldn't be 10%% unique any more, which in turn means that an
> old-style /bin/pwd that actually walks the tree backwards and checks the
> inode number would occasionally fail.
>
> That in turn makes me suspect that we'd better off just biting the bullet
> and makign the inode numbers almost completely static, and forcing that
> particular failure mode early rather than hit it randomly due to unlucky
> timing.
>
> Doing a simple strace shows that all the systems I have regular access to
> use the "getcwd()" system call anyway, which gets this right on /proc (and
> other filesystems that do not guarantee unique inode numbers)

Anyone care to clarify which filesystems don't
have unique inode numbers. I always thought you
could uniquely identify any file using a device,inode
tuple? Fair enough proc is "special" but can/should
you not assume unique inodes within all other filesystems?

Also why can't you allocate a unique number in /proc?

thanks,
P?draig.

2002-08-12 09:12:33

by Alan

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
> Anyone care to clarify which filesystems don't
> have unique inode numbers. I always thought you
> could uniquely identify any file using a device,inode
> tuple? Fair enough proc is "special" but can/should
> you not assume unique inodes within all other filesystems?

2.4 functions correctly here in all the stuff I've looked at. I can't
speak for 2.5. In the 2.4 case you must be holding the files open during
the comparison. Some file systems like MSDOS invent inodes as they go,
never issuing the same one to two objects at the same time but happily
reissuing different inode numbers the next time around.

That breaks NFS but not much else


2002-08-12 09:17:27

by Padraig Brady

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Alan Cox wrote:
> On Mon, 2002-08-12 at 09:56, Padraig Brady wrote:
>
>>Anyone care to clarify which filesystems don't
>>have unique inode numbers. I always thought you
>>could uniquely identify any file using a device,inode
>>tuple? Fair enough proc is "special" but can/should
>>you not assume unique inodes within all other filesystems?
>
>
> 2.4 functions correctly here in all the stuff I've looked at.

cool.

> I can't speak for 2.5. In the 2.4 case you must be holding the files open
> during the comparison. Some file systems like MSDOS invent inodes as
> they go, never issuing the same one to two objects at the same time but
> happily reissuing different inode numbers the next time around.

Sure, no hardlinks so who cares what the number is, as long
as it's unique.

> That breaks NFS but not much else

hmm..

thanks,
P?draig.

2002-08-12 14:44:27

by Paul Larson

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

Dave McCracken had a patch a while back that should probably be
considered with all this too.
http://marc.theaimsgroup.com/?l=linux-kernel&m=100506843702466&w=2

This fixes the calculation of the max number of tasks so that it doesn't
consider highmem since it can't use use that anyway.

-Paul Larson

2002-08-12 16:02:13

by Jim Houston

[permalink] [raw]
Subject: Re: [PATCH] Linux-2.5 fix/improve get_pid()

/*
* Test program for id.c
*/

#include <stdlib.h>
#include "id.h"

struct id my_id;

main()
{
int i, n;
void *v;

id_init(&my_id, 10000);

random_test();
}

#define LIST_SZ 50000

struct id_list {
int id;
int ptr;
} id_list[LIST_SZ];

int list_cnt;
int new_cnt;
int max = 5000;
int count = 10000000;

long long avr_depth;

#define rdtsc(low,high) \
__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

static inline unsigned long get_tsc(void)
{
register unsigned long eax, edx;

rdtsc(eax,edx);
return(eax);
}



random_test()
{
int n, i;
void *v;
unsigned long t1, t2, t3, t4;

for (n = 0; n < count; n++) {
/*
* favor insertion so we will tend to run will max
* id's active.
*/
if (list_cnt && (list_cnt > max || rand() < (RAND_MAX/4))) {
i = rand() % list_cnt;
v = id_lookup(&my_id, id_list[i].id);
if ((int)v != id_list[i].ptr) {
printf("list_cnt=%d, i=%d\n", list_cnt, i);
printf("failed id=%d, expected %d got %d\n",
id_list[i].id, id_list[i].ptr, (int)v);
} else {
#if 0
printf("rm id=%d, ptr=%d\n",
id_list[i].id, id_list[i].ptr);
#endif
id_remove(&my_id, id_list[i].id);
}
id_list[i] = id_list[--list_cnt];
} else {
new_cnt++;
id_list[list_cnt].id = id_new(&my_id, (void *)new_cnt);
id_list[list_cnt].ptr = new_cnt;
#if 0
printf("ins id=%d, ptr=%d\n",
id_list[list_cnt].id, id_list[list_cnt].ptr);
#endif
list_cnt++;
avr_depth += list_cnt;
}
}
t1 = get_tsc();
id_lookup(&my_id, id_list[0].id);
t2 = get_tsc();
n = id_new(&my_id, (void *)++new_cnt);
t3 = get_tsc();
id_remove(&my_id, n);
t4 = get_tsc();

for (i = 0; i < list_cnt; i++) {
v = id_lookup(&my_id, id_list[i].id);
if ((int)v != id_list[i].ptr) {
printf("failed id=%d, expected %d got %d\n",
id_list[i].id, id_list[i], (int)v);
}
id_remove(&my_id, id_list[i].id);
}
printf("my_id.count=%d\n", my_id.count);
printf("new_cnt=%d\n", new_cnt);
printf("avr_depth = %d\n", (int)(avr_depth/new_cnt));
printf("id_lookup took %d cycles\n", t2-t1);
printf("id_new took %d cycles\n", t3-t2);
printf("id_remove took %d cycles\n", t4-t3);
}


Attachments:
id.c (2.76 kB)
id.h (1.26 kB)
id_test.c (2.08 kB)
Download all attachments