Date: Sat, 27 Apr 2002 02:25:20 +0200
From: Andrea Arcangeli <andrea@suse.de>
To: Hubertus Franke <frankeh@watson.ibm.com>
Cc: Marcelo Tosatti <marcelo@conectiva.com.br>, linux-kernel@vger.kernel.org,
        Ihno Krumreich <ihno@suse.de>, Linus Torvalds <torvalds@transmeta.com>
Subject: Re: get_pid fixes against 2.4.19pre7
Message-ID: <20020427022520.S19278@dualathlon.random>
In-Reply-To: <20020426134409.C19278@dualathlon.random> <20020426141541.096F13FE06@smtp.linux.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.3.22.1i
Sender: linux-kernel-owner@vger.kernel.org

On Fri, Apr 26, 2002 at 09:16:46AM -0400, Hubertus Franke wrote:
> On Friday 26 April 2002 07:44 am, Andrea Arcangeli wrote:
> > Hello,
> >
> > Could you have a look at these get_pid fixes? Besides the deadlocking
> > while running out of pids and reducing from quadratic to linear the
> > complexity of get_pids, it also addresses a longstanding non trivial
> > race present in 2.2 too that can lead to pid collisions even on UP
> > (noticed today while merging two more fixes from Ihno on the other
> > part). the fix reduces a bit scalability of simultaneous forks from
> > different cpus, but it's obviously right at least. Putting non-ready
> > tasks into the tasklists asks for troubles (signals...).  For more
> > details see the comment at the end of the patch. If you've suggestions
> > they're welcome, thanks.
> >
> >
> 
> Andrea, is this a consolidation of some of the work that I posted earlier 
> with the following patch for 2.4.17 and some of the discussions that followed
> from that. 
> 
> At least from my prospect I ran some test and recognized that the
> break even point between the two algorithms was about 22K pids.
> 
> Hence, I switched between the two algorithms at that break even point.

Nice to hear.

> 
> If you want I can send you the test program and you can measure
> for yourself how your stuff is doing. The test program uses the
> algo in user space and hence you can get some quick and decent answers.
> Let me know if you like it.

Thanks, it is welcome. If you've time it would also be nice if you could
review and/or test the patch with your proggy so that we can get those
fixes included in mainline, the more eyes on it the better :).

Latest version with the __set_bit optimization from Russell is appended:

diff -urN 2.4.19pre7/include/linux/threads.h getpid/include/linux/threads.h
--- 2.4.19pre7/include/linux/threads.h	Thu Apr 18 07:51:30 2002
+++ getpid/include/linux/threads.h	Sat Apr 27 00:47:59 2002
@@ -19,6 +19,6 @@
 /*
  * This controls the maximum pid allocated to a process
  */
-#define PID_MAX 0x8000
+#define PID_NR 0x8000
 
 #endif
diff -urN 2.4.19pre7/kernel/fork.c getpid/kernel/fork.c
--- 2.4.19pre7/kernel/fork.c	Tue Apr 16 08:12:09 2002
+++ getpid/kernel/fork.c	Sat Apr 27 00:48:15 2002
@@ -37,6 +37,12 @@
 
 struct task_struct *pidhash[PIDHASH_SZ];
 
+/*
+ * Protectes next_unsafe, last_pid and it avoids races
+ * between get_pid and SET_LINKS().
+ */
+static DECLARE_MUTEX(getpid_mutex);
+
 void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
 {
 	unsigned long flags;
@@ -79,51 +85,105 @@
 	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 }
 
-/* Protects next_safe and last_pid. */
-spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
-
+/*
+ *	Get the next free pid for a new process/thread.
+ *
+ *	Strategy: last_pid and next_unsafe (excluded) are an interval where all pids
+ *		  are free, so next pid is just last_pid + 1 if it's also < next_unsafe.
+ *		  If last_pid + 1 >= next_unsafe the interval is completely used.
+ *		  In this case a bitmap with all used pids/tgids/pgrp/seesion is
+ *		  is created. This bitmap is looked for the next free pid and next_unsafe.
+ *		  If all pids are used, a kernel warning is issued.
+ */
 static int get_pid(unsigned long flags)
 {
-	static int next_safe = PID_MAX;
+	static int next_unsafe = PID_NR;
+#define PID_FIRST	2 /* pid 1 is init, first usable pid is 2 */
+#define PID_BITMAP_SIZE	((((PID_NR + 7) / 8) + sizeof(long) - 1 ) / (sizeof(long)))
+	/*
+	 * Even if this could be local per-thread, keep it static and protected by
+	 * the lock because we don't want to overflow the stack and we wouldn't
+	 * SMP scale better anyways. It doesn't waste disk space because it's in
+	 * the .bss.
+	 */
+	static unsigned long pid_bitmap[PID_BITMAP_SIZE];
+
+	/* from here the stuff on the stack */
 	struct task_struct *p;
-	int pid;
+	int pid, found_pid;
 
 	if (flags & CLONE_PID)
 		return current->pid;
 
-	spin_lock(&lastpid_lock);
-	if((++last_pid) & 0xffff8000) {
-		last_pid = 300;		/* Skip daemons etc. */
-		goto inside;
-	}
-	if(last_pid >= next_safe) {
-inside:
-		next_safe = PID_MAX;
+	pid = last_pid + 1;
+	if (pid >= next_unsafe) {
+		next_unsafe = PID_NR;
+		memset(pid_bitmap, 0, PID_BITMAP_SIZE*sizeof(long));
+
 		read_lock(&tasklist_lock);
-	repeat:
+		/*
+		 * Build the bitmap and calc next_unsafe.
+		 */
 		for_each_task(p) {
-			if(p->pid == last_pid	||
-			   p->pgrp == last_pid	||
-			   p->tgid == last_pid	||
-			   p->session == last_pid) {
-				if(++last_pid >= next_safe) {
-					if(last_pid & 0xffff8000)
-						last_pid = 300;
-					next_safe = PID_MAX;
+			__set_bit(p->pid, pid_bitmap);
+			__set_bit(p->pgrp, pid_bitmap);
+			__set_bit(p->tgid, pid_bitmap);
+			__set_bit(p->session, pid_bitmap);
+
+			if (next_unsafe > p->pid && p->pid > pid)
+				next_unsafe = p->pid;
+			if (next_unsafe > p->pgrp && p->pgrp > pid)
+				next_unsafe = p->pgrp;
+			if (next_unsafe > p->tgid && p->tgid > pid)
+				next_unsafe = p->tgid;
+			if (next_unsafe > p->session && p->session > pid)
+				next_unsafe = p->session;
+		}
+
+		/*
+		 * Release the tasklist_lock, after the unlock it may happen that
+		 * a pid is freed while it's still marked in use
+		 * in the pid_bitmap[].
+		 */
+		read_unlock(&tasklist_lock);
+
+		found_pid = find_next_zero_bit(pid_bitmap, PID_NR, pid);
+		if (found_pid >= PID_NR) {
+			next_unsafe = 0; /* depends on PID_FIRST > 0 */
+			found_pid = find_next_zero_bit(pid_bitmap, pid, PID_FIRST);
+			/* We scanned the whole bitmap without finding a free pid. */
+			if (found_pid >= pid) {
+				static long last_get_pid_warning;
+				if ((unsigned long) (jiffies - last_get_pid_warning) >= HZ) {
+					printk(KERN_NOTICE "No more PIDs (PID_NR = %d)\n", PID_NR);
+					last_get_pid_warning = jiffies;
 				}
-				goto repeat;
+				return -1;
+			}
+		}
+
+		pid = found_pid;
+
+		if (pid > next_unsafe) {
+			/* recalc next_unsafe by looking for the next bit set in the bitmap */
+			unsigned long * start = pid_bitmap;
+			unsigned long * p = start + (pid / (sizeof(long) * 8));
+			unsigned long * end = pid_bitmap + PID_BITMAP_SIZE;
+			unsigned long mask = ~((1UL << (pid & ((sizeof(long) * 8 - 1)))) - 1);
+
+			*p &= (mask << 1);
+
+			while (p < end) {
+				if (*p) {
+					next_unsafe = ffz(~*p) + (p - start) * sizeof(long) * 8;
+					break;
+				}
+				p++;
 			}
-			if(p->pid > last_pid && next_safe > p->pid)
-				next_safe = p->pid;
-			if(p->pgrp > last_pid && next_safe > p->pgrp)
-				next_safe = p->pgrp;
-			if(p->session > last_pid && next_safe > p->session)
-				next_safe = p->session;
 		}
-		read_unlock(&tasklist_lock);
 	}
-	pid = last_pid;
-	spin_unlock(&lastpid_lock);
+
+	last_pid = pid;
 
 	return pid;
 }
@@ -623,7 +683,10 @@
 	p->state = TASK_UNINTERRUPTIBLE;
 
 	copy_flags(clone_flags, p);
+	down(&getpid_mutex);
 	p->pid = get_pid(clone_flags);
+	if (p->pid < 0) /* valid pids are >= 0 */
+		goto bad_fork_cleanup;
 
 	p->run_list.next = NULL;
 	p->run_list.prev = NULL;
@@ -730,7 +793,17 @@
 		list_add(&p->thread_group, &current->thread_group);
 	}
 
+	/*
+	 * We must do the SET_LINKS() under the getpid_mutex, to avoid
+	 * another CPU to get our same PID between the release of of the
+	 * getpid_mutex and the SET_LINKS().
+	 *
+	 * In short to avoid SMP races the new child->pid must be just visible
+	 * in the tasklist by the time we drop the getpid_mutex.
+	 */
 	SET_LINKS(p);
+	up(&getpid_mutex);
+
 	hash_pid(p);
 	nr_threads++;
 	write_unlock_irq(&tasklist_lock);
@@ -757,6 +830,7 @@
 bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup:
+	up(&getpid_mutex);
 	put_exec_domain(p->exec_domain);
 	if (p->binfmt && p->binfmt->module)
 		__MOD_DEC_USE_COUNT(p->binfmt->module);


Andrea
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/