2002-10-03 13:22:37

by Manfred Spraul

[permalink] [raw]
Subject: [PATCH] pipe bugfix /cleanup

Hi Linus,

pipe_write contains a wakeup storm, 2 writers that write into the same
fifo can wake each other up, and spend 100% cpu time with wakeup/schedule,
without making any progress.

The patch below was part of -dj, and IIRC even of -ac, for a few months,
but it's more a rewrite of pipe_read and _write.

Could you merge it, or do you want a minimal fix?

The only regression I'm aware of is that

$ dd if=/dev/zero | grep not_there

will fail due to OOM, because grep does something like

for(;;) {
rlen = read(fd, buf, len);
if (rlen == len) {
len *= 2;
buf = realloc(buf, len);
}
}

if it operates on pipes, and due to the improved syscall merging, read
will always return the maximum possible amount of data. But that's a grep
bug, not a kernel problem.

The diff is older, but applies to 2.5.40 without offsets.
--
Manfred
<<<<<<<<<<<<<<

diff -urpN --exclude-from=/home/davej/.exclude bk-linus/fs/pipe.c linux-2.5/fs/pipe.c
--- bk-linus/fs/pipe.c 2002-08-20 13:15:16.000000000 +0100
+++ linux-2.5/fs/pipe.c 2002-08-20 13:25:24.000000000 +0100
@@ -25,6 +25,9 @@
*
* FIFOs and Pipes now generate SIGIO for both readers and writers.
* -- Jeremy Elson <[email protected]> 2001-08-16
+ *
+ * pipe_read & write cleanup
+ * -- Manfred Spraul <[email protected]> 2002-05-09
*/

/* Drop the inode semaphore and wait for a pipe event, atomically */
@@ -44,97 +47,81 @@ static ssize_t
pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
- ssize_t size, read, ret;
+ int do_wakeup;
+ ssize_t ret;

- /* Seeks are not allowed on pipes. */
- ret = -ESPIPE;
- read = 0;
- if (ppos != &filp->f_pos)
- goto out_nolock;
+ /* pread is not allowed on pipes. */
+ if (unlikely(ppos != &filp->f_pos))
+ return -ESPIPE;
+
+ /* Null read succeeds. */
+ if (unlikely(count == 0))
+ return 0;

- /* Always return 0 on null read. */
+ do_wakeup = 0;
ret = 0;
- if (count == 0)
- goto out_nolock;
+ down(PIPE_SEM(*inode));
+ for (;;) {
+ int size = PIPE_LEN(*inode);
+ if (size) {
+ char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
+ ssize_t chars = PIPE_MAX_RCHUNK(*inode);

- /* Get the pipe semaphore */
- ret = -ERESTARTSYS;
- if (down_interruptible(PIPE_SEM(*inode)))
- goto out_nolock;
-
- if (PIPE_EMPTY(*inode)) {
-do_more_read:
- ret = 0;
- if (!PIPE_WRITERS(*inode))
- goto out;
+ if (chars > count)
+ chars = count;
+ if (chars > size)
+ chars = size;

- ret = -EAGAIN;
- if (filp->f_flags & O_NONBLOCK)
- goto out;
-
- for (;;) {
- PIPE_WAITING_READERS(*inode)++;
- pipe_wait(inode);
- PIPE_WAITING_READERS(*inode)--;
- ret = -ERESTARTSYS;
- if (signal_pending(current))
- goto out;
- ret = 0;
- if (!PIPE_EMPTY(*inode))
+ if (copy_to_user(buf, pipebuf, chars)) {
+ if (!ret) ret = -EFAULT;
break;
- if (!PIPE_WRITERS(*inode))
- goto out;
- }
- }
+ }
+ ret += chars;

- /* Read what data is available. */
- ret = -EFAULT;
- while (count > 0 && (size = PIPE_LEN(*inode))) {
- char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode);
- ssize_t chars = PIPE_MAX_RCHUNK(*inode);
-
- if (chars > count)
- chars = count;
- if (chars > size)
- chars = size;
-
- if (copy_to_user(buf, pipebuf, chars))
- goto out;
-
- read += chars;
- PIPE_START(*inode) += chars;
- PIPE_START(*inode) &= (PIPE_SIZE - 1);
- PIPE_LEN(*inode) -= chars;
- count -= chars;
- buf += chars;
+ PIPE_START(*inode) += chars;
+ PIPE_START(*inode) &= (PIPE_SIZE - 1);
+ PIPE_LEN(*inode) -= chars;
+ count -= chars;
+ buf += chars;
+ do_wakeup = 1;
+ }
+ if (!count)
+ break; /* common path: read succeeded */
+ if (PIPE_LEN(*inode)) /* test for cyclic buffers */
+ continue;
+ if (!PIPE_WRITERS(*inode))
+ break;
+ if (!PIPE_WAITING_WRITERS(*inode)) {
+ /* syscall merging: Usually we must not sleep
+ * if O_NONBLOCK is set, or if we got some data.
+ * But if a writer sleeps in kernel space, then
+ * we can wait for that data without violating POSIX.
+ */
+ if (ret)
+ break;
+ if (filp->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ if (signal_pending(current)) {
+ if (!ret) ret = -ERESTARTSYS;
+ break;
+ }
+ if (do_wakeup) {
+ wake_up_interruptible(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ }
+ pipe_wait(inode);
}
-
- /* Cache behaviour optimization */
- if (!PIPE_LEN(*inode))
- PIPE_START(*inode) = 0;
-
- if (count && PIPE_WAITING_WRITERS(*inode) && !(filp->f_flags & O_NONBLOCK)) {
- /*
- * We know that we are going to sleep: signal
- * writers synchronously that there is more
- * room.
- */
+ up(PIPE_SEM(*inode));
+ /* Signal writers asynchronously that there is more room. */
+ if (do_wakeup) {
wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
- if (!PIPE_EMPTY(*inode))
- BUG();
- goto do_more_read;
+ kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
}
- /* Signal writers asynchronously that there is more room. */
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
-
- ret = read;
-out:
- up(PIPE_SEM(*inode));
-out_nolock:
- if (read)
- ret = read;
+ if (ret > 0)
+ UPDATE_ATIME(inode);
return ret;
}

@@ -142,116 +129,90 @@ static ssize_t
pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
- ssize_t free, written, ret;
+ ssize_t ret;
+ size_t min;
+ int do_wakeup;
+
+ /* pwrite is not allowed on pipes. */
+ if (unlikely(ppos != &filp->f_pos))
+ return -ESPIPE;
+
+ /* Null write succeeds. */
+ if (unlikely(count == 0))
+ return 0;

- /* Seeks are not allowed on pipes. */
- ret = -ESPIPE;
- written = 0;
- if (ppos != &filp->f_pos)
- goto out_nolock;
-
- /* Null write succeeds. */
+ do_wakeup = 0;
ret = 0;
- if (count == 0)
- goto out_nolock;
-
- ret = -ERESTARTSYS;
- if (down_interruptible(PIPE_SEM(*inode)))
- goto out_nolock;
-
- /* No readers yields SIGPIPE. */
- if (!PIPE_READERS(*inode))
- goto sigpipe;
-
- /* If count <= PIPE_BUF, we have to make it atomic. */
- free = (count <= PIPE_BUF ? count : 1);
-
- /* Wait, or check for, available space. */
- if (filp->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- if (PIPE_FREE(*inode) < free)
- goto out;
- } else {
- while (PIPE_FREE(*inode) < free) {
- PIPE_WAITING_WRITERS(*inode)++;
- pipe_wait(inode);
- PIPE_WAITING_WRITERS(*inode)--;
- ret = -ERESTARTSYS;
- if (signal_pending(current))
- goto out;
-
- if (!PIPE_READERS(*inode))
- goto sigpipe;
+ min = count;
+ if (min > PIPE_BUF)
+ min = 1;
+ down(PIPE_SEM(*inode));
+ for (;;) {
+ int free;
+ if (!PIPE_READERS(*inode)) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret) ret = -EPIPE;
+ break;
}
- }
-
- /* Copy into available space. */
- ret = -EFAULT;
- while (count > 0) {
- int space;
- char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
- ssize_t chars = PIPE_MAX_WCHUNK(*inode);
-
- if ((space = PIPE_FREE(*inode)) != 0) {
+ free = PIPE_FREE(*inode);
+ if (free >= min) {
+ /* transfer data */
+ ssize_t chars = PIPE_MAX_WCHUNK(*inode);
+ char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode);
+ /* Always wakeup, even if the copy fails. Otherwise
+ * we lock up (O_NONBLOCK-)readers that sleep due to
+ * syscall merging.
+ */
+ do_wakeup = 1;
if (chars > count)
chars = count;
- if (chars > space)
- chars = space;
+ if (chars > free)
+ chars = free;

- if (copy_from_user(pipebuf, buf, chars))
- goto out;
+ if (copy_from_user(pipebuf, buf, chars)) {
+ if (!ret) ret = -EFAULT;
+ break;
+ }

- written += chars;
+ ret += chars;
PIPE_LEN(*inode) += chars;
count -= chars;
buf += chars;
- space = PIPE_FREE(*inode);
+ }
+ if (!count)
+ break;
+ if (PIPE_FREE(*inode) && ret) {
+ /* handle cyclic data buffers */
+ min = 1;
continue;
}
-
- ret = written;
- if (filp->f_flags & O_NONBLOCK)
+ if (filp->f_flags & O_NONBLOCK) {
+ if (!ret) ret = -EAGAIN;
break;
-
- do {
- /*
- * Synchronous wake-up: it knows that this process
- * is going to give up this CPU, so it doesn't have
- * to do idle reschedules.
- */
+ }
+ if (signal_pending(current)) {
+ if (!ret) ret = -ERESTARTSYS;
+ break;
+ }
+ if (do_wakeup) {
wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
- PIPE_WAITING_WRITERS(*inode)++;
- pipe_wait(inode);
- PIPE_WAITING_WRITERS(*inode)--;
- if (signal_pending(current))
- goto out;
- if (!PIPE_READERS(*inode))
- goto sigpipe;
- } while (!PIPE_FREE(*inode));
- ret = -EFAULT;
+ kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ do_wakeup = 0;
+ }
+ PIPE_WAITING_WRITERS(*inode)++;
+ pipe_wait(inode);
+ PIPE_WAITING_WRITERS(*inode)--;
}
-
- /* Signal readers asynchronously that there is more data. */
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
-
- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
- mark_inode_dirty(inode);
-
-out:
up(PIPE_SEM(*inode));
-out_nolock:
- if (written)
- ret = written;
+ if (do_wakeup) {
+ wake_up_interruptible(PIPE_WAIT(*inode));
+ kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ }
+ if (ret > 0) {
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ }
return ret;
-
-sigpipe:
- if (written)
- goto out;
- up(PIPE_SEM(*inode));
- send_sig(SIGPIPE, current, 0);
- return -EPIPE;
}

static ssize_t
@@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* ino
PIPE_BASE(*inode) = (char*) page;
PIPE_START(*inode) = PIPE_LEN(*inode) = 0;
PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0;
- PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0;
+ PIPE_WAITING_WRITERS(*inode) = 0;
PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
*PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL;

diff -urpN --exclude-from=/home/davej/.exclude bk-linus/include/linux/pipe_fs_i.h linux-2.5/include/linux/pipe_fs_i.h
--- bk-linus/include/linux/pipe_fs_i.h 2002-08-20 13:15:58.000000000 +0100
+++ linux-2.5/include/linux/pipe_fs_i.h 2002-08-20 13:26:25.000000000 +0100
@@ -9,7 +9,6 @@ struct pipe_inode_info {
unsigned int start;
unsigned int readers;
unsigned int writers;
- unsigned int waiting_readers;
unsigned int waiting_writers;
unsigned int r_counter;
unsigned int w_counter;
@@ -28,7 +27,6 @@ struct pipe_inode_info {
#define PIPE_LEN(inode) ((inode).i_pipe->len)
#define PIPE_READERS(inode) ((inode).i_pipe->readers)
#define PIPE_WRITERS(inode) ((inode).i_pipe->writers)
-#define PIPE_WAITING_READERS(inode) ((inode).i_pipe->waiting_readers)
#define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers)
#define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter)
#define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter)
--
<<<<<<<<<<<<<<


2002-10-04 07:34:48

by Alex Riesen

[permalink] [raw]
Subject: Re: [PATCH] pipe bugfix /cleanup

On Thu, Oct 03, 2002 at 03:27:49PM +0200, Manfred Spraul wrote:
> Hi Linus,
>
> pipe_write contains a wakeup storm, 2 writers that write into the same
> fifo can wake each other up, and spend 100% cpu time with wakeup/schedule,
> without making any progress.
>
> The patch below was part of -dj, and IIRC even of -ac, for a few months,
> but it's more a rewrite of pipe_read and _write.
>
> Could you merge it, or do you want a minimal fix?
>
> The only regression I'm aware of is that
>
> $ dd if=/dev/zero | grep not_there

This makes 2.4.19 + Con Kolivas patch behave very bad.
The system goes slow, freeze, than wakes up, freeze again, etc.

I did "cat /dev/zero | grep nothing".

procs memory swap io system cpu
r b w swpd free buff cache si so bi bo in cs us sy id
0 0 0 124592 292056 60624 70488 0 1 27 19 7 2 5 1 2
0 0 0 124592 292048 60632 70488 0 0 0 12 132 141 0 0 100
0 0 0 124588 292048 60632 70488 0 0 0 0 126 109 0 0 100
0 0 0 124588 292048 60632 70488 0 0 0 0 126 108 0 0 100
0 0 0 124588 292048 60632 70488 0 0 0 0 127 102 0 0 100
0 0 0 124588 292048 60632 70488 0 0 0 0 130 122 1 0 99
0 0 0 124568 292004 60632 70488 28 0 28 0 151 1836 5 1 94
1 0 0 124552 292004 60632 70488 0 0 0 0 150 530 6 1 93
0 0 0 124552 292004 60632 70488 0 0 0 0 130 496 4 1 95
0 0 0 124552 292004 60632 70488 0 0 0 0 138 1935 4 3 93
0 0 0 124528 290956 60632 70488 0 0 0 0 250 3075 17 2 81
0 0 0 124528 290948 60640 70488 0 0 0 44 143 197 0 0 100
0 0 0 124516 290948 60640 70488 0 0 0 0 134 170 0 0 100
1 0 0 124516 290952 60644 70488 0 0 4 0 143 186 0 1 99
1 0 0 124516 290944 60644 70488 0 0 0 0 126 104 0 0 100
2 0 0 124516 208760 60644 70488 4 0 4 0 136 37577 39 45 17
2 0 0 124516 126544 60652 70488 0 0 0 12 123 16596 72 28 0
2 0 1 253164 4832 54036 70120 16 660 16 660 148 69994 0 100 0
1 1 2 439472 3264 53880 70120 0 20704 0 20696 398 978 0 28 72
0 1 1 438920 3656 53088 70120 0 19496 0 19496 450 722 0 5 95
0 1 1 438596 3716 53024 70120 0 19944 0 19944 442 134 0 7 93
0 1 2 438364 3712 53004 70132 0 19944 20 19944 507 252 0 5 95
1 1 3 438104 3784 52932 70132 0 19804 0 19804 442 171 0 1 99
0 2 1 437884 3804 52892 70132 8 17956 8 17956 452 219 0 5 95
0 2 1 437660 3736 52860 70132 100 18124 100 18124 435 205 0 1 99
0 2 1 437440 3168 52796 70132 4 20392 4 20392 493 542 0 5 95
0 2 2 437328 3144 52760 70168 0 19976 0 20180 602 341 1 1 98
10 0 6 143432 294816 52384 70168 8 128296 8 128436 2961 502 0 5 95
1 0 0 133448 307256 52320 70168 352 0 356 44 225 330 0 1 99
1 0 0 133372 307256 52320 70168 0 0 0 0 130 109 0 0 100
1 0 0 133296 307240 52336 70168 0 0 0 56 123 101 0 0 100
1 0 0 133244 307240 52336 70168 0 0 0 0 119 96 0 0 100
2 0 0 132952 307240 52336 70168 0 0 0 0 124 309 1 1 98


> will fail due to OOM, because grep does something like
>
> for(;;) {
> rlen = read(fd, buf, len);
> if (rlen == len) {
> len *= 2;
> buf = realloc(buf, len);
> }
> }
>
> if it operates on pipes, and due to the improved syscall merging, read
> will always return the maximum possible amount of data. But that's a grep
> bug, not a kernel problem.
>
> The diff is older, but applies to 2.5.40 without offsets.

doesn't apply to 2.4.19

-alex

2002-10-04 13:41:12

by Manfred Spraul

[permalink] [raw]
Subject: Re: [PATCH] pipe bugfix /cleanup

Alex Riesen wrote:
>
> This makes 2.4.19 + Con Kolivas patch behave very bad.
> The system goes slow, freeze, than wakes up, freeze again, etc.
>
> I did "cat /dev/zero | grep nothing".
>
> procs memory swap io system cpu
> r b w swpd free buff cache si so bi bo in cs us sy id
> 0 0 0 124592 292056 60624 70488 0 1 27 19 7 2 5 1 2
[snip]
> 0 2 1 437440 3168 52796 70132 4 20392 4 20392 493 542 0 5 95
> 0 2 2 437328 3144 52760 70168 0 19976 0 20180 602 341 1 1 98
> 10 0 6 143432 294816 52384 70168 8 128296 8 128436 2961 502 0 5 95
> 1 0 0 133448 307256 52320 70168 352 0 356 44 225 330 0 1 99

As I wrote, this is a grep problem. I assume that regexp are more
efficient over long chunks, but there must be a limit - regexp over
swapped out memory are slow ;-)

I'll send a bugreport to the grep maintainers.

>
> doesn't apply to 2.4.19
>

Yes. likely/unlikely changes, and one additional change log entry.


--
Manfred