2018-08-30 20:26:25

by Subhra Mazumdar

Subject: [RFC PATCH 0/2] Pipe busy wait

This patch set introduces busy waiting for pipes, similar to what network
sockets do. When a pipe is full or empty, a thread busy-waits for some
microseconds before sleeping. This avoids the sleep and wakeup overhead
and improves performance when the wakeup happens very quickly. New fields
in pipe_inode_info decide how long to spin and whether data has been
written or read during the spin. As different workloads on different
systems have different optimal spin times, the spin time is configurable
via a tunable that can be set through /proc. The default value is 0,
which means no spinning.
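
As an illustration (not part of the patches): the sysctl added in patch 2/2
lives in fs_table, so it should show up as /proc/sys/fs/pipe-busy-poll. A
minimal userspace sketch for setting it, assuming that path:

/*
 * Illustration only: write the proposed pipe-busy-poll tunable.
 * The value is the spin time in microseconds; 0 (the default)
 * disables spinning.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/fs/pipe-busy-poll", "w");

	if (!f) {
		perror("pipe-busy-poll");
		return 1;
	}
	fprintf(f, "10\n");	/* spin for up to ~10 us before sleeping */
	fclose(f);
	return 0;
}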

Following are the hackbench run times (process mode, using pipes) on
systems of different sizes, comparing the baseline with a suitable spin
time.

Hackbench on a 2-socket, 36-core, 72-thread Intel x86 machine
(lower is better):
groups  baseline  patch(spin=10us)
1       0.603     0.614 (-1.82%)
2       0.673     0.527 (21.7%)
4       0.765     0.638 (16.6%)
8       1.935     1.114 (42.43%)
16      7.314     2.007 (72.56%)
32      6.215     3.585 (42.32%)

Hackbench on a 1-socket, 16-core, 32-thread Intel x86 VM
(lower is better):
groups  baseline  patch(spin=10us)
1       1.314     0.747 (43.15%)
2       1.454     0.754 (48.14%)
4       3.409     1.343 (60.6%)
8       6.879     2.559 (62.8%)
16      9.82      4.951 (49.58%)

Hackbench on a 1-socket, 4-core, 8-thread Intel x86 VM
(lower is better):
groups  baseline  patch(spin=5us)
1       2.827     1.455 (48.53%)
2       6.201     2.805 (54.77%)
4       9.514     5.008 (47.36%)
8       14.571    8.422 (42.2%)

Hackbench on a 1-socket, 1-core, 2-thread Intel x86 VM
(lower is better):
groups  baseline  patch(spin=5us)
1       3.365     2.948 (12.39%)
2       6.82      6.535 (4.18%)
4       13.18     13.025 (1.18%)
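
To put the numbers in context, here is a rough userspace sketch of the pipe
ping-pong pattern that hackbench exercises in pipe mode (illustration only,
not from the patches). The blocked end of each pipe is typically woken
again within a few microseconds, which is exactly the window the spin
targets:

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

#define ROUNDS 100000

int main(void)
{
	int ping[2], pong[2];
	char buf[64];

	if (pipe(ping) || pipe(pong)) {
		perror("pipe");
		return 1;
	}

	if (fork() == 0) {
		/* child: echo every message straight back */
		for (int i = 0; i < ROUNDS; i++) {
			ssize_t n = read(ping[0], buf, sizeof(buf));
			if (n <= 0)
				break;
			write(pong[1], buf, n);
		}
		_exit(0);
	}

	/* parent: send a byte and wait for the echo */
	for (int i = 0; i < ROUNDS; i++) {
		write(ping[1], "x", 1);
		read(pong[0], buf, sizeof(buf));
	}
	wait(NULL);
	return 0;
}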

subhra mazumdar (2):
pipe: introduce busy wait for pipe
pipe: use pipe busy wait

fs/pipe.c | 58 +++++++++++++++++++++++++++++++++++++++++++++--
include/linux/pipe_fs_i.h | 20 ++++++++++++++++
kernel/sysctl.c | 7 ++++++
3 files changed, 83 insertions(+), 2 deletions(-)

--
2.9.3



2018-08-30 20:26:29

by Subhra Mazumdar

Subject: [RFC PATCH 2/2] pipe: use pipe busy wait

Enable busy waiting for pipes. pipe_busy_wait is called when the pipe is
empty or full and spins for the specified number of microseconds.
wake_up_busy_poll is called when data is written or read, to signal any
busy-waiting threads. A tunable, pipe_busy_poll, is introduced to enable
or disable busy waiting via /proc; its value specifies the spin time in
microseconds.

Signed-off-by: subhra mazumdar <[email protected]>
---
fs/pipe.c | 58 +++++++++++++++++++++++++++++++++++++++++++++--
include/linux/pipe_fs_i.h | 1 +
kernel/sysctl.c | 7 ++++++
3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 97e5be8..03ce76a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -44,6 +44,7 @@ unsigned int pipe_min_size = PAGE_SIZE;
*/
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+unsigned int pipe_busy_poll;

/*
* We use a start+len construction, which provides full use of the
@@ -122,6 +123,35 @@ void pipe_wait(struct pipe_inode_info *pipe)
 	pipe_lock(pipe);
 }
 
+void pipe_busy_wait(struct pipe_inode_info *pipe)
+{
+	unsigned long wait_flag = pipe->pipe_wait_flag;
+	unsigned long start_time = pipe_busy_loop_current_time();
+
+	pipe_unlock(pipe);
+	preempt_disable();
+	for (;;) {
+		if (pipe->pipe_wait_flag > wait_flag) {
+			preempt_enable();
+			pipe_lock(pipe);
+			return;
+		}
+		if (pipe_busy_loop_timeout(pipe, start_time))
+			break;
+		cpu_relax();
+	}
+	preempt_enable();
+	pipe_lock(pipe);
+	if (pipe->pipe_wait_flag > wait_flag)
+		return;
+	pipe_wait(pipe);
+}
+
+void wake_up_busy_poll(struct pipe_inode_info *pipe)
+{
+	pipe->pipe_wait_flag++;
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -254,6 +284,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
struct pipe_inode_info *pipe = filp->private_data;
int do_wakeup;
ssize_t ret;
+ unsigned int poll = pipe->pipe_ll_usec;

/* Null read succeeds. */
if (unlikely(total_len == 0))
@@ -331,11 +362,18 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
break;
}
if (do_wakeup) {
+ if (poll)
+ wake_up_busy_poll(pipe);
wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- pipe_wait(pipe);
+ if (poll)
+ pipe_busy_wait(pipe);
+ else
+ pipe_wait(pipe);
}
+ if (poll && do_wakeup)
+ wake_up_busy_poll(pipe);
__pipe_unlock(pipe);

/* Signal writers asynchronously that there is more room. */
@@ -362,6 +400,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int do_wakeup = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
+ unsigned int poll = pipe->pipe_ll_usec;

/* Null write succeeds. */
if (unlikely(total_len == 0))
@@ -467,15 +506,22 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
break;
}
if (do_wakeup) {
+ if (poll)
+ wake_up_busy_poll(pipe);
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
do_wakeup = 0;
}
pipe->waiting_writers++;
- pipe_wait(pipe);
+ if (poll)
+ pipe_busy_wait(pipe);
+ else
+ pipe_wait(pipe);
pipe->waiting_writers--;
}
out:
+ if (poll && do_wakeup)
+ wake_up_busy_poll(pipe);
__pipe_unlock(pipe);
if (do_wakeup) {
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
@@ -564,6 +610,7 @@ static int
pipe_release(struct inode *inode, struct file *file)
{
struct pipe_inode_info *pipe = file->private_data;
+ unsigned int poll = pipe->pipe_ll_usec;

__pipe_lock(pipe);
if (file->f_mode & FMODE_READ)
@@ -572,6 +619,8 @@ pipe_release(struct inode *inode, struct file *file)
pipe->writers--;

if (pipe->readers || pipe->writers) {
+ if (poll)
+ wake_up_busy_poll(pipe);
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
@@ -840,6 +889,7 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
struct file *files[2];
int fd[2];
int error;
+ struct pipe_inode_info *pipe;

error = __do_pipe_flags(fd, files, flags);
if (!error) {
@@ -853,6 +903,10 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
fd_install(fd[0], files[0]);
fd_install(fd[1], files[1]);
}
+ pipe = files[0]->private_data;
+ pipe->pipe_ll_usec = pipe_busy_poll;
+ pipe = files[1]->private_data;
+ pipe->pipe_ll_usec = pipe_busy_poll;
}
return error;
}
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index fdfd2a2..3b96b05 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -188,6 +188,7 @@ void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
extern unsigned int pipe_max_size, pipe_min_size;
extern unsigned long pipe_user_pages_hard;
extern unsigned long pipe_user_pages_soft;
+extern unsigned int pipe_busy_poll;
int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);

/* Drop the inode semaphore and wait for a pipe event, atomically */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc..823bde1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1842,6 +1842,13 @@ static struct ctl_table fs_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
+ .procname = "pipe-busy-poll",
+ .data = &pipe_busy_poll,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ },
+ {
.procname = "mount-max",
.data = &sysctl_mount_max,
.maxlen = sizeof(unsigned int),
--
2.9.3


2018-08-30 20:27:29

by Subhra Mazumdar

Subject: [RFC PATCH 1/2] pipe: introduce busy wait for pipe

Introduce a pipe_ll_usec field for pipes that indicates the number of
microseconds a thread should spin if the pipe is empty or full before
sleeping. This is similar to network sockets. Workloads like hackbench in
pipe mode benefit significantly from this by avoiding the sleep and wakeup
overhead; other similar use cases can benefit as well. pipe_wait_flag is
used to signal any busy-waiting thread. pipe_busy_loop_timeout checks
whether the spin time is over.

Signed-off-by: subhra mazumdar <[email protected]>
---
include/linux/pipe_fs_i.h | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index e7497c9..fdfd2a2 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_PIPE_FS_I_H
#define _LINUX_PIPE_FS_I_H

+#include <linux/sched/clock.h>
+
#define PIPE_DEF_BUFFERS 16

#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
@@ -54,6 +56,8 @@ struct pipe_inode_info {
 	unsigned int waiting_writers;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	unsigned int pipe_ll_usec;
+	unsigned long pipe_wait_flag;
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
@@ -157,6 +161,21 @@ static inline int pipe_buf_steal(struct pipe_inode_info *pipe,
 	return buf->ops->steal(pipe, buf);
 }
 
+static inline unsigned long pipe_busy_loop_current_time(void)
+{
+	return (unsigned long)(local_clock() >> 10);
+}
+
+static inline bool pipe_busy_loop_timeout(struct pipe_inode_info *pipe,
+					  unsigned long start_time)
+{
+	unsigned long bp_usec = READ_ONCE(pipe->pipe_ll_usec);
+	unsigned long end_time = start_time + bp_usec;
+	unsigned long now = pipe_busy_loop_current_time();
+
+	return time_after(now, end_time);
+}
+
/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
#define PIPE_SIZE PAGE_SIZE
--
2.9.3


2018-08-30 20:49:57

by Subhra Mazumdar

Subject: Re: [RFC PATCH 0/2] Pipe busy wait



On 08/30/2018 01:24 PM, subhra mazumdar wrote:
> This patch introduces busy waiting for pipes similar to network sockets.
> When pipe is full or empty a thread busy waits for some microseconds before
> sleeping. This avoids the sleep and wakeup overhead and improves
> performance in case wakeup happens very fast. It uses new fields in
> pipe_inode_info to decide how much to spin and if data has been written or
> read during spin. As different workloads on different systems can have
> different optimum spin time, it is configurable via a tunable that can be
> set via /proc. The default value is 0 which indicates no spin.
Hi,

Looking for comments on how useful you think this is. Network sockets do
something similar. Some workloads other than hackbench didn't show any
improvement, and for some it may even regress. I initially tried to find
the optimal spin time for all workloads and systems but realized that is
extremely challenging. Given all this, it is best to have a tunable that
can be set when it is beneficial. I am also looking for suggestions of
more workloads that could benefit.

Thanks,
Subhra
>
> Following are the hackbench process run times using pipe for different
> sized systems with baseline and suitable spin time.
>
> Hackbench on 2 socket, 36 core and 72 threads Intel x86 machine
> (lower is better):
> groups baseline patch(spin=10us)
> 1 0.603 0.614 (-1.82%)
> 2 0.673 0.527 (21.7%)
> 4 0.765 0.638 (16.6%)
> 8 1.935 1.114 (42.43%)
> 16 7.314 2.007 (72.56%)
> 32 6.215 3.585 (42.32%)
>
> Hackbench on 1 socket, 16 core and 32 threads Intel x86 VM
> (lower is better):
> groups baseline patch(spin=10us)
> 1 1.314 0.747 (43.15%)
> 2 1.454 0.754 (48.14%)
> 4 3.409 1.343 (60.6%)
> 8 6.879 2.559 (62.8%)
> 16 9.82 4.951 (49.58%)
>
> Hackbench on 1 socket, 4 core and 8 threads Intel x86 VM
> (lower is better):
> groups baseline patch(spin=5us)
> 1 2.827 1.455 (48.53%)
> 2 6.201 2.805 (54.77%)
> 4 9.514 5.008 (47.36%)
> 8 14.571 8.422 (42.2%)
>
> Hackbench on 1 socket, 1 core and 2 threads Intel x86 VM
> (lower is better):
> groups baseline patch(spin=5us)
> 1 3.365 2.948 (12.39%)
> 2 6.82 6.535 (4.18%)
> 4 13.18 13.025 (1.18%)
>
> subhra mazumdar (2):
> pipe: introduce busy wait for pipe
> pipe: use pipe busy wait
>
> fs/pipe.c | 58 +++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/pipe_fs_i.h | 20 ++++++++++++++++
> kernel/sysctl.c | 7 ++++++
> 3 files changed, 83 insertions(+), 2 deletions(-)
>


2018-08-31 16:11:52

by Steven Sistare

Subject: Re: [RFC PATCH 1/2] pipe: introduce busy wait for pipe

On 8/30/2018 4:24 PM, subhra mazumdar wrote:
> Introduce pipe_ll_usec field for pipes that indicates the amount of micro
> seconds a thread should spin if pipe is empty or full before sleeping. This
> is similar to network sockets. Workloads like hackbench in pipe mode
> benefits significantly from this by avoiding the sleep and wakeup overhead.
> Other similar usecases can benefit. pipe_wait_flag is used to signal any
> thread busy waiting. pipe_busy_loop_timeout checks if spin time is over.
>
> Signed-off-by: subhra mazumdar <[email protected]>
> ---
> include/linux/pipe_fs_i.h | 19 +++++++++++++++++++
> 1 file changed, 19 insertions(+)
>
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index e7497c9..fdfd2a2 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -1,6 +1,8 @@
> #ifndef _LINUX_PIPE_FS_I_H
> #define _LINUX_PIPE_FS_I_H
>
> +#include <linux/sched/clock.h>
> +
> #define PIPE_DEF_BUFFERS 16
>
> #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
> @@ -54,6 +56,8 @@ struct pipe_inode_info {
> unsigned int waiting_writers;
> unsigned int r_counter;
> unsigned int w_counter;
> + unsigned int pipe_ll_usec;
> + unsigned long pipe_wait_flag;
> struct page *tmp_page;
> struct fasync_struct *fasync_readers;
> struct fasync_struct *fasync_writers;
> @@ -157,6 +161,21 @@ static inline int pipe_buf_steal(struct pipe_inode_info *pipe,
> return buf->ops->steal(pipe, buf);
> }
>
> +static inline unsigned long pipe_busy_loop_current_time(void)
> +{
> + return (unsigned long)(local_clock() >> 10);

Why ">> 10" ? local_clock() has nanosec units, and you compare to the tunable
pipe_ll_usec which has microsec units. Should be ">> 3". Better yet, redefine
the tunable to have nanosec units. I suspect you will need very large values
of the tunable to show similar results.

Also, since this type of optimization consumes extra CPU cycles that could
be used by other tasks, show the overall CPU utilization before and after
the optimization, such as by using "time hackbench ...".

- Steve

> +}
> +
> +static inline bool pipe_busy_loop_timeout(struct pipe_inode_info *pipe,
> + unsigned long start_time)
> +{
> + unsigned long bp_usec = READ_ONCE(pipe->pipe_ll_usec);
> + unsigned long end_time = start_time + bp_usec;
> + unsigned long now = pipe_busy_loop_current_time();
> +
> + return time_after(now, end_time);
> +}
> +
> /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
> memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
> #define PIPE_SIZE PAGE_SIZE
>

2018-09-04 21:56:30

by Thomas Gleixner

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait

On Thu, 30 Aug 2018, subhra mazumdar wrote:
>
> +void pipe_busy_wait(struct pipe_inode_info *pipe)
> +{
> + unsigned long wait_flag = pipe->pipe_wait_flag;
> + unsigned long start_time = pipe_busy_loop_current_time();
> +
> + pipe_unlock(pipe);
> + preempt_disable();
> + for (;;) {
> + if (pipe->pipe_wait_flag > wait_flag) {
> + preempt_enable();
> + pipe_lock(pipe);
> + return;
> + }
> + if (pipe_busy_loop_timeout(pipe, start_time))
> + break;
> + cpu_relax();
> + }
> + preempt_enable();

You are not really serious about busy looping with preemption disabled?

That's just wrong. Why do you want to block others from getting on the CPU
if there is nothing in the pipe?

There is no point in doing so, really. If the wait loop is preempted
because there is more important work to do, then it will come back and
either see new data, or leave due to wait time reached.

Thanks,

tglx


2018-09-05 00:21:47

by Subhra Mazumdar

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait



On 09/04/2018 02:54 PM, Thomas Gleixner wrote:
> On Thu, 30 Aug 2018, subhra mazumdar wrote:
>>
>> +void pipe_busy_wait(struct pipe_inode_info *pipe)
>> +{
>> + unsigned long wait_flag = pipe->pipe_wait_flag;
>> + unsigned long start_time = pipe_busy_loop_current_time();
>> +
>> + pipe_unlock(pipe);
>> + preempt_disable();
>> + for (;;) {
>> + if (pipe->pipe_wait_flag > wait_flag) {
>> + preempt_enable();
>> + pipe_lock(pipe);
>> + return;
>> + }
>> + if (pipe_busy_loop_timeout(pipe, start_time))
>> + break;
>> + cpu_relax();
>> + }
>> + preempt_enable();
> You are not really serious about busy looping with preemption disabled?
>
> That's just wrong. Why do you want to block others from getting on the CPU
> if there is nothing in the pipe?
>
> There is no point in doing so, really. If the wait loop is preempted
> because there is more important work to do, then it will come back and
> either see new data, or leave due to wait time reached.
Makes sense. I will remove it.

Thanks,
Subhra
>
> Thanks,
>
> tglx
>


2018-09-05 00:53:01

by Subhra Mazumdar

Subject: Re: [RFC PATCH 1/2] pipe: introduce busy wait for pipe



On 08/31/2018 09:09 AM, Steven Sistare wrote:
> On 8/30/2018 4:24 PM, subhra mazumdar wrote:
>> Introduce pipe_ll_usec field for pipes that indicates the amount of micro
>> seconds a thread should spin if pipe is empty or full before sleeping. This
>> is similar to network sockets. Workloads like hackbench in pipe mode
>> benefits significantly from this by avoiding the sleep and wakeup overhead.
>> Other similar usecases can benefit. pipe_wait_flag is used to signal any
>> thread busy waiting. pipe_busy_loop_timeout checks if spin time is over.
>>
>> Signed-off-by: subhra mazumdar <[email protected]>
>> ---
>> include/linux/pipe_fs_i.h | 19 +++++++++++++++++++
>> 1 file changed, 19 insertions(+)
>>
>> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
>> index e7497c9..fdfd2a2 100644
>> --- a/include/linux/pipe_fs_i.h
>> +++ b/include/linux/pipe_fs_i.h
>> @@ -1,6 +1,8 @@
>> #ifndef _LINUX_PIPE_FS_I_H
>> #define _LINUX_PIPE_FS_I_H
>>
>> +#include <linux/sched/clock.h>
>> +
>> #define PIPE_DEF_BUFFERS 16
>>
>> #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
>> @@ -54,6 +56,8 @@ struct pipe_inode_info {
>> unsigned int waiting_writers;
>> unsigned int r_counter;
>> unsigned int w_counter;
>> + unsigned int pipe_ll_usec;
>> + unsigned long pipe_wait_flag;
>> struct page *tmp_page;
>> struct fasync_struct *fasync_readers;
>> struct fasync_struct *fasync_writers;
>> @@ -157,6 +161,21 @@ static inline int pipe_buf_steal(struct pipe_inode_info *pipe,
>> return buf->ops->steal(pipe, buf);
>> }
>>
>> +static inline unsigned long pipe_busy_loop_current_time(void)
>> +{
>> + return (unsigned long)(local_clock() >> 10);
> Why ">> 10" ? local_clock() has nanosec units, and you compare to the tunable
> pipe_ll_usec which has microsec units. Should be ">> 3". Better yet, redefine
> the tunable to have nanosec units. I suspect you will need very large values
> of the tunable to show similar results.
It's 2^10. I don't think using nanosecond units is necessary. It is unlikely
data will be read or written in nanoseconds. sk_busy_loop_timeout for
sockets uses microseconds too.
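
To spell out the arithmetic (an editorial illustration, not code from the
patch): local_clock() returns nanoseconds, and shifting right by 10 divides
by 1024, so each unit of pipe_busy_loop_current_time() is about 1.024 us
and a tunable value of 10 spins for roughly 10.24 us:

/* Standalone sketch of the ">> 10" scaling discussed above. */
#include <stdio.h>

int main(void)
{
	unsigned long long ns = 10000;		/* 10 us, in nanoseconds   */
	unsigned long long units = ns >> 10;	/* 9 units of 1024 ns each */

	printf("%llu ns -> %llu shifted units\n", ns, units);
	printf("a tunable of 10 covers about %llu ns\n", 10ULL << 10);
	return 0;
}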
>
> Also, since this type of optimization consumes CPU extra cycles that could
> be used by other tasks, show the overall CPU utilization before and after
> the optimization, such as by using "time hackbench ...".
OK.

Thanks,
Subhra
>
> - Steve
>
>> +}
>> +
>> +static inline bool pipe_busy_loop_timeout(struct pipe_inode_info *pipe,
>> + unsigned long start_time)
>> +{
>> + unsigned long bp_usec = READ_ONCE(pipe->pipe_ll_usec);
>> + unsigned long end_time = start_time + bp_usec;
>> + unsigned long now = pipe_busy_loop_current_time();
>> +
>> + return time_after(now, end_time);
>> +}
>> +
>> /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
>> memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
>> #define PIPE_SIZE PAGE_SIZE
>>


2018-09-05 13:47:56

by Steven Sistare

Subject: Re: [RFC PATCH 1/2] pipe: introduce busy wait for pipe

On 9/4/2018 8:50 PM, Subhra Mazumdar wrote:
> On 08/31/2018 09:09 AM, Steven Sistare wrote:
>> On 8/30/2018 4:24 PM, subhra mazumdar wrote:
>>> Introduce pipe_ll_usec field for pipes that indicates the amount of micro
>>> seconds a thread should spin if pipe is empty or full before sleeping. This
>>> is similar to network sockets. Workloads like hackbench in pipe mode
>>> benefits significantly from this by avoiding the sleep and wakeup overhead.
>>> Other similar usecases can benefit. pipe_wait_flag is used to signal any
>>> thread busy waiting. pipe_busy_loop_timeout checks if spin time is over.
>>>
>>> Signed-off-by: subhra mazumdar <[email protected]>
>>> ---
>>>   include/linux/pipe_fs_i.h | 19 +++++++++++++++++++
>>>   1 file changed, 19 insertions(+)
>>>
>>> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
>>> index e7497c9..fdfd2a2 100644
>>> --- a/include/linux/pipe_fs_i.h
>>> +++ b/include/linux/pipe_fs_i.h
>>> @@ -1,6 +1,8 @@
>>>   #ifndef _LINUX_PIPE_FS_I_H
>>>   #define _LINUX_PIPE_FS_I_H
>>>   +#include <linux/sched/clock.h>
>>> +
>>>   #define PIPE_DEF_BUFFERS    16
>>>     #define PIPE_BUF_FLAG_LRU    0x01    /* page is on the LRU */
>>> @@ -54,6 +56,8 @@ struct pipe_inode_info {
>>>       unsigned int waiting_writers;
>>>       unsigned int r_counter;
>>>       unsigned int w_counter;
>>> +    unsigned int pipe_ll_usec;
>>> +    unsigned long pipe_wait_flag;
>>>       struct page *tmp_page;
>>>       struct fasync_struct *fasync_readers;
>>>       struct fasync_struct *fasync_writers;
>>> @@ -157,6 +161,21 @@ static inline int pipe_buf_steal(struct pipe_inode_info *pipe,
>>>       return buf->ops->steal(pipe, buf);
>>>   }
>>>   +static inline unsigned long pipe_busy_loop_current_time(void)
>>> +{
>>> +    return (unsigned long)(local_clock() >> 10);
>> Why ">> 10" ? local_clock() has nanosec units, and you compare to the tunable
>> pipe_ll_usec which has microsec units.  Should be ">> 3".  Better yet, redefine
>> the tunable to have nanosec units.  I suspect you will need very large values
>> of the tunable to show similar results.
> It's 2^10. I don't think using nanosec units is necessary. It is unlikely
> data will be read or written in nano seconds. sk_busy_loop_timeout for
> sockets uses micro seconds too.

Ah, you are using 2^10 as an approximation of 1000. OK.

- Steve

>>
>> Also, since this type of optimization consumes CPU extra cycles that could
>> be used by other tasks, show the overall CPU utilization before and after
>> the optimization, such as by using "time hackbench ...".
> OK.
>
> Thanks,
> Subhra
>>
>> - Steve
>>
>>> +}
>>> +
>>> +static inline bool pipe_busy_loop_timeout(struct pipe_inode_info *pipe,
>>> +                      unsigned long start_time)
>>> +{
>>> +    unsigned long bp_usec = READ_ONCE(pipe->pipe_ll_usec);
>>> +    unsigned long end_time = start_time + bp_usec;
>>> +    unsigned long now = pipe_busy_loop_current_time();
>>> +
>>> +    return time_after(now, end_time);
>>> +}
>>> +
>>>   /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
>>>      memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
>>>   #define PIPE_SIZE        PAGE_SIZE
>>>
>

2018-09-07 14:13:46

by Peter Zijlstra

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait

On Thu, Aug 30, 2018 at 01:24:58PM -0700, subhra mazumdar wrote:
> +void pipe_busy_wait(struct pipe_inode_info *pipe)
> +{
> + unsigned long wait_flag = pipe->pipe_wait_flag;
> + unsigned long start_time = pipe_busy_loop_current_time();
> +
> + pipe_unlock(pipe);
> + preempt_disable();
> + for (;;) {
> + if (pipe->pipe_wait_flag > wait_flag) {
> + preempt_enable();
> + pipe_lock(pipe);
> + return;
> + }
> + if (pipe_busy_loop_timeout(pipe, start_time))
> + break;
> + cpu_relax();
> + }
> + preempt_enable();
> + pipe_lock(pipe);
> + if (pipe->pipe_wait_flag > wait_flag)
> + return;
> + pipe_wait(pipe);
> +}
> +
> +void wake_up_busy_poll(struct pipe_inode_info *pipe)
> +{
> + pipe->pipe_wait_flag++;
> +}

Why not just busy wait on current->state ? A little something like:

diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..8d9f1c95ff99 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -106,6 +106,7 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
 void pipe_wait(struct pipe_inode_info *pipe)
 {
 	DEFINE_WAIT(wait);
+	u64 start;
 
 	/*
 	 * Pipes are system-local resources, so sleeping on them
@@ -113,7 +114,15 @@ void pipe_wait(struct pipe_inode_info *pipe)
 	 */
 	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
 	pipe_unlock(pipe);
-	schedule();
+
+	preempt_disable();
+	start = local_clock();
+	while (!need_resched() && current->state != TASK_RUNNING &&
+	       (local_clock() - start) < pipe->poll_usec)
+		cpu_relax();
+	schedule_preempt_disabled();
+	preempt_enable();
+
 	finish_wait(&pipe->wait, &wait);
 	pipe_lock(pipe);
 }

2018-09-17 21:06:47

by Subhra Mazumdar

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait



On 09/07/2018 05:25 AM, Peter Zijlstra wrote:
> On Thu, Aug 30, 2018 at 01:24:58PM -0700, subhra mazumdar wrote:
>> +void pipe_busy_wait(struct pipe_inode_info *pipe)
>> +{
>> + unsigned long wait_flag = pipe->pipe_wait_flag;
>> + unsigned long start_time = pipe_busy_loop_current_time();
>> +
>> + pipe_unlock(pipe);
>> + preempt_disable();
>> + for (;;) {
>> + if (pipe->pipe_wait_flag > wait_flag) {
>> + preempt_enable();
>> + pipe_lock(pipe);
>> + return;
>> + }
>> + if (pipe_busy_loop_timeout(pipe, start_time))
>> + break;
>> + cpu_relax();
>> + }
>> + preempt_enable();
>> + pipe_lock(pipe);
>> + if (pipe->pipe_wait_flag > wait_flag)
>> + return;
>> + pipe_wait(pipe);
>> +}
>> +
>> +void wake_up_busy_poll(struct pipe_inode_info *pipe)
>> +{
>> + pipe->pipe_wait_flag++;
>> +}
> Why not just busy wait on current->state ? A little something like:
>
> diff --git a/fs/pipe.c b/fs/pipe.c
> index bdc5d3c0977d..8d9f1c95ff99 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -106,6 +106,7 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
> void pipe_wait(struct pipe_inode_info *pipe)
> {
> DEFINE_WAIT(wait);
> + u64 start;
>
> /*
> * Pipes are system-local resources, so sleeping on them
> @@ -113,7 +114,15 @@ void pipe_wait(struct pipe_inode_info *pipe)
> */
> prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
> pipe_unlock(pipe);
> - schedule();
> +
> + preempt_disable();
> + start = local_clock();
> + while (!need_resched() && current->state != TASK_RUNNING &&
> + (local_clock() - start) < pipe->poll_usec)
> + cpu_relax();
> + schedule_preempt_disabled();
> + preempt_enable();
> +
> finish_wait(&pipe->wait, &wait);
> pipe_lock(pipe);
> }
This will make the current thread always spin and block as it itself does
the state change to TASK_RUNNING in finish_wait.

2018-09-17 22:44:21

by Peter Zijlstra

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait

On Mon, Sep 17, 2018 at 02:05:40PM -0700, Subhra Mazumdar wrote:
> On 09/07/2018 05:25 AM, Peter Zijlstra wrote:

> >Why not just busy wait on current->state ? A little something like:
> >
> >diff --git a/fs/pipe.c b/fs/pipe.c
> >index bdc5d3c0977d..8d9f1c95ff99 100644
> >--- a/fs/pipe.c
> >+++ b/fs/pipe.c
> >@@ -106,6 +106,7 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
> > void pipe_wait(struct pipe_inode_info *pipe)
> > {
> > DEFINE_WAIT(wait);
> >+ u64 start;
> > /*
> > * Pipes are system-local resources, so sleeping on them
> >@@ -113,7 +114,15 @@ void pipe_wait(struct pipe_inode_info *pipe)
> > */
> > prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
> > pipe_unlock(pipe);
> >- schedule();
> >+
> >+ preempt_disable();
> >+ start = local_clock();
> >+ while (!need_resched() && current->state != TASK_RUNNING &&
> >+ (local_clock() - start) < pipe->poll_usec)
> >+ cpu_relax();
> >+ schedule_preempt_disabled();
> >+ preempt_enable();
> >+
> > finish_wait(&pipe->wait, &wait);
> > pipe_lock(pipe);
> > }

> This will make the current thread always spin and block as it itself does
> the state change to TASK_RUNNING in finish_wait.

Nah, the actual wakeup will also do that state change. The one in
finish_wait() is for the case where the wait condition became true
without wakeup, such that we don't 'leak' the INTERRUPTIBLE state.
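
A rough annotated sketch of the sequence being described (editorial
illustration, not code from either patch; spin_time_up() stands in for
whatever timeout check is used, and the preempt handling from the earlier
sketch is omitted):

	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);

	/*
	 * Spin: the waker's wake_up_*() path calls try_to_wake_up(),
	 * which sets this task back to TASK_RUNNING, and that is what
	 * the loop observes to stop early.
	 */
	while (!need_resched() && current->state != TASK_RUNNING &&
	       !spin_time_up())
		cpu_relax();

	schedule();	/* does not block if we are already TASK_RUNNING */

	/*
	 * finish_wait() sets TASK_RUNNING itself only to cover the case
	 * where the condition became true without a wakeup, so the
	 * TASK_INTERRUPTIBLE state is not leaked.
	 */
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);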

2018-09-18 01:08:14

by Subhra Mazumdar

Subject: Re: [RFC PATCH 2/2] pipe: use pipe busy wait



On 09/17/2018 03:43 PM, Peter Zijlstra wrote:
> On Mon, Sep 17, 2018 at 02:05:40PM -0700, Subhra Mazumdar wrote:
>> On 09/07/2018 05:25 AM, Peter Zijlstra wrote:
>>> Why not just busy wait on current->state ? A little something like:
>>>
>>> diff --git a/fs/pipe.c b/fs/pipe.c
>>> index bdc5d3c0977d..8d9f1c95ff99 100644
>>> --- a/fs/pipe.c
>>> +++ b/fs/pipe.c
>>> @@ -106,6 +106,7 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
>>> void pipe_wait(struct pipe_inode_info *pipe)
>>> {
>>> DEFINE_WAIT(wait);
>>> + u64 start;
>>> /*
>>> * Pipes are system-local resources, so sleeping on them
>>> @@ -113,7 +114,15 @@ void pipe_wait(struct pipe_inode_info *pipe)
>>> */
>>> prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
>>> pipe_unlock(pipe);
>>> - schedule();
>>> +
>>> + preempt_disable();
>>> + start = local_clock();
>>> + while (!need_resched() && current->state != TASK_RUNNING &&
>>> + (local_clock() - start) < pipe->poll_usec)
>>> + cpu_relax();
>>> + schedule_preempt_disabled();
>>> + preempt_enable();
>>> +
>>> finish_wait(&pipe->wait, &wait);
>>> pipe_lock(pipe);
>>> }
>> This will make the current thread always spin and block as it itself does
>> the state change to TASK_RUNNING in finish_wait.
> Nah, the actual wakeup will also do that state change. The one in
> finish_wait() is for the case where the wait condition became true
> without wakeup, such that we don't 'leak' the INTERRUPTIBLE state.
Ok, it works. I see improvements with hackbench similar to the original
patch.