2020-06-18 23:00:01

by Junxiao Bi

Subject: severe proc dentry lock contention

Hi,

When debugging a performance issue, I found that thousands of threads
exiting around the same time can cause severe spin lock contention on the
proc dentry "/proc/$parent_process_pid/task/". That's because each thread
needs to remove its pid entry from that directory when it exits. See the
following standalone test case that simulates the problem, and the perf
top result on a v5.7 kernel. Any idea how to fix this?
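
(For reference, a profile like the one below can be collected with
something along the lines of the following; the options are illustrative,
not a record of exactly what was run:

	perf top -F 4000 -e cycles
)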


   PerfTop:   48891 irqs/sec  kernel:95.6%  exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles],  (all, 72 CPUs)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


    66.10%  [kernel]                               [k] native_queued_spin_lock_slowpath
     1.13%  [kernel]                               [k] _raw_spin_lock
     0.84%  [kernel]                               [k] clear_page_erms
     0.82%  [kernel]                               [k] queued_write_lock_slowpath
     0.64%  [kernel]                               [k] proc_task_readdir
     0.61%  [kernel]                               [k] find_idlest_group.isra.95
     0.61%  [kernel]                               [k] syscall_return_via_sysret
     0.55%  [kernel]                               [k] entry_SYSCALL_64
     0.49%  [kernel]                               [k] memcpy_erms
     0.46%  [kernel]                               [k] update_cfs_group
     0.41%  [kernel]                               [k] get_pid_task
     0.39%  [kernel]                               [k] _raw_spin_lock_irqsave
     0.37%  [kernel]                               [k] __list_del_entry_valid
     0.34%  [kernel]                               [k] get_page_from_freelist
     0.34%  [kernel]                               [k] __d_lookup
     0.32%  [kernel]                               [k] update_load_avg
     0.31%  libc-2.17.so                           [.] get_next_seq
     0.27%  [kernel]                               [k] avc_has_perm_noaudit
     0.26%  [kernel]                               [k] __sched_text_start
     0.25%  [kernel]                               [k] selinux_inode_permission
     0.25%  [kernel]                               [k] __slab_free
     0.24%  [kernel]                               [k] detach_entity_cfs_rq
     0.23%  [kernel]                               [k] zap_pte_range
     0.22%  [kernel]                               [k] _find_next_bit.constprop.1
     0.22%  libc-2.17.so                           [.] vfprintf
     0.20%  libc-2.17.so                           [.] _int_malloc
     0.19%  [kernel]                               [k] _raw_spin_lock_irq
     0.18%  [kernel]                               [k] rb_erase
     0.18%  [kernel]                               [k] pid_revalidate
     0.18%  [kernel]                               [k] lockref_get_not_dead
     0.18%  [kernel]                               [k] __alloc_pages_nodemask
     0.17%  [kernel]                               [k] set_task_cpu
     0.17%  libc-2.17.so                           [.] __strcoll_l
     0.17%  [kernel]                               [k] do_syscall_64
     0.17%  [kernel]                               [k] __vmalloc_node_range
     0.17%  libc-2.17.so                           [.] _IO_vfscanf
     0.17%  [kernel]                               [k] refcount_dec_not_one
     0.15%  [kernel]                               [k] __task_pid_nr_ns
     0.15%  [kernel]                               [k] native_irq_return_iret
     0.15%  [kernel]                               [k] free_pcppages_bulk
     0.14%  [kernel]                               [k] kmem_cache_alloc
     0.14%  [kernel]                               [k] link_path_walk
     0.14%  libc-2.17.so                           [.] _int_free
     0.14%  [kernel]                               [k] __update_load_avg_cfs_rq
     0.14%  perf.5.7.0-master.20200601.ol7.x86_64  [.] 0x00000000000eac29
     0.13%  [kernel]                               [k] kmem_cache_free
     0.13%  [kernel]                               [k] number
     0.13%  [kernel]                               [k] memset_erms
     0.12%  [kernel]                               [k] proc_pid_status
     0.12%  [kernel]                               [k] __d_lookup_rcu


=========== runme.sh ==========

#!/bin/bash

threads=${1:-10000}
prog=proc_race
while [ 1 ]; do ./$prog $threads; done &

while [ 1 ]; do
    pid=$(pgrep -x "$prog" | head -n 1)
    if [ -z "$pid" ]; then continue; fi
    threadnum=$(ls /proc/"$pid"/task 2>/dev/null | wc -l)
    if [ "$threadnum" -gt "$threads" ]; then
        echo kill "$pid"
        kill -9 "$pid"
    fi
done


=========== proc_race.c ==========


#include <pthread.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <ctype.h>

#define handle_error_en(en, msg) \
    do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0)

#define handle_error(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

struct thread_info {
    pthread_t thread_id;
    int       thread_num;
};

static void *child_thread(void *arg)
{
    unsigned int i = 0;

    (void)arg;
    /* Stay alive: spin, sleeping once per million iterations. */
    while (1) { if (!(i++ % 1000000)) sleep(1); }
    return NULL;
}

int main(int argc, char *argv[])
{
    int s, tnum, num_threads;
    struct thread_info *tinfo;
    void *res;

    if (argc == 2)
        num_threads = atoi(argv[1]);
    else
        num_threads = 10000;

    tinfo = calloc(num_threads, sizeof(struct thread_info));
    if (tinfo == NULL)
        handle_error("calloc");


    for (tnum = 0; tnum < num_threads; tnum++) {
        tinfo[tnum].thread_num = tnum + 1;

        s = pthread_create(&tinfo[tnum].thread_id, NULL,
                &child_thread, NULL);
        if (s != 0)
            handle_error_en(s, "pthread_create");
    }

    /* Worker threads never return, so these joins block until the
     * process is killed by runme.sh. */
    for (tnum = 0; tnum < num_threads; tnum++) {
        s = pthread_join(tinfo[tnum].thread_id, &res);
        if (s != 0)
            handle_error_en(s, "pthread_join");
    }

    free(tinfo);
    exit(EXIT_SUCCESS);
}

==========

Thanks,

Junxiao.


2020-06-18 23:42:25

by Matthew Wilcox

Subject: Re: severe proc dentry lock contention

On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote:
> When debugging a performance issue, I found that thousands of threads
> exiting around the same time can cause severe spin lock contention on the
> proc dentry "/proc/$parent_process_pid/task/". That's because each thread
> needs to remove its pid entry from that directory when it exits. See the
> following standalone test case that simulates the problem, and the perf
> top result on a v5.7 kernel. Any idea how to fix this?

Thanks, Junxiao.

We've looked at a few different ways of fixing this problem.

Even though the contention is within the dcache, it seems like a use case
that the dcache shouldn't be optimised for -- generally we do not have
hundreds of CPUs removing dentries from a single directory in parallel.

We could fix this within procfs. We don't have a great patch yet, but
the current approach we're looking at allows only one thread at a time
to call dput() on any /proc/*/task directory.
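
As a rough illustration of that direction (not an actual patch -- the
helper and mutex names here are made up), the idea is something like:

	/* Hypothetical sketch: serialize dput() on /proc/<tgid>/task
	 * dentries so only one exiting thread at a time pays the
	 * dcache teardown cost for a given directory. */
	static DEFINE_MUTEX(proc_task_dput_mutex);

	static void proc_task_dput_serialized(struct dentry *task_dir)
	{
		mutex_lock(&proc_task_dput_mutex);
		dput(task_dir);
		mutex_unlock(&proc_task_dput_mutex);
	}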

We could also look at fixing this within the scheduler. Only allowing
one CPU to run the threads of an exiting process would fix this particular
problem, but might have other consequences.

I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7,
so that hope is ruled out.

> [... quoted perf output and test case snipped; see the original report above ...]

2020-06-19 01:40:10

by Eric W. Biederman

Subject: Re: severe proc dentry lock contention

Matthew Wilcox <[email protected]> writes:

> On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote:
>> When debugging a performance issue, I found that thousands of threads
>> exiting around the same time can cause severe spin lock contention on the
>> proc dentry "/proc/$parent_process_pid/task/". That's because each thread
>> needs to remove its pid entry from that directory when it exits. See the
>> following standalone test case that simulates the problem, and the perf
>> top result on a v5.7 kernel. Any idea how to fix this?

>
> Thanks, Junxiao.
>
> We've looked at a few different ways of fixing this problem.
>
> Even though the contention is within the dcache, it seems like a usecase
> that the dcache shouldn't be optimised for -- generally we do not have
> hundreds of CPUs removing dentries from a single directory in parallel.
>
> We could fix this within procfs. We don't have a great patch yet, but
> the current approach we're looking at allows only one thread at a time
> to call dput() on any /proc/*/task directory.
>
> We could also look at fixing this within the scheduler. Only allowing
> one CPU to run the threads of an exiting process would fix this particular
> problem, but might have other consequences.
>
> I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7,
> so that hope is ruled out.

Does anyone know if this problem is new in v5.7? I am wondering if I
introduced it when I refactored the code, or if I simply churned the code
and the issue remains effectively the same.

Can you try only flushing entries when the last thread of the process is
reaped? I think in practice we would want to be a little more
sophisticated, but it is a good test case to see if it solves the issue.

diff --git a/kernel/exit.c b/kernel/exit.c
index cebae77a9664..d56e4eb60bdd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -152,7 +152,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
-	struct pid *thread_pid;
+	struct pid *thread_pid = NULL;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -165,7 +165,8 @@ void release_task(struct task_struct *p)
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
+	if (p == p->group_leader)
+		thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
 
 	/*
@@ -188,8 +189,10 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
+	if (thread_pid) {
+		proc_flush_pid(thread_pid);
+		put_pid(thread_pid);
+	}
 	release_thread(p);
 	put_task_struct_rcu_user(p);

2020-06-19 04:03:23

by Junxiao Bi

Subject: Re: severe proc dentry lock contention

On 6/18/20 5:02 PM, [email protected] wrote:

> Matthew Wilcox <[email protected]> writes:
>
>> On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote:
>>> When debugging a performance issue, I found that thousands of threads
>>> exiting around the same time can cause severe spin lock contention on the
>>> proc dentry "/proc/$parent_process_pid/task/". That's because each thread
>>> needs to remove its pid entry from that directory when it exits. See the
>>> following standalone test case that simulates the problem, and the perf
>>> top result on a v5.7 kernel. Any idea how to fix this?
>> Thanks, Junxiao.
>>
>> We've looked at a few different ways of fixing this problem.
>>
>> Even though the contention is within the dcache, it seems like a usecase
>> that the dcache shouldn't be optimised for -- generally we do not have
>> hundreds of CPUs removing dentries from a single directory in parallel.
>>
>> We could fix this within procfs. We don't have a great patch yet, but
>> the current approach we're looking at allows only one thread at a time
>> to call dput() on any /proc/*/task directory.
>>
>> We could also look at fixing this within the scheduler. Only allowing
>> one CPU to run the threads of an exiting process would fix this particular
>> problem, but might have other consequences.
>>
>> I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7,
>> so that hope is ruled out.
> Does anyone know if problem new in v5.7? I am wondering if I introduced
> this problem when I refactored the code or if I simply churned the code
> but the issue remains effectively the same.
It's not a new issue; we see it in older kernels like v4.14.
>
> Can you try only flushing entries when the last thread of the process is
> reaped? I think in practice we would want to be a little more
> sophisticated but it is a good test case to see if it solves the issue.

Thank you. I will try it and let you know.

Thanks,

Junxiao.

> [... patch snipped ...]

2020-06-19 04:36:28

by Eric W. Biederman

Subject: Re: severe proc dentry lock contention

Junxiao Bi <[email protected]> writes:

> On 6/18/20 5:02 PM, [email protected] wrote:
>
>> Matthew Wilcox <[email protected]> writes:
>>
>>> On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote:
>>>> When debugging a performance issue, I found that thousands of threads
>>>> exiting around the same time can cause severe spin lock contention on the
>>>> proc dentry "/proc/$parent_process_pid/task/". That's because each thread
>>>> needs to remove its pid entry from that directory when it exits. See the
>>>> following standalone test case that simulates the problem, and the perf
>>>> top result on a v5.7 kernel. Any idea how to fix this?
>>> Thanks, Junxiao.
>>>
>>> We've looked at a few different ways of fixing this problem.
>>>
>>> Even though the contention is within the dcache, it seems like a usecase
>>> that the dcache shouldn't be optimised for -- generally we do not have
>>> hundreds of CPUs removing dentries from a single directory in parallel.
>>>
>>> We could fix this within procfs. We don't have a great patch yet, but
>>> the current approach we're looking at allows only one thread at a time
>>> to call dput() on any /proc/*/task directory.
>>>
>>> We could also look at fixing this within the scheduler. Only allowing
>>> one CPU to run the threads of an exiting process would fix this particular
>>> problem, but might have other consequences.
>>>
>>> I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7,
>>> so that hope is ruled out.
>> Does anyone know if problem new in v5.7? I am wondering if I introduced
>> this problem when I refactored the code or if I simply churned the code
>> but the issue remains effectively the same.
> It's not new issue, we see it in old kernel like v4.14
>>
>> Can you try only flushing entries when the last thread of the process is
>> reaped? I think in practice we would want to be a little more
>> sophisticated but it is a good test case to see if it solves the issue.
>
> Thank you. i will try and let you know.

Assuming this works, we can remove the hack of always doing this and
instead only do it under SIGNAL_GROUP_EXIT, when we know this
thundering herd will appear.

We still need to do something with the list, however.

If your testing works out performance-wise, I will figure out what
we need to make the hack general and safe.

Eric



> [... patch snipped ...]

2020-06-19 14:17:04

by Eric W. Biederman

Subject: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries


Junxiao Bi <[email protected]> reported:
> When debugging a performance issue, I found that thousands of threads
> exiting around the same time can cause severe spin lock contention on the
> proc dentry "/proc/$parent_process_pid/task/". That's because each thread
> needs to remove its pid entry from that directory when it exits.

Matthew Wilcox <[email protected]> reported:
> We've looked at a few different ways of fixing this problem.

The flushing of the proc dentries from the dcache is an optimization,
and is not necessary for correctness. Eventually cache pressure will
cause the dentries to be freed even if no flushing happens. Some
light testing when I refactored the proc flushing [1] indicated that at
least the memory footprint is easily measurable.

An optimization that causes a performance problem due to a thundering
herd of threads is no real optimization.

Modify the code to only flush the /proc/<tgid>/ directory when all
threads in a process are killed at once. This continues to flush
practically everything when the process is reaped as the threads live
under /proc/<tgid>/task/<tid>.

There is a rare possibility that a debugger will access /proc/<tid>/,
which this change will no longer flush, but I believe such accesses
are sufficiently rare to not be observed in practice.

[1] 7bc3e6e55acf ("proc: Use a list of inodes to flush from proc")
Link: https://lkml.kernel.org/r/[email protected]
Reported-by: Masahiro Yamada <[email protected]>
Reported-by: Matthew Wilcox <[email protected]>
Signed-off-by: "Eric W. Biederman" <[email protected]>
---

I am still waiting for word on how this affects performance, but this is
a clean version that should avoid the thundering herd problem in
general.


kernel/exit.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index cebae77a9664..567354550d62 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -151,8 +151,8 @@ void put_task_struct_rcu_user(struct task_struct *task)
 
 void release_task(struct task_struct *p)
 {
+	struct pid *flush_pid = NULL;
 	struct task_struct *leader;
-	struct pid *thread_pid;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -165,7 +165,16 @@ void release_task(struct task_struct *p)
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
+
+	/*
+	 * When all of the threads are exiting wait until the end
+	 * and flush everything.
+	 */
+	if (thread_group_leader(p))
+		flush_pid = get_pid(task_tgid(p));
+	else if (!(p->signal->flags & SIGNAL_GROUP_EXIT))
+		flush_pid = get_pid(task_pid(p));
+
 	__exit_signal(p);
 
 	/*
@@ -188,8 +197,10 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	proc_flush_pid(thread_pid);
-	put_pid(thread_pid);
+	if (flush_pid) {
+		proc_flush_pid(flush_pid);
+		put_pid(flush_pid);
+	}
 	release_thread(p);
 	put_task_struct_rcu_user(p);

--
2.20.1

2020-06-19 15:59:26

by Junxiao Bi

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

Hi Eric,

The patch didn't improve lock contention.

   PerfTop:   48925 irqs/sec  kernel:95.6%  exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles],  (all, 104 CPUs)
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


    69.66%  [kernel]                                        [k] native_queued_spin_lock_slowpath
     1.93%  [kernel]                                        [k] _raw_spin_lock
     1.24%  [kernel]                                        [k] page_counter_cancel
     0.70%  [kernel]                                        [k] do_syscall_64
     0.62%  [kernel]                                        [k] find_idlest_group.isra.96
     0.57%  [kernel]                                        [k] queued_write_lock_slowpath
     0.56%  [kernel]                                        [k] d_walk
     0.45%  [kernel]                                        [k] clear_page_erms
     0.44%  [kernel]                                        [k] syscall_return_via_sysret
     0.40%  [kernel]                                        [k] entry_SYSCALL_64
     0.38%  [kernel]                                        [k] refcount_dec_not_one
     0.37%  [kernel]                                        [k] propagate_protected_usage
     0.33%  [kernel]                                        [k] unmap_page_range
     0.33%  [kernel]                                        [k] select_collect
     0.32%  [kernel]                                        [k] memcpy_erms
     0.30%  [kernel]                                        [k] proc_task_readdir
     0.27%  [kernel]                                        [k] _raw_spin_lock_irqsave

Thanks,

Junxiao.

On 6/19/20 7:09 AM, [email protected] wrote:
> [... full patch mail quoted; snipped ...]

2020-06-20 04:14:52

by Eric W. Biederman

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

Junxiao Bi <[email protected]> writes:

> Hi Eric,
>
> The patch didn't improve lock contention.

Which raises the question: where is the lock contention coming from?

Especially with my first variant. Only the last thread to be reaped
would free up anything in the cache.

Can you comment out the call to proc_flush_pid entirely?

That will rule out the proc_flush_pid in d_invalidate entirely.
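
For example, a throwaway test hack against plain v5.7 might look like
this (illustrative only, not a proposed change):

	--- a/kernel/exit.c
	+++ b/kernel/exit.c
	@@ void release_task(struct task_struct *p)
	 	write_unlock_irq(&tasklist_lock);
	-	proc_flush_pid(thread_pid);
	+	/* proc_flush_pid(thread_pid); */	/* test: skip the dcache flush */
	 	put_pid(thread_pid);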

The only candidate I can think of is d_invalidate (aka proc_flush_pid) vs ps.

Eric

2020-06-20 04:51:12

by Junxiao Bi

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On 6/19/20 10:24 AM, [email protected] wrote:

> Junxiao Bi <[email protected]> writes:
>
>> Hi Eric,
>>
>> The patch didn't improve lock contention.
> Which raises the question where is the lock contention coming from.
>
> Especially with my first variant. Only the last thread to be reaped
> would free up anything in the cache.
>
> Can you comment out the call to proc_flush_pid entirely?

Still high lock contention. Collected the following hot path:

    74.90%     0.01%  proc_race  [kernel.kallsyms]  [k] entry_SYSCALL_64_after_hwframe
            |
             --74.89%--entry_SYSCALL_64_after_hwframe
                       |
                        --74.88%--do_syscall_64
                                  |
                                   --69.70%--exit_to_usermode_loop
                                             |
                                              --69.70%--do_signal
                                                        |
                                                         --69.69%--get_signal
                                                                   |
                                                                   |--56.30%--do_group_exit
                                                                   |          |
                                                                   |           --56.30%--do_exit
                                                                   |                     |
                                                                   |                     |--27.50%--_raw_write_lock_irq
                                                                   |                     |          |
                                                                   |                     |           --27.47%--queued_write_lock_slowpath
                                                                   |                     |                      |
                                                                   |                     |                       --27.18%--native_queued_spin_lock_slowpath
                                                                   |                     |
                                                                   |                     |--26.10%--release_task.part.20
                                                                   |                     |          |
                                                                   |                     |           --25.60%--_raw_write_lock_irq
                                                                   |                     |                      |
                                                                   |                     |                       --25.56%--queued_write_lock_slowpath
                                                                   |                     |                                  |
                                                                   |                     |                                   --25.23%--native_queued_spin_lock_slowpath
                                                                   |                     |
                                                                   |                      --0.56%--mmput
                                                                   |                                 |
                                                                   |                                  --0.55%--exit_mmap
                                                                   |
                                                                    --13.31%--_raw_spin_lock_irq
                                                                              |
                                                                               --13.28%--native_queued_spin_lock_slowpath

Thanks,

Junxiao.

>
> That will rule out the proc_flush_pid in d_invalidate entirely.
>
> The only candidate I can think of d_invalidate aka (proc_flush_pid) vs ps.
>
> Eric

2020-06-20 04:54:48

by Eric W. Biederman

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

Junxiao Bi <[email protected]> writes:

> On 6/19/20 10:24 AM, [email protected] wrote:
>
>> Junxiao Bi <[email protected]> writes:
>>
>>> Hi Eric,
>>>
>>> The patch didn't improve lock contention.
>> Which raises the question where is the lock contention coming from.
>>
>> Especially with my first variant. Only the last thread to be reaped
>> would free up anything in the cache.
>>
>> Can you comment out the call to proc_flush_pid entirely?
>
> Still high lock contention. Collect the following hot path.

A different location this time.

I know of at least exit_signal and exit_notify that take thread-wide
locks, and it looks like exit_mm is another. Those don't use the same
locks as flushing proc.

So I think you are simply seeing a result of the thundering herd of
threads shutting down at once. Given that thread shutdown is
fundamentally a slow path, there is only so much that can be done.

If you are up for a project of working through this thundering herd, I
expect I can help some. It will be a long process of cleaning up the
entire thread exit process with an eye to performance.

To make incremental progress we are going to need a way to understand
the impact of various changes, such as my change to reduce the dentry
lock contention when a process is removed from proc. I have the feeling
that made a real-world improvement on your test (as the lock no longer
shows up in your profile). Unfortunately, whatever that improvement was,
it was not reflected in the numbers you read.

Eric

2020-06-20 16:30:17

by Matthew Wilcox

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote:
> Junxiao Bi <[email protected]> writes:
> > Still high lock contention. Collect the following hot path.
>
> A different location this time.
>
> I know of at least exit_signal and exit_notify that take thread wide
> locks, and it looks like exit_mm is another. Those don't use the same
> locks as flushing proc.
>
>
> So I think you are simply seeing a result of the thundering herd of
> threads shutting down at once. Given that thread shutdown is fundamentally
> a slow path there is only so much that can be done.
>
> If you are up for a project to working through this thundering herd I
> expect I can help some. It will be a long process of cleaning up
> the entire thread exit process with an eye to performance.

Wengang had some tests which produced wall-clock values for this problem,
which I agree are more informative.

I'm not entirely sure what the customer workload is that requires a
highly threaded workload to also shut down quickly. To my mind, an
overall workload is normally composed of highly-threaded tasks that run
for a long time and only shut down rarely (thus performance of shutdown
is not important) and single-threaded tasks that run for a short time.

Understanding this workload is important to my next suggestion, which
is that rather than searching for all the places in the exit path which
contend on a single spinlock, we simply set the allowed CPUs for an
exiting task to include only the CPU that this thread is running on.
It will probably run faster to take the threads down in series on one
CPU rather than take them down in parallel across many CPUs (or am I
mistaken? Is there inherently a lot of parallelism in the thread
exiting process?)
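
A minimal sketch of that idea (untested; the helper name is made up, and
where exactly in the exit path it would be called is an open question):

	/* Hypothetical: pin the dying task to its current CPU so the
	 * teardown of an exiting thread group is serialized per-CPU. */
	static void exit_pin_to_current_cpu(struct task_struct *tsk)
	{
		set_cpus_allowed_ptr(tsk, cpumask_of(raw_smp_processor_id()));
	}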

2020-06-22 05:18:32

by Junxiao Bi

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On 6/20/20 9:27 AM, Matthew Wilcox wrote:

> On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote:
>> Junxiao Bi <[email protected]> writes:
>>> Still high lock contention. Collect the following hot path.
>> A different location this time.
>>
>> I know of at least exit_signal and exit_notify that take thread wide
>> locks, and it looks like exit_mm is another. Those don't use the same
>> locks as flushing proc.
>>
>>
>> So I think you are simply seeing a result of the thundering herd of
>> threads shutting down at once. Given that thread shutdown is fundamentally
>> a slow path there is only so much that can be done.
>>
>> If you are up for a project to working through this thundering herd I
>> expect I can help some. It will be a long process of cleaning up
>> the entire thread exit process with an eye to performance.
> Wengang had some tests which produced wall-clock values for this problem,
> which I agree is more informative.
>
> I'm not entirely sure what the customer workload is that requires a
> highly threaded workload to also shut down quickly. To my mind, an
> overall workload is normally composed of highly-threaded tasks that run
> for a long time and only shut down rarely (thus performance of shutdown
> is not important) and single-threaded tasks that run for a short time.

The real workload is a Java application working in server-agent mode; the
issue happened on the agent side. All the agent does is wait for work
dispatched from the server and execute it. To execute one piece of work,
the agent starts lots of short-lived threads, so a lot of threads can exit
at the same time when there is a lot of work to execute. The contention on
the exit path caused a high %sys time which impacted other workloads.

Thanks,

Junxiao.

>
> Understanding this workload is important to my next suggestion, which
> is that rather than searching for all the places in the exit path which
> contend on a single spinlock, we simply set the allowed CPUs for an
> exiting task to include only the CPU that this thread is running on.
> It will probably run faster to take the threads down in series on one
> CPU rather than take them down in parallel across many CPUs (or am I
> mistaken? Is there inherently a lot of parallelism in the thread
> exiting process?)

2020-06-22 05:37:09

by Masahiro Yamada

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On Fri, Jun 19, 2020 at 11:14 PM Eric W. Biederman
<[email protected]> wrote:
>
> [... patch description snipped ...]
>

> Reported-by: Masahiro Yamada <[email protected]>

I did not report this.

> Reported-by: Matthew Wilcox <[email protected]>
> Signed-off-by: "Eric W. Biederman" <[email protected]>
> ---


--
Best Regards
Masahiro Yamada

2020-06-22 15:20:24

by Eric W. Biederman

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

Masahiro Yamada <[email protected]> writes:

> On Fri, Jun 19, 2020 at 11:14 PM Eric W. Biederman
> <[email protected]> wrote:
>>
>> [... patch description snipped ...]
>>
>> Reported-by: Masahiro Yamada <[email protected]>
>
> I did not report this.

Thank you for catching this.

I must have cut&pasted the wrong email address by mistake.

My apologies.

Eric

2020-06-22 15:27:22

by Eric W. Biederman

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

Junxiao Bi <[email protected]> writes:

> On 6/20/20 9:27 AM, Matthew Wilcox wrote:
>
>> [... snipped ...]
>
> The real workload is a Java application working in server-agent mode; the
> issue happened on the agent side. All the agent does is wait for work
> dispatched from the server and execute it. To execute one piece of work,
> the agent starts lots of short-lived threads, so a lot of threads can exit
> at the same time when there is a lot of work to execute. The contention on
> the exit path caused a high %sys time which impacted other workloads.

If I understand correctly, the Java VM is not exiting. Just some of
its threads.

That is a very different problem to deal with. There are many
optimizations that are possible when _all_ of the threads are exiting
that are not possible when only _many_ threads are exiting.

Do you know if it is simply the CPU time or if it is the lock contention
that is the problem? If it is simply the CPU time, we should consider
whether some of the locks that can be highly contended should become
mutexes. Or perhaps something like Matthew's CPU pinning idea.

Eric


2020-06-22 16:13:16

by willy

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On Mon, Jun 22, 2020 at 10:20:40AM -0500, Eric W. Biederman wrote:
> Junxiao Bi <[email protected]> writes:
> > On 6/20/20 9:27 AM, Matthew Wilcox wrote:
> >> [... snipped ...]
> >
> > The real workload is a Java application working in server-agent mode; the
> > issue happened on the agent side. All the agent does is wait for work
> > dispatched from the server and execute it. To execute one piece of work,
> > the agent starts lots of short-lived threads, so a lot of threads can exit
> > at the same time when there is a lot of work to execute. The contention on
> > the exit path caused a high %sys time which impacted other workloads.
>
> If I understand correctly, the Java VM is not exiting. Just some of
> its threads.
>
> That is a very different problem to deal with. There are many
> optimizations that are possible when _all_ of the threads are exiting
> that are not possible when only _many_ threads are exiting.

Ah! Now I get it. This explains why the dput() lock contention was
so important. A new thread starting would block on that lock as it
tried to create its new /proc/$pid/task/ directory.

Terminating thousands of threads but not the entire process isn't going
to hit many of the locks (e.g. exit_signal() and exit_mm() aren't going
to be called). So we need a more sophisticated micro benchmark that is
continually starting threads and asking dozens-to-thousands of them to
stop at the same time. Otherwise we'll try to fix lots of scalability
problems that our customer doesn't care about.

> Do you know if it is simply the CPU time or if it is the lock contention
> that is the problem? If it is simply the CPU time, we should consider
> whether some of the locks that can be highly contended should become
> mutexes. Or perhaps something like Matthew's CPU pinning idea.

If we're not trying to optimise for the entire process going down, then
we definitely don't want my CPU pinning idea.

2020-06-22 17:18:40

by Junxiao Bi

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On 6/22/20 8:20 AM, [email protected] wrote:

> If I understand correctly, the Java VM is not exiting. Just some of
> its threads.
>
> That is a very different problem to deal with. There are many
> optimizations that are possible when _all_ of the threads are exiting
> that are not possible when only _many_ threads are exiting.
>
> Do you know if it is simply the CPU time or if it is the lock contention
> that is the problem? If it is simply the CPU time, we should consider
> whether some of the locks that can be highly contended should become
> mutexes. Or perhaps something like Matthew's CPU pinning idea.

The problem is high %sys time.

Thanks,

Junxiao.

2020-06-23 00:51:44

by Matthew Wilcox

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On Sun, Jun 21, 2020 at 10:15:39PM -0700, Junxiao Bi wrote:
> On 6/20/20 9:27 AM, Matthew Wilcox wrote:
> > [... snipped ...]
>
> The real workload is a Java application working in server-agent mode; the
> issue happened on the agent side. All the agent does is wait for work
> dispatched from the server and execute it. To execute one piece of work,
> the agent starts lots of short-lived threads, so a lot of threads can exit
> at the same time when there is a lot of work to execute. The contention on
> the exit path caused a high %sys time which impacted other workloads.

How about this for a micro? Executes in about ten seconds on my laptop.
You might need to tweak it a bit to get better timing on a server.

// gcc -pthread -O2 -g -W -Wall
#include <pthread.h>
#include <unistd.h>

void *worker(void *arg)
{
	int i = 0;
	int *p = arg;

	for (;;) {
		/* Burn some CPU at first, then sleep in a loop
		 * (sleep() is a cancellation point). */
		while (i < 1000 * 1000) {
			i += *p;
		}
		sleep(1);
	}
}

int main(int argc, char **argv)
{
	pthread_t threads[20][100];
	int i, j, one = 1;

	(void)argc;
	(void)argv;

	/* Continually start batches of 100 threads, and cancel them in
	 * batches of 100 five iterations behind, so new threads are
	 * always starting while others are being torn down. */
	for (i = 0; i < 1000; i++) {
		for (j = 0; j < 100; j++)
			pthread_create(&threads[i % 20][j], NULL, worker, &one);
		if (i < 5)
			continue;
		for (j = 0; j < 100; j++)
			pthread_cancel(threads[(i - 5) % 20][j]);
	}

	return 0;
}

2020-06-25 23:49:52

by Junxiao Bi

Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

On 6/22/20 5:47 PM, Matthew Wilcox wrote:

> On Sun, Jun 21, 2020 at 10:15:39PM -0700, Junxiao Bi wrote:
>> On 6/20/20 9:27 AM, Matthew Wilcox wrote:
>>> [... snipped ...]
>> The real workload is a Java application working in server-agent mode; the
>> issue happened on the agent side. All the agent does is wait for work
>> dispatched from the server and execute it. To execute one piece of work,
>> the agent starts lots of short-lived threads, so a lot of threads can exit
>> at the same time when there is a lot of work to execute. The contention on
>> the exit path caused a high %sys time which impacted other workloads.
> How about this for a micro? Executes in about ten seconds on my laptop.
> You might need to tweak it a bit to get better timing on a server.
>
> // gcc -pthread -O2 -g -W -Wall
> #include <pthread.h>
> #include <unistd.h>
>
> void *worker(void *arg)
> {
> int i = 0;
> int *p = arg;
>
> for (;;) {
> while (i < 1000 * 1000) {
> i += *p;
> }
> sleep(1);
> }
> }
>
> int main(int argc, char **argv)
> {
> pthread_t threads[20][100];

I tuned 100 up to 1000 here, and likewise in the following two loops.

Tested on a 2-socket server with 104 CPUs. The perf profile is similar on
v5.7 and on v5.7 with Eric's fix; the spin lock contention shifted to the
futex spin lock, so the fix didn't help.


    46.41%     0.11%  perf_test  [kernel.kallsyms]  [k] entry_SYSCALL_64_after_hwframe
            |
             --46.30%--entry_SYSCALL_64_after_hwframe
                       |
                        --46.12%--do_syscall_64
                                  |
                                  |--30.47%--__x64_sys_futex
                                  |          |
                                  |           --30.45%--do_futex
                                  |                     |
                                  |                     |--18.04%--futex_wait
                                  |                     |          |
                                  |                     |          |--16.94%--futex_wait_setup
                                  |                     |          |          |
                                  |                     |          |           --16.61%--_raw_spin_lock
                                  |                     |          |                      |
                                  |                     |          |                       --16.30%--native_queued_spin_lock_slowpath
                                  |                     |          |                                  |
                                  |                     |          |                                   --0.81%--call_function_interrupt
                                  |                     |          |                                              |
                                  |                     |          |                                               --0.79%--smp_call_function_interrupt
                                  |                     |          |                                                          |
                                  |                     |          |                                                           --0.62%--generic_smp_call_function_single_interrupt
                                  |                     |          |
                                  |                     |           --1.04%--futex_wait_queue_me
                                  |                     |                      |
                                  |                     |                       --0.96%--schedule
                                  |                     |                                  |
                                  |                     |                                   --0.94%--__schedule
                                  |                     |                                              |
                                  |                     |                                               --0.51%--pick_next_task_fair
                                  |                     |
                                  |                      --12.38%--futex_wake
                                  |                                 |
                                  |                                 |--11.00%--_raw_spin_lock
                                  |                                 |          |
                                  |                                 |           --10.76%--native_queued_spin_lock_slowpath
                                  |                                 |                      |
                                  |                                 |                       --0.55%--call_function_interrupt
                                  |                                 |                                  |
                                  |                                 |                                   --0.53%--smp_call_function_interrupt
                                  |                                 |
                                  |                                  --1.11%--wake_up_q
                                  |                                             |
                                  |                                              --1.10%--try_to_wake_up
                                  |


Result of v5.7

=========

[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.850s
user    0m14.499s
sys    0m12.116s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.949s
user    0m14.285s
sys    0m18.408s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.885s
user    0m14.193s
sys    0m17.888s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.872s
user    0m14.451s
sys    0m18.717s
[root@jubi-bm-ol8 upstream]# uname -a
Linux jubi-bm-ol8 5.7.0-1700.20200601.el8uek.base.x86_64 #1 SMP Fri Jun
19 07:41:06 PDT 2020 x86_64 x86_64 x86_64 GNU/Linux


Result of v5.7 with Eric's fix

=================

[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.889s
user    0m14.215s
sys    0m16.203s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.872s
user    0m14.431s
sys    0m17.737s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.908s
user    0m14.274s
sys    0m15.377s
[root@jubi-bm-ol8 upstream]# time ./perf_test

real    0m4.937s
user    0m14.632s
sys    0m16.255s
[root@jubi-bm-ol8 upstream]# uname -a
Linux jubi-bm-ol8 5.7.0-1700.20200601.el8uek.procfix.x86_64 #1 SMP Fri
Jun 19 07:42:16 PDT 2020 x86_64 x86_64 x86_64 GNU/Linux
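
(Averaged over the four runs: sys time is about 16.8s on v5.7 versus
about 16.4s with the fix, and real time is around 4.9s in both cases, so
the fix makes no measurable difference on this workload.)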

Thanks,

Junxiao.

>         int i, j, one = 1;
>
>         for (i = 0; i < 1000; i++) {
>                 for (j = 0; j < 100; j++)
>                         pthread_create(&threads[i % 20][j], NULL, worker, &one);
>                 if (i < 5)
>                         continue;
>                 for (j = 0; j < 100; j++)
>                         pthread_cancel(threads[(i - 5) % 20][j]);
>         }
>
>         return 0;
> }

2020-08-17 12:26:06

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries

[email protected] writes:

> On Mon, Jun 22, 2020 at 10:20:40AM -0500, Eric W. Biederman wrote:
>> Junxiao Bi <[email protected]> writes:
>> > On 6/20/20 9:27 AM, Matthew Wilcox wrote:
>> >> On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote:
>> >>> Junxiao Bi <[email protected]> writes:
>> >>>> Still high lock contention. Collected the following hot path.
>> >>> A different location this time.
>> >>>
>> >>> I know of at least exit_signal and exit_notify that take thread wide
>> >>> locks, and it looks like exit_mm is another. Those don't use the same
>> >>> locks as flushing proc.
>> >>>
>> >>>
>> >>> So I think you are simply seeing a result of the thundering herd of
>> >>> threads shutting down at once. Given that thread shutdown is fundamentally
>> >>> a slow path there is only so much that can be done.
>> >>>
>> >>> If you are up for a project to work through this thundering herd, I
>> >>> expect I can help some. It will be a long process of cleaning up
>> >>> the entire thread exit process with an eye to performance.
>> >> Wengang had some tests which produced wall-clock values for this problem,
>> >> which I agree is more informative.
>> >>
>> >> I'm not entirely sure what the customer workload is that requires a
>> >> highly threaded workload to also shut down quickly. To my mind, an
>> >> overall workload is normally composed of highly-threaded tasks that run
>> >> for a long time and only shut down rarely (thus performance of shutdown
>> >> is not important) and single-threaded tasks that run for a short time.
>> >
>> > The real workload is a Java application working in server-agent mode; the
>> > issue happened on the agent side, where all it does is wait for work
>> > dispatched from the server and execute it. To execute one piece of work,
>> > the agent starts lots of short-lived threads, so many threads can exit at
>> > the same time when there is a lot of work to execute; the contention on
>> > the exit path caused a high %sys time which impacted other workloads.
>>
>> If I understand correctly, the Java VM is not exiting. Just some of
>> its threads.
>>
>> That is a very different problem to deal with. There are many
>> optimizations that are possible when _all_ of the threads are exiting
>> that are not possible when _many_ threads are exiting.
>
> Ah! Now I get it. This explains why the dput() lock contention was
> so important. A new thread starting would block on that lock as it
> tried to create its new /proc/$pid/task/ directory.
>
> Terminating thousands of threads but not the entire process isn't going
> to hit many of the locks (eg exit_signal() and exit_mm() aren't going
> to be called). So we need a more sophisticated micro benchmark that is
> continually starting threads and asking dozens-to-thousands of them to
> stop at the same time. Otherwise we'll try to fix lots of scalability
> problems that our customer doesn't care about.

Has anyone come up with a more sophisticated microbenchmark or otherwise
made any progress in tracking this down further?

Eric
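
For anyone picking this up: a minimal sketch of the benchmark Matthew
describes -- the process stays alive while batches of threads are started
and then exit together -- could look like the following. The batch size,
round count, and the use of a POSIX barrier to line up the exits are
illustrative choices, not something specified in the thread.

// gcc -pthread -O2 -g -W -Wall
// Sketch: repeatedly start a batch of threads and have the whole batch
// exit at (nearly) the same instant, without the process exiting.
#include <pthread.h>

#define BATCH  1000     /* threads exiting together; illustrative */
#define ROUNDS 100      /* number of batches to run; illustrative */

static pthread_barrier_t barrier;

static void *worker(void *arg)
{
        (void)arg;
        /* Wait until the whole batch (plus main) is at the barrier,
         * then return, which exits the thread and tears down its
         * /proc/$pid/task/$tid entry. */
        pthread_barrier_wait(&barrier);
        return NULL;
}

int main(void)
{
        pthread_t tids[BATCH];
        int r, i;

        /* BATCH workers plus main() rendezvous at the barrier. */
        pthread_barrier_init(&barrier, NULL, BATCH + 1);

        for (r = 0; r < ROUNDS; r++) {
                for (i = 0; i < BATCH; i++)
                        pthread_create(&tids[i], NULL, worker, NULL);
                /* Release the batch: all BATCH threads exit at once. */
                pthread_barrier_wait(&barrier);
                for (i = 0; i < BATCH; i++)
                        pthread_join(tids[i], NULL);
        }

        return 0;
}

Overlapping the joins of one batch with the creation of the next would
additionally exercise the thread-start side, which Matthew points out
blocks on the same /proc/$pid/task/ dentry lock.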