2020-05-05 08:13:13

by SeongJae Park

Subject: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

From: SeongJae Park <[email protected]>

The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, as was already the
case for 'sock.wq'. The following commit 333f7909a857 ("coallocate
socket_wq with socket itself") then gave the two the same life cycle.

The changes made the code much simpler, but they also made 'socket_alloc'
live longer than before. As a result, user programs that intensively repeat
allocation and deallocation of sockets can cause memory pressure on recent
kernels.

To avoid the problem, this patchset reverts the changes.
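
For reference, the VFS-side difference between the two inode teardown hooks
is roughly the following (a trimmed paraphrase of destroy_inode() in
fs/inode.c, not an exact copy):

static void destroy_inode(struct inode *inode)
{
	const struct super_operations *ops = inode->i_sb->s_op;

	__destroy_inode(inode);
	if (ops->destroy_inode) {
		/* runs synchronously; sockfs used this before 6d7855c54e1e */
		ops->destroy_inode(inode);
		if (!ops->free_inode)
			return;
	}
	/* ->free_inode() runs only from the RCU callback, so with the two
	 * commits above the whole ~800-byte 'socket_alloc' stays allocated
	 * until a grace period has elapsed.
	 */
	inode->free_inode = ops->free_inode;
	call_rcu(&inode->i_rcu, i_callback);
}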

SeongJae Park (2):
  Revert "coallocate socket_wq with socket itself"
  Revert "sockfs: switch to ->free_inode()"

 drivers/net/tap.c      |  5 +++--
 drivers/net/tun.c      |  8 +++++---
 include/linux/if_tap.h |  1 +
 include/linux/net.h    |  4 ++--
 include/net/sock.h     |  4 ++--
 net/core/sock.c        |  2 +-
 net/socket.c           | 23 ++++++++++++++++-------
 7 files changed, 30 insertions(+), 17 deletions(-)

--
2.17.1


2020-05-05 08:13:20

by SeongJae Park

Subject: [PATCH net v2 1/2] Revert "coallocate socket_wq with socket itself"

From: SeongJae Park <[email protected]>

This reverts commit 333f7909a8573145811c4ab7d8c9092301707721.

The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, as was already the
case for 'sock.wq'. The following commit 333f7909a857 ("coallocate
socket_wq with socket itself") then gave the two the same life cycle.

The changes made the code much simpler, but they also made 'socket_alloc'
live longer than before. As a result, user programs that intensively repeat
allocation and deallocation of sockets can cause memory pressure on recent
kernels.

To avoid the problem, this commit separates the life cycles of
'socket_alloc' and 'sock.wq' again. The following commit will make the
deallocation of 'socket_alloc' synchronous again.

Fixes: 6d7855c54e1e ("sockfs: switch to ->free_inode()")
Fixes: 333f7909a857 ("coallocate socket_wq with socket itself")
Signed-off-by: SeongJae Park <[email protected]>
---
 drivers/net/tap.c      |  5 +++--
 drivers/net/tun.c      |  8 +++++---
 include/linux/if_tap.h |  1 +
 include/linux/net.h    |  4 ++--
 include/net/sock.h     |  4 ++--
 net/core/sock.c        |  2 +-
 net/socket.c           | 19 ++++++++++++++-----
 7 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1f4bdd94407a..7912039a4846 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -518,7 +518,8 @@ static int tap_open(struct inode *inode, struct file *file)
goto err;
}

- init_waitqueue_head(&q->sock.wq.wait);
+ RCU_INIT_POINTER(q->sock.wq, &q->wq);
+ init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -576,7 +577,7 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
goto out;

mask = 0;
- poll_wait(file, &q->sock.wq.wait, wait);
+ poll_wait(file, &q->wq.wait, wait);

if (!ptr_ring_empty(&q->ring))
mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 650c937ed56b..16a5f3b80edf 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,6 +160,7 @@ struct tun_pcpu_stats {
struct tun_file {
struct sock sk;
struct socket socket;
+ struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -2173,7 +2174,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
goto out;
}

- add_wait_queue(&tfile->socket.wq.wait, &wait);
+ add_wait_queue(&tfile->wq.wait, &wait);

while (1) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -2193,7 +2194,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
}

__set_current_state(TASK_RUNNING);
- remove_wait_queue(&tfile->socket.wq.wait, &wait);
+ remove_wait_queue(&tfile->wq.wait, &wait);

out:
*err = error;
@@ -3434,7 +3435,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->flags = 0;
tfile->ifindex = 0;

- init_waitqueue_head(&tfile->socket.wq.wait);
+ init_waitqueue_head(&tfile->wq.wait);
+ RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);

tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 915a187cfabd..8e66866c11be 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,6 +62,7 @@ struct tap_dev {
struct tap_queue {
struct sock sk;
struct socket sock;
+ struct socket_wq wq;
int vnet_hdr_sz;
struct tap_dev __rcu *tap;
struct file *file;
diff --git a/include/linux/net.h b/include/linux/net.h
index 6451425e828f..28c929bebb4a 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {

unsigned long flags;

+ struct socket_wq *wq;
+
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
-
- struct socket_wq wq;
};

struct vm_area_struct;
diff --git a/include/net/sock.h b/include/net/sock.h
index 328564525526..20799a333570 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1841,7 +1841,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
{
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
- rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ rcu_assign_pointer(sk->sk_wq, parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2119,7 +2119,7 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq.wait, p);
+ poll_wait(filp, &sock->wq->wait, p);
/* We need to be sure we are in sync with the
* socket flags modification.
*
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f71684305c3..7fa3241b5507 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2869,7 +2869,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)

if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
diff --git a/net/socket.c b/net/socket.c
index 2eecf1517f76..e274ae4b45e4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -249,13 +249,20 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;

ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- init_waitqueue_head(&ei->socket.wq.wait);
- ei->socket.wq.fasync_list = NULL;
- ei->socket.wq.flags = 0;
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
+ kmem_cache_free(sock_inode_cachep, ei);
+ return NULL;
+ }
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ wq->flags = 0;
+ ei->socket.wq = wq;

ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -271,6 +278,7 @@ static void sock_free_inode(struct inode *inode)
struct socket_alloc *ei;

ei = container_of(inode, struct socket_alloc, vfs_inode);
+ kfree(ei->socket.wq);
kmem_cache_free(sock_inode_cachep, ei);
}

@@ -610,7 +618,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}

- if (sock->wq.fasync_list)
+ if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);

if (!sock->file) {
@@ -1299,12 +1307,13 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
- struct socket_wq *wq = &sock->wq;
+ struct socket_wq *wq;

if (sk == NULL)
return -EINVAL;

lock_sock(sk);
+ wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);

if (!wq->fasync_list)
--
2.17.1

2020-05-05 08:13:52

by SeongJae Park

Subject: [PATCH net v2 2/2] Revert "sockfs: switch to ->free_inode()"

From: SeongJae Park <[email protected]>

This reverts commit 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18.

The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, as was already the
case for 'sock.wq'.

The change made 'socket_alloc' live longer than before. As a result, user
programs that intensively repeat allocation and deallocation of sockets can
cause memory pressure on recent kernels.

To avoid the problem, this commit reverts the change.

Fixes: 6d7855c54e1e ("sockfs: switch to ->free_inode()")
Fixes: 333f7909a857 ("coallocate socket_wq with socket itself")
Signed-off-by: SeongJae Park <[email protected]>
---
net/socket.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index e274ae4b45e4..27174021f47f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -273,12 +273,12 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
return &ei->vfs_inode;
}

-static void sock_free_inode(struct inode *inode)
+static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;

ei = container_of(inode, struct socket_alloc, vfs_inode);
- kfree(ei->socket.wq);
+ kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
}

@@ -303,7 +303,7 @@ static void init_inodecache(void)

static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
- .free_inode = sock_free_inode,
+ .destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};

--
2.17.1

2020-05-05 11:57:23

by SeongJae Park

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

CC-ing [email protected] and adding some more explanations.

On Tue, 5 May 2020 10:10:33 +0200 SeongJae Park <[email protected]> wrote:

> From: SeongJae Park <[email protected]>
>
> The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> socket_sq with socket itself") made those to have same life cycle.
>
> The changes made the code much more simple, but also made 'socket_alloc'
> live longer than before. For the reason, user programs intensively
> repeating allocations and deallocations of sockets could cause memory
> pressure on recent kernels.

I found this problem on a production virtual machine with 4GB of memory while
running lebench[1]. The 'poll big' test of lebench opens 1,000 sockets, polls
them, and closes them, and the test is repeated 10,000 times. It should
therefore use only 1,000 'socket_alloc' objects at any given time. As the size
of a socket_alloc is about 800 bytes, that is only about 800 KiB. On recent
kernels, however, up to 10,000,000 objects (about 8 GiB) can pile up waiting
to be freed. On the test machine, I confirmed it consumed about 4GB of the
system memory and resulted in OOM.

[1] https://github.com/LinuxPerfStudy/LEBench
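
For clarity, the arithmetic behind those numbers:

  1,000 sockets alive at once * ~800 B/socket_alloc  ~=  800 KiB
  10,000 iterations * 1,000 sockets                   =  10,000,000 objects (worst case, all deferred)
  10,000,000 objects * ~800 B                         ~=  8 GiB pending deallocation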

>
> To avoid the problem, this commit reverts the changes.

I also tried to write a fixup rather than reverts, but I couldn't find a
simple one. As commits 6d7855c54e1e and 333f7909a857 were code refactoring
rather than performance optimization, I thought introducing a complex fixup
for this problem would make no sense. Meanwhile, the memory pressure
regression could affect real machines. For this reason, I decided to revert
the commits quickly first and to consider better refactoring later.


Thanks,
SeongJae Park

>
> SeongJae Park (2):
> Revert "coallocate socket_wq with socket itself"
> Revert "sockfs: switch to ->free_inode()"
>
> drivers/net/tap.c | 5 +++--
> drivers/net/tun.c | 8 +++++---
> include/linux/if_tap.h | 1 +
> include/linux/net.h | 4 ++--
> include/net/sock.h | 4 ++--
> net/core/sock.c | 2 +-
> net/socket.c | 23 ++++++++++++++++-------
> 7 files changed, 30 insertions(+), 17 deletions(-)
>
> --
> 2.17.1

2020-05-05 12:33:50

by Nuernberger, Stefan

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 2020-05-05 at 13:54 +0200, SeongJae Park wrote:
> CC-ing [email protected] and adding some more explanations.
>
> On Tue, 5 May 2020 10:10:33 +0200 SeongJae Park <[email protected]>
> wrote:
>
> >
> > From: SeongJae Park <[email protected]>
> >
> > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made
> > the
> > deallocation of 'socket_alloc' to be done asynchronously using RCU,
> > as
> > same to 'sock.wq'.  And the following commit 333f7909a857
> > ("coallocate
> > socket_sq with socket itself") made those to have same life cycle.
> >
> > The changes made the code much more simple, but also made
> > 'socket_alloc'
> > live longer than before.  For the reason, user programs intensively
> > repeating allocations and deallocations of sockets could cause
> > memory
> > pressure on recent kernels.
> I found this problem on a production virtual machine utilizing 4GB
> memory while
> running lebench[1].  The 'poll big' test of lebench opens 1000
> sockets, polls
> and closes those.  This test is repeated 10,000 times.  Therefore it
> should
> consume only 1000 'socket_alloc' objects at once.  As size of
> socket_alloc is
> about 800 Bytes, it's only 800 KiB.  However, on the recent kernels,
> it could
> consume up to 10,000,000 objects (about 8 GiB).  On the test machine,
> I
> confirmed it consuming about 4GB of the system memory and results in
> OOM.
>
> [1] https://github.com/LinuxPerfStudy/LEBench
>
> >
> >
> > To avoid the problem, this commit reverts the changes.
> I also tried to make fixup rather than reverts, but I couldn't easily
> find
> simple fixup.  As the commits 6d7855c54e1e and 333f7909a857 were for
> code
> refactoring rather than performance optimization, I thought
> introducing complex
> fixup for this problem would make no sense.  Meanwhile, the memory
> pressure
> regression could affect real machines.  To this end, I decided to
> quickly
> revert the commits first and consider better refactoring later.
>

While lebench might be exercising a rather pathological case, the
increase in memory pressure is real. I am concerned that the OOM killer
is actually engaging and killing off processes when there are lots of
resources already marked for release. This might be true for other
lazy/delayed resource deallocation, too. This has obviously just become
too lazy currently.

So for both reverts:

Reviewed-by: Stefan Nuernberger <[email protected]>

>
> Thanks,
> SeongJae Park
>
> >
> >
> > SeongJae Park (2):
> >   Revert "coallocate socket_wq with socket itself"
> >   Revert "sockfs: switch to ->free_inode()"
> >
> >  drivers/net/tap.c      |  5 +++--
> >  drivers/net/tun.c      |  8 +++++---
> >  include/linux/if_tap.h |  1 +
> >  include/linux/net.h    |  4 ++--
> >  include/net/sock.h     |  4 ++--
> >  net/core/sock.c        |  2 +-
> >  net/socket.c           | 23 ++++++++++++++++-------
> >  7 files changed, 30 insertions(+), 17 deletions(-)
> >



Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879


2020-05-05 14:55:44

by Eric Dumazet

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 5, 2020 at 4:54 AM SeongJae Park <[email protected]> wrote:
>
> CC-ing [email protected] and adding some more explanations.
>
> On Tue, 5 May 2020 10:10:33 +0200 SeongJae Park <[email protected]> wrote:
>
> > From: SeongJae Park <[email protected]>
> >
> > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> > deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> > same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> > socket_sq with socket itself") made those to have same life cycle.
> >
> > The changes made the code much more simple, but also made 'socket_alloc'
> > live longer than before. For the reason, user programs intensively
> > repeating allocations and deallocations of sockets could cause memory
> > pressure on recent kernels.
>
> I found this problem on a production virtual machine utilizing 4GB memory while
> running lebench[1]. The 'poll big' test of lebench opens 1000 sockets, polls
> and closes those. This test is repeated 10,000 times. Therefore it should
> consume only 1000 'socket_alloc' objects at once. As size of socket_alloc is
> about 800 Bytes, it's only 800 KiB. However, on the recent kernels, it could
> consume up to 10,000,000 objects (about 8 GiB). On the test machine, I
> confirmed it consuming about 4GB of the system memory and results in OOM.
>
> [1] https://github.com/LinuxPerfStudy/LEBench

To be fair, I have not backported Al's patches to Google production
kernels, nor have I tried this benchmark.

Why do we have 10,000,000 objects around ? Could this be because of
some RCU problem ?

Once Al's patches are reverted, do you still have 10,000,000 sock_alloc around ?

Thanks.

>
> >
> > To avoid the problem, this commit reverts the changes.
>
> I also tried to make fixup rather than reverts, but I couldn't easily find
> simple fixup. As the commits 6d7855c54e1e and 333f7909a857 were for code
> refactoring rather than performance optimization, I thought introducing complex
> fixup for this problem would make no sense. Meanwhile, the memory pressure
> regression could affect real machines. To this end, I decided to quickly
> revert the commits first and consider better refactoring later.
>
>
> Thanks,
> SeongJae Park
>
> >
> > SeongJae Park (2):
> > Revert "coallocate socket_wq with socket itself"
> > Revert "sockfs: switch to ->free_inode()"
> >
> > drivers/net/tap.c | 5 +++--
> > drivers/net/tun.c | 8 +++++---
> > include/linux/if_tap.h | 1 +
> > include/linux/net.h | 4 ++--
> > include/net/sock.h | 4 ++--
> > net/core/sock.c | 2 +-
> > net/socket.c | 23 ++++++++++++++++-------
> > 7 files changed, 30 insertions(+), 17 deletions(-)
> >
> > --
> > 2.17.1

2020-05-05 15:13:55

by SeongJae Park

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:

> On Tue, May 5, 2020 at 4:54 AM SeongJae Park <[email protected]> wrote:
> >
> > CC-ing [email protected] and adding some more explanations.
> >
> > On Tue, 5 May 2020 10:10:33 +0200 SeongJae Park <[email protected]> wrote:
> >
> > > From: SeongJae Park <[email protected]>
> > >
> > > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> > > deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> > > same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> > > socket_sq with socket itself") made those to have same life cycle.
> > >
> > > The changes made the code much more simple, but also made 'socket_alloc'
> > > live longer than before. For the reason, user programs intensively
> > > repeating allocations and deallocations of sockets could cause memory
> > > pressure on recent kernels.
> >
> > I found this problem on a production virtual machine utilizing 4GB memory while
> > running lebench[1]. The 'poll big' test of lebench opens 1000 sockets, polls
> > and closes those. This test is repeated 10,000 times. Therefore it should
> > consume only 1000 'socket_alloc' objects at once. As size of socket_alloc is
> > about 800 Bytes, it's only 800 KiB. However, on the recent kernels, it could
> > consume up to 10,000,000 objects (about 8 GiB). On the test machine, I
> > confirmed it consuming about 4GB of the system memory and results in OOM.
> >
> > [1] https://github.com/LinuxPerfStudy/LEBench
>
> To be fair, I have not backported Al patches to Google production
> kernels, nor I have tried this benchmark.
>
> Why do we have 10,000,000 objects around ? Could this be because of
> some RCU problem ?

Mainly because of a long RCU grace period, as you guessed. I have no idea how
the grace period became so long in this case.

As my test machine was a virtual machine instance, I guess a problem like
RCU reader preemption[1] might have affected this.

[1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf

>
> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?

Yes, neither the old kernel from before Al's patches nor the recent kernel
with Al's patches reverted reproduced the problem.


Thanks,
SeongJae Park

>
> Thanks.
>
> >
> > >
> > > To avoid the problem, this commit reverts the changes.
> >
> > I also tried to make fixup rather than reverts, but I couldn't easily find
> > simple fixup. As the commits 6d7855c54e1e and 333f7909a857 were for code
> > refactoring rather than performance optimization, I thought introducing complex
> > fixup for this problem would make no sense. Meanwhile, the memory pressure
> > regression could affect real machines. To this end, I decided to quickly
> > revert the commits first and consider better refactoring later.
> >
> >
> > Thanks,
> > SeongJae Park
> >
> > >
> > > SeongJae Park (2):
> > > Revert "coallocate socket_wq with socket itself"
> > > Revert "sockfs: switch to ->free_inode()"
> > >
> > > drivers/net/tap.c | 5 +++--
> > > drivers/net/tun.c | 8 +++++---
> > > include/linux/if_tap.h | 1 +
> > > include/linux/net.h | 4 ++--
> > > include/net/sock.h | 4 ++--
> > > net/core/sock.c | 2 +-
> > > net/socket.c | 23 ++++++++++++++++-------
> > > 7 files changed, 30 insertions(+), 17 deletions(-)
> > >
> > > --
> > > 2.17.1

2020-05-05 15:25:17

by Eric Dumazet

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change



On 5/5/20 8:07 AM, SeongJae Park wrote:
> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
>

>> Why do we have 10,000,000 objects around ? Could this be because of
>> some RCU problem ?
>
> Mainly because of a long RCU grace period, as you guess. I have no idea how
> the grace period became so long in this case.
>
> As my test machine was a virtual machine instance, I guess RCU readers
> preemption[1] like problem might affected this.
>
> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
>
>>
>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
>
> Yes, both the old kernel that prior to Al's patches and the recent kernel
> reverting the Al's patches didn't reproduce the problem.
>

I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?

TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
object that was allocated in sock_alloc_inode() before Al patches.

These objects should be visible in kmalloc-64 kmem cache.
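
To illustrate the distinction (a minimal sketch; 'foo', 'foo_cachep' and 'obj'
are made-up names): with SLAB_TYPESAFE_BY_RCU only the backing slab pages wait
for a grace period and a freed slot may be reused immediately for a new object
of the same type, whereas the separately allocated socket_wq was deferred per
object.

	/* type-stable cache, the scheme used for 'struct sock' */
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
				       SLAB_TYPESAFE_BY_RCU, NULL);
	kmem_cache_free(foo_cachep, obj);	/* slot reusable right away */

	/* per-object deferral, as the old socket_wq did in sock_destroy_inode():
	 * each object waits out a full grace period before being kfree()d */
	kfree_rcu(wq, rcu);			/* 'rcu' is the rcu_head in socket_wq */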

2020-05-05 15:49:31

by SeongJae Park

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:

>
>
> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> >
>
> >> Why do we have 10,000,000 objects around ? Could this be because of
> >> some RCU problem ?
> >
> > Mainly because of a long RCU grace period, as you guess. I have no idea how
> > the grace period became so long in this case.
> >
> > As my test machine was a virtual machine instance, I guess RCU readers
> > preemption[1] like problem might affected this.
> >
> > [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> >
> >>
> >> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> >
> > Yes, both the old kernel that prior to Al's patches and the recent kernel
> > reverting the Al's patches didn't reproduce the problem.
> >
>
> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
>
> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> object that was allocated in sock_alloc_inode() before Al patches.
>
> These objects should be visible in kmalloc-64 kmem cache.

Not exactly 10,000,000, as that is only the highest possible number, but I
was able to observe a clear exponential increase in the number of objects
using slabtop. Before the problematic workload started, the number of
'kmalloc-64' objects was 5,760, and I observed it grow to 1,136,576.

          OBJS   ACTIVE  USE  OBJ SIZE  SLABS  OBJ/SLAB  CACHE SIZE  NAME
before:   5760     5088  88%     0.06K     90        64        360K  kmalloc-64
after: 1136576  1136576 100%     0.06K  17759        64      71036K  kmalloc-64


Thanks,
SeongJae Park

2020-05-05 16:02:55

by Eric Dumazet

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
>
> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
>
> >
> >
> > On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > >
> >
> > >> Why do we have 10,000,000 objects around ? Could this be because of
> > >> some RCU problem ?
> > >
> > > Mainly because of a long RCU grace period, as you guess. I have no idea how
> > > the grace period became so long in this case.
> > >
> > > As my test machine was a virtual machine instance, I guess RCU readers
> > > preemption[1] like problem might affected this.
> > >
> > > [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> > >
> > >>
> > >> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > >
> > > Yes, both the old kernel that prior to Al's patches and the recent kernel
> > > reverting the Al's patches didn't reproduce the problem.
> > >
> >
> > I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> >
> > TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > object that was allocated in sock_alloc_inode() before Al patches.
> >
> > These objects should be visible in kmalloc-64 kmem cache.
>
> Not exactly the 10,000,000, as it is only the possible highest number, but I
> was able to observe clear exponential increase of the number of the objects
> using slabtop. Before the start of the problematic workload, the number of
> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> to 1,136,576.
>
> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
>

Great, thanks.

How recent is the kernel you are running for your experiment ?

Let's make sure the bug is not in RCU.

After Al's changes, RCU got slightly better under stress.

2020-05-05 16:18:29

by SeongJae Park

Subject: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:

> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> >
> > On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> >
> > >
> > >
> > > On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > >
> > >
> > > >> Why do we have 10,000,000 objects around ? Could this be because of
> > > >> some RCU problem ?
> > > >
> > > > Mainly because of a long RCU grace period, as you guess. I have no idea how
> > > > the grace period became so long in this case.
> > > >
> > > > As my test machine was a virtual machine instance, I guess RCU readers
> > > > preemption[1] like problem might affected this.
> > > >
> > > > [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> > > >
> > > >>
> > > >> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > > >
> > > > Yes, both the old kernel that prior to Al's patches and the recent kernel
> > > > reverting the Al's patches didn't reproduce the problem.
> > > >
> > >
> > > I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > >
> > > TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > > object that was allocated in sock_alloc_inode() before Al patches.
> > >
> > > These objects should be visible in kmalloc-64 kmem cache.
> >
> > Not exactly the 10,000,000, as it is only the possible highest number, but I
> > was able to observe clear exponential increase of the number of the objects
> > using slabtop. Before the start of the problematic workload, the number of
> > objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > to 1,136,576.
> >
> > OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> >
>
> Great, thanks.
>
> How recent is the kernel you are running for your experiment ?

It's based on 5.4.35.

>
> Let's make sure the bug is not in RCU.

One thing I can currently say is that the grace period does pass eventually. I
modified the benchmark to repeat only 5,000 times instead of 10,000, so that
the test runs without OOM but with easily observable memory pressure. As soon
as the benchmark finished, the memory was freed.

If you need more tests, please let me know.


Thanks,
SeongJae Park

>
> After Al changes, RCU got slightly better under stress.
>

2020-05-05 16:27:06

by Eric Dumazet

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change



On 5/5/20 9:13 AM, SeongJae Park wrote:
> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
>
>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
>>>
>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
>>>
>>>>
>>>>
>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
>>>>>
>>>>
>>>>>> Why do we have 10,000,000 objects around ? Could this be because of
>>>>>> some RCU problem ?
>>>>>
>>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
>>>>> the grace period became so long in this case.
>>>>>
>>>>> As my test machine was a virtual machine instance, I guess RCU readers
>>>>> preemption[1] like problem might affected this.
>>>>>
>>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
>>>>>
>>>>>>
>>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
>>>>>
>>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
>>>>> reverting the Al's patches didn't reproduce the problem.
>>>>>
>>>>
>>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
>>>>
>>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
>>>> object that was allocated in sock_alloc_inode() before Al patches.
>>>>
>>>> These objects should be visible in kmalloc-64 kmem cache.
>>>
>>> Not exactly the 10,000,000, as it is only the possible highest number, but I
>>> was able to observe clear exponential increase of the number of the objects
>>> using slabtop. Before the start of the problematic workload, the number of
>>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
>>> to 1,136,576.
>>>
>>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
>>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
>>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
>>>
>>
>> Great, thanks.
>>
>> How recent is the kernel you are running for your experiment ?
>
> It's based on 5.4.35.
>
>>
>> Let's make sure the bug is not in RCU.
>
> One thing I can currently say is that the grace period passes at last. I
> modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> the test without OOM but easily observable memory pressure. As soon as the
> benchmark finishes, the memory were freed.
>
> If you need more tests, please let me know.
>

I would ask Paul's opinion on this issue, because we have many objects
being freed after RCU grace periods.

If the RCU subsystem cannot keep up, I guess other workloads will also suffer.

Sure, we can revert patches here and there trying to work around the issue,
but for objects allocated from process context, we should not have these problems.



2020-05-05 16:29:33

by Al Viro

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 09:00:44AM -0700, Eric Dumazet wrote:

> > Not exactly the 10,000,000, as it is only the possible highest number, but I
> > was able to observe clear exponential increase of the number of the objects
> > using slabtop. Before the start of the problematic workload, the number of
> > objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > to 1,136,576.
> >
> > OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> >
>
> Great, thanks.
>
> How recent is the kernel you are running for your experiment ?
>
> Let's make sure the bug is not in RCU.
>
> After Al changes, RCU got slightly better under stress.

The thing that worries me here is that this is far from being the only
source of RCU-delayed freeing of objects. If we really see bogus OOM
kills due to that (IRL, not in an artificial microbenchmark), we'd
better do something that would help with all those sources, not just
paper over the contributions from one of those. Because there's no
chance in hell to get rid of RCU-delayed freeing in general...

Does the problem extend to kfree_rcu()? And there are a lot of RCU
callbacks that boil down to kmem_cache_free(); those really look like
they should have exactly the same issue - sock_free_inode() is one of those,
after all.
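
For illustration, the pattern in question looks like this ('struct foo',
'foo_cachep' and 'foo_free_rcu' are made-up names, not actual kernel
symbols):

	static void foo_free_rcu(struct rcu_head *head)
	{
		struct foo *obj = container_of(head, struct foo, rcu_head);

		/* runs only after a grace period, just like kfree_rcu() */
		kmem_cache_free(foo_cachep, obj);
	}

	/* instead of freeing synchronously, queue the object: */
	call_rcu(&obj->rcu_head, foo_free_rcu);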

2020-05-05 16:32:56

by Eric Dumazet

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change



On 5/5/20 9:25 AM, Eric Dumazet wrote:
>
>
> On 5/5/20 9:13 AM, SeongJae Park wrote:
>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
>>
>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
>>>>
>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
>>>>
>>>>>
>>>>>
>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
>>>>>>
>>>>>
>>>>>>> Why do we have 10,000,000 objects around ? Could this be because of
>>>>>>> some RCU problem ?
>>>>>>
>>>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
>>>>>> the grace period became so long in this case.
>>>>>>
>>>>>> As my test machine was a virtual machine instance, I guess RCU readers
>>>>>> preemption[1] like problem might affected this.
>>>>>>
>>>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
>>>>>>
>>>>>>>
>>>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
>>>>>>
>>>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
>>>>>> reverting the Al's patches didn't reproduce the problem.
>>>>>>
>>>>>
>>>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
>>>>>
>>>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
>>>>> object that was allocated in sock_alloc_inode() before Al patches.
>>>>>
>>>>> These objects should be visible in kmalloc-64 kmem cache.
>>>>
>>>> Not exactly the 10,000,000, as it is only the possible highest number, but I
>>>> was able to observe clear exponential increase of the number of the objects
>>>> using slabtop. Before the start of the problematic workload, the number of
>>>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
>>>> to 1,136,576.
>>>>
>>>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
>>>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
>>>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
>>>>
>>>
>>> Great, thanks.
>>>
>>> How recent is the kernel you are running for your experiment ?
>>
>> It's based on 5.4.35.
>>
>>>
>>> Let's make sure the bug is not in RCU.
>>
>> One thing I can currently say is that the grace period passes at last. I
>> modified the benchmark to repeat not 10,000 times but only 5,000 times to run
>> the test without OOM but easily observable memory pressure. As soon as the
>> benchmark finishes, the memory were freed.
>>
>> If you need more tests, please let me know.
>>
>
> I would ask Paul opinion on this issue, because we have many objects
> being freed after RCU grace periods.
>
> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
>
> Sure, we can revert patches there and there trying to work around the issue,
> but for objects allocated from process context, we should not have these problems.
>

I wonder if simply adjusting rcu_divisor to 6 or 5 would help

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
-static int rcu_divisor = 7;
+static int rcu_divisor = 6;
module_param(rcu_divisor, int, 0644);

/* Force an exit from rcu_do_batch() after 3 milliseconds. */
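
(If I read kernel/rcu/tree.c of that era correctly, the divisor bounds how
many of the pending callbacks a single rcu_do_batch() pass may invoke,
roughly as sketched below, so a smaller divisor lets each pass work through
a larger share of the backlog.)

	/* rough sketch of the heuristic in rcu_do_batch() */
	bl = max(rdp->blimit, pending >> READ_ONCE(rcu_divisor));
	/* ... at most 'bl' callbacks are invoked in this pass ... */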

2020-05-05 16:39:49

by Eric Dumazet

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change



On 5/5/20 9:31 AM, Eric Dumazet wrote:
>
>
> On 5/5/20 9:25 AM, Eric Dumazet wrote:
>>
>>
>> On 5/5/20 9:13 AM, SeongJae Park wrote:
>>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
>>>
>>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
>>>>>
>>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
>>>>>
>>>>>>
>>>>>>
>>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
>>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
>>>>>>>
>>>>>>
>>>>>>>> Why do we have 10,000,000 objects around ? Could this be because of
>>>>>>>> some RCU problem ?
>>>>>>>
>>>>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
>>>>>>> the grace period became so long in this case.
>>>>>>>
>>>>>>> As my test machine was a virtual machine instance, I guess RCU readers
>>>>>>> preemption[1] like problem might affected this.
>>>>>>>
>>>>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
>>>>>>>
>>>>>>>>
>>>>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
>>>>>>>
>>>>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
>>>>>>> reverting the Al's patches didn't reproduce the problem.
>>>>>>>
>>>>>>
>>>>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
>>>>>>
>>>>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
>>>>>> object that was allocated in sock_alloc_inode() before Al patches.
>>>>>>
>>>>>> These objects should be visible in kmalloc-64 kmem cache.
>>>>>
>>>>> Not exactly the 10,000,000, as it is only the possible highest number, but I
>>>>> was able to observe clear exponential increase of the number of the objects
>>>>> using slabtop. Before the start of the problematic workload, the number of
>>>>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
>>>>> to 1,136,576.
>>>>>
>>>>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
>>>>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
>>>>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
>>>>>
>>>>
>>>> Great, thanks.
>>>>
>>>> How recent is the kernel you are running for your experiment ?
>>>
>>> It's based on 5.4.35.
>>>
>>>>
>>>> Let's make sure the bug is not in RCU.
>>>
>>> One thing I can currently say is that the grace period passes at last. I
>>> modified the benchmark to repeat not 10,000 times but only 5,000 times to run
>>> the test without OOM but easily observable memory pressure. As soon as the
>>> benchmark finishes, the memory were freed.
>>>
>>> If you need more tests, please let me know.
>>>
>>
>> I would ask Paul opinion on this issue, because we have many objects
>> being freed after RCU grace periods.
>>
>> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
>>
>> Sure, we can revert patches there and there trying to work around the issue,
>> but for objects allocated from process context, we should not have these problems.
>>
>
> I wonder if simply adjusting rcu_divisor to 6 or 5 would help
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> static ulong jiffies_till_first_fqs = ULONG_MAX;
> static ulong jiffies_till_next_fqs = ULONG_MAX;
> static bool rcu_kick_kthreads;
> -static int rcu_divisor = 7;
> +static int rcu_divisor = 6;
> module_param(rcu_divisor, int, 0644);
>
> /* Force an exit from rcu_do_batch() after 3 milliseconds. */
>

To be clear, you can adjust the value without building a new kernel.

echo 6 >/sys/module/rcutree/parameters/rcu_divisor


2020-05-05 17:08:48

by SeongJae Park

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:

>
>
> On 5/5/20 9:31 AM, Eric Dumazet wrote:
> >
> >
> > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> >>
> >>
> >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> >>>
> >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> >>>>>
> >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> >>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> >>>>>>>
> >>>>>>
[...]
> >>
> >> I would ask Paul opinion on this issue, because we have many objects
> >> being freed after RCU grace periods.
> >>
> >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> >>
> >> Sure, we can revert patches there and there trying to work around the issue,
> >> but for objects allocated from process context, we should not have these problems.
> >>
> >
> > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > static bool rcu_kick_kthreads;
> > -static int rcu_divisor = 7;
> > +static int rcu_divisor = 6;
> > module_param(rcu_divisor, int, 0644);
> >
> > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> >
>
> To be clear, you can adjust the value without building a new kernel.
>
> echo 6 >/sys/module/rcutree/parameters/rcu_divisor
>

I tried values 6, 5, and 4, but none of them removed the problem.


Thanks,
SeongJae Park

2020-05-05 17:25:48

by Paul E. McKenney

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 09:25:06AM -0700, Eric Dumazet wrote:
>
>
> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> >
> >> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> >>>
> >>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> >>>
> >>>>
> >>>>
> >>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> >>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> >>>>>
> >>>>
> >>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> >>>>>> some RCU problem ?
> >>>>>
> >>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> >>>>> the grace period became so long in this case.
> >>>>>
> >>>>> As my test machine was a virtual machine instance, I guess RCU readers
> >>>>> preemption[1] like problem might affected this.
> >>>>>
> >>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf

If this is the root cause of the problem, then it will be necessary to
provide a hint to the hypervisor. Or, in the near term, avoid loading
the hypervisor to the point that vCPU preemption is so lengthy.

RCU could also provide some sort of pre-stall-warning notification that
some of the CPUs aren't passing through quiescent states, which might
allow the guest OS's userspace to take corrective action.

But first, what are you doing to either confirm or invalidate the
hypothesis that this might be due to vCPU preemption?

> >>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> >>>>>
> >>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> >>>>> reverting the Al's patches didn't reproduce the problem.
> >>>>>
> >>>>
> >>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> >>>>
> >>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> >>>> object that was allocated in sock_alloc_inode() before Al patches.
> >>>>
> >>>> These objects should be visible in kmalloc-64 kmem cache.
> >>>
> >>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> >>> was able to observe clear exponential increase of the number of the objects
> >>> using slabtop. Before the start of the problematic workload, the number of
> >>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> >>> to 1,136,576.
> >>>
> >>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> >>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> >>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> >>>
> >>
> >> Great, thanks.
> >>
> >> How recent is the kernel you are running for your experiment ?
> >
> > It's based on 5.4.35.

Is it possible to retest on v5.6? I have been adding various mechanisms
to make RCU keep up better with heavy callback overload.

Also, could you please provide the .config? If either NO_HZ_FULL or
RCU_NOCB_CPU is set, please also provide the kernel boot parameters.

> >> Let's make sure the bug is not in RCU.
> >
> > One thing I can currently say is that the grace period passes at last. I
> > modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > the test without OOM but easily observable memory pressure. As soon as the
> > benchmark finishes, the memory were freed.
> >
> > If you need more tests, please let me know.
>
> I would ask Paul opinion on this issue, because we have many objects
> being freed after RCU grace periods.

As always, "It depends."

o   If the problem is a too-long RCU reader, RCU is prohibited from
    ending the grace period. The reader duration must be shortened,
    and until it is shortened, there is nothing RCU can do.

o   In some special cases of the above, RCU can and does help, for
    example, by enlisting the aid of cond_resched(). So perhaps
    there is a long in-kernel loop that needs a cond_resched().

    And perhaps RCU can help for some types of vCPU preemption.

o   As Al suggested offline and as has been discussed in the past,
    it would not be hard to cause RCU to burn CPU to attain faster
    grace periods during OOM events. This could be helpful, but only
    given that RCU readers are completing in reasonable timeframes.

> If RCU subsystem can not keep-up, I guess other workloads will also suffer.

If readers are not excessively long, RCU should be able to keep up.
(In the absence of misconfigurations, for example, both NO_HZ_FULL and
then binding all the rcuo kthreads to a single CPU on a 100-CPU system
or some such.)

> Sure, we can revert patches there and there trying to work around the issue,
> but for objects allocated from process context, we should not have these problems.

Agreed, let's get more info on what is happening to RCU.

One approach is to shorten the RCU CPU stall warning timeout
(rcupdate.rcu_cpu_stall_timeout=10 for 10 seconds).

Thanx, Paul

2020-05-05 17:31:05

by Paul E. McKenney

Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 09:37:42AM -0700, Eric Dumazet wrote:
>
>
> On 5/5/20 9:31 AM, Eric Dumazet wrote:
> >
> >
> > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> >>
> >>
> >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> >>>
> >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> >>>>>
> >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> >>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> >>>>>>>
> >>>>>>
> >>>>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> >>>>>>>> some RCU problem ?
> >>>>>>>
> >>>>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> >>>>>>> the grace period became so long in this case.
> >>>>>>>
> >>>>>>> As my test machine was a virtual machine instance, I guess RCU readers
> >>>>>>> preemption[1] like problem might affected this.
> >>>>>>>
> >>>>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> >>>>>>>
> >>>>>>>>
> >>>>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> >>>>>>>
> >>>>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> >>>>>>> reverting the Al's patches didn't reproduce the problem.
> >>>>>>>
> >>>>>>
> >>>>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> >>>>>>
> >>>>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> >>>>>> object that was allocated in sock_alloc_inode() before Al patches.
> >>>>>>
> >>>>>> These objects should be visible in kmalloc-64 kmem cache.
> >>>>>
> >>>>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> >>>>> was able to observe clear exponential increase of the number of the objects
> >>>>> using slabtop. Before the start of the problematic workload, the number of
> >>>>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> >>>>> to 1,136,576.
> >>>>>
> >>>>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> >>>>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> >>>>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> >>>>>
> >>>>
> >>>> Great, thanks.
> >>>>
> >>>> How recent is the kernel you are running for your experiment ?
> >>>
> >>> It's based on 5.4.35.
> >>>
> >>>>
> >>>> Let's make sure the bug is not in RCU.
> >>>
> >>> One thing I can currently say is that the grace period passes at last. I
> >>> modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> >>> the test without OOM but easily observable memory pressure. As soon as the
> >>> benchmark finishes, the memory were freed.
> >>>
> >>> If you need more tests, please let me know.
> >>>
> >>
> >> I would ask Paul opinion on this issue, because we have many objects
> >> being freed after RCU grace periods.
> >>
> >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> >>
> >> Sure, we can revert patches there and there trying to work around the issue,
> >> but for objects allocated from process context, we should not have these problems.
> >>
> >
> > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > static bool rcu_kick_kthreads;
> > -static int rcu_divisor = 7;
> > +static int rcu_divisor = 6;
> > module_param(rcu_divisor, int, 0644);
> >
> > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> >
>
> To be clear, you can adjust the value without building a new kernel.
>
> echo 6 >/sys/module/rcutree/parameters/rcu_divisor

Worth a try! If that helps significantly, I have some ideas for updating
that heuristic, such as checking for sudden increases in the number of
pending callbacks.

But I would really also like to know whether there are long readers and
whether v5.6 fares better.

Thanx, Paul

2020-05-05 17:32:30

by Paul E. McKenney

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 07:05:53PM +0200, SeongJae Park wrote:
> On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:
>
> >
> >
> > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > >
> > >
> > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > >>
> > >>
> > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>
> > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > >>>>>
> > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>>>
> > >>>>>>
> > >>>>>>
> > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>>>>>
> > >>>>>>
> [...]
> > >>
> > >> I would ask Paul opinion on this issue, because we have many objects
> > >> being freed after RCU grace periods.
> > >>
> > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > >>
> > >> Sure, we can revert patches there and there trying to work around the issue,
> > >> but for objects allocated from process context, we should not have these problems.
> > >>
> > >
> > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > static bool rcu_kick_kthreads;
> > > -static int rcu_divisor = 7;
> > > +static int rcu_divisor = 6;
> > > module_param(rcu_divisor, int, 0644);
> > >
> > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > >
> >
> > To be clear, you can adjust the value without building a new kernel.
> >
> > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
>
> I tried value 6, 5, and 4, but none of those removed the problem.

Thank you for checking this!

Was your earlier discussion on long RCU readers speculation, or do you
have measurements?

Thanx, Paul

2020-05-05 17:52:03

by SeongJae Park

Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 10:23:58 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Tue, May 05, 2020 at 09:25:06AM -0700, Eric Dumazet wrote:
> >
> >
> > On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > >
> > >> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > >>>
> > >>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>
> > >>>>
> > >>>>
> > >>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > >>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>>>
> > >>>>
> > >>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> > >>>>>> some RCU problem ?
> > >>>>>
> > >>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> > >>>>> the grace period became so long in this case.
> > >>>>>
> > >>>>> As my test machine was a virtual machine instance, I guess RCU readers
> > >>>>> preemption[1] like problem might affected this.
> > >>>>>
> > >>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
>
> If this is the root cause of the problem, then it will be necessary to
> provide a hint to the hypervisor. Or, in the near term, avoid loading
> the hypervisor the point that vCPU preemption is so lengthy.
>
> RCU could also provide some sort of pre-stall-warning notification that
> some of the CPUs aren't passing through quiescent states, which might
> allow the guest OS's userspace to take corrective action.
>
> But first, what are you doing to either confirm or invalidate the
> hypothesis that this might be due to vCPU preemption?

Nothing, I was just guessing. Sorry if this made you confused.

>
> > >>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > >>>>>
> > >>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> > >>>>> reverting the Al's patches didn't reproduce the problem.
> > >>>>>
> > >>>>
> > >>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > >>>>
> > >>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > >>>> object that was allocated in sock_alloc_inode() before Al patches.
> > >>>>
> > >>>> These objects should be visible in kmalloc-64 kmem cache.
> > >>>
> > >>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> > >>> was able to observe clear exponential increase of the number of the objects
> > >>> using slabtop. Before the start of the problematic workload, the number of
> > >>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > >>> to 1,136,576.
> > >>>
> > >>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > >>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > >>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> > >>>
> > >>
> > >> Great, thanks.
> > >>
> > >> How recent is the kernel you are running for your experiment ?
> > >
> > > It's based on 5.4.35.
>
> Is it possible to retest on v5.6? I have been adding various mechanisms
> to make RCU keep up better with heavy callback overload.

I will try soon!

>
> Also, could you please provide the .config? If either NO_HZ_FULL or
> RCU_NOCB_CPU, please also provide the kernel boot parameters.

NO_HZ_FULL is not set, but RCU_NOCB_CPU is y.

I think I should check whether it's ok to share the full config and boot
parameters. Please wait for this.

>
> > >> Let's make sure the bug is not in RCU.
> > >
> > > One thing I can currently say is that the grace period passes at last. I
> > > modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > > the test without OOM but easily observable memory pressure. As soon as the
> > > benchmark finishes, the memory were freed.
> > >
> > > If you need more tests, please let me know.
> >
> > I would ask Paul opinion on this issue, because we have many objects
> > being freed after RCU grace periods.
>
> As always, "It depends."
>
> o If the problem is a too-long RCU reader, RCU is prohibited from
> ending the grace period. The reader duration must be shortened,
> and until it is shortened, there is nothing RCU can do.
>
> o In some special cases of the above, RCU can and does help, for
> example, by enlisting the aid of cond_resched(). So perhaps
> there is a long in-kernel loop that needs a cond_resched().
>
> And perhaps RCU can help for some types of vCPU preemption.
>
> o As Al suggested offline and as has been discussed in the past,
> it would not be hard to cause RCU to burn CPU to attain faster
> grace periods during OOM events. This could be helpful, but only
> given that RCU readers are completing in reasonable timeframes.

Totally agreed.

>
> > If RCU subsystem can not keep-up, I guess other workloads will also suffer.
>
> If readers are not excessively long, RCU should be able to keep up.
> (In the absence of misconfigurations, for example, both NO_HZ_FULL and
> then binding all the rcuo kthreads to a single CPU on a 100-CPU system
> or some such.)
>
> > Sure, we can revert patches there and there trying to work around the issue,
> > but for objects allocated from process context, we should not have these problems.
>
> Agreed, let's get more info on what is happening to RCU.
>
> One approach is to shorten the RCU CPU stall warning timeout
> (rcupdate.rcu_cpu_stall_timeout=10 for 10 seconds).

I will also try this and let you know the results.


Thanks,
SeongJae Park

>
> Thanx, Paul

2020-05-05 17:58:38

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 10:30:36 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Tue, May 05, 2020 at 07:05:53PM +0200, SeongJae Park wrote:
> > On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:
> >
> > >
> > >
> > > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > > >
> > > >
> > > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > > >>
> > > >>
> > > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > >>>
> > > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > >>>>>
> > > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > >>>>>
> > > >>>>>>
> > > >>>>>>
> > > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > >>>>>>>
> > > >>>>>>
> > [...]
> > > >>
> > > >> I would ask Paul opinion on this issue, because we have many objects
> > > >> being freed after RCU grace periods.
> > > >>
> > > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > > >>
> > > >> Sure, we can revert patches there and there trying to work around the issue,
> > > >> but for objects allocated from process context, we should not have these problems.
> > > >>
> > > >
> > > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > > >
> > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > > --- a/kernel/rcu/tree.c
> > > > +++ b/kernel/rcu/tree.c
> > > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > > static bool rcu_kick_kthreads;
> > > > -static int rcu_divisor = 7;
> > > > +static int rcu_divisor = 6;
> > > > module_param(rcu_divisor, int, 0644);
> > > >
> > > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > > >
> > >
> > > To be clear, you can adjust the value without building a new kernel.
> > >
> > > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
> >
> > I tried value 6, 5, and 4, but none of those removed the problem.
>
> Thank you for checking this!
>
> Was your earlier discussion on long RCU readers speculation, or do you
> have measurements?

It was just a guess without any measurement or dedicated investigation.


Thanks,
SeongJae Park

>
> Thanx, Paul

2020-05-05 18:14:06

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 10:28:50 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Tue, May 05, 2020 at 09:37:42AM -0700, Eric Dumazet wrote:
> >
> >
> > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > >
> > >
> > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > >>
> > >>
> > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>
> > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > >>>>>
> > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>>>
> > >>>>>>
> > >>>>>>
> > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > >>>>>>>
> > >>>>>>
> > >>>>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> > >>>>>>>> some RCU problem ?
> > >>>>>>>
> > >>>>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> > >>>>>>> the grace period became so long in this case.
> > >>>>>>>
> > >>>>>>> As my test machine was a virtual machine instance, I guess RCU readers
> > >>>>>>> preemption[1] like problem might affected this.
> > >>>>>>>
> > >>>>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> > >>>>>>>
> > >>>>>>>>
> > >>>>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > >>>>>>>
> > >>>>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> > >>>>>>> reverting the Al's patches didn't reproduce the problem.
> > >>>>>>>
> > >>>>>>
> > >>>>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > >>>>>>
> > >>>>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > >>>>>> object that was allocated in sock_alloc_inode() before Al patches.
> > >>>>>>
> > >>>>>> These objects should be visible in kmalloc-64 kmem cache.
> > >>>>>
> > >>>>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> > >>>>> was able to observe clear exponential increase of the number of the objects
> > >>>>> using slabtop. Before the start of the problematic workload, the number of
> > >>>>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > >>>>> to 1,136,576.
> > >>>>>
> > >>>>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > >>>>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > >>>>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> > >>>>>
> > >>>>
> > >>>> Great, thanks.
> > >>>>
> > >>>> How recent is the kernel you are running for your experiment ?
> > >>>
> > >>> It's based on 5.4.35.
> > >>>
> > >>>>
> > >>>> Let's make sure the bug is not in RCU.
> > >>>
> > >>> One thing I can currently say is that the grace period passes at last. I
> > >>> modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > >>> the test without OOM but easily observable memory pressure. As soon as the
> > >>> benchmark finishes, the memory were freed.
> > >>>
> > >>> If you need more tests, please let me know.
> > >>>
> > >>
> > >> I would ask Paul opinion on this issue, because we have many objects
> > >> being freed after RCU grace periods.
> > >>
> > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > >>
> > >> Sure, we can revert patches there and there trying to work around the issue,
> > >> but for objects allocated from process context, we should not have these problems.
> > >>
> > >
> > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > >
> > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > --- a/kernel/rcu/tree.c
> > > +++ b/kernel/rcu/tree.c
> > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > static bool rcu_kick_kthreads;
> > > -static int rcu_divisor = 7;
> > > +static int rcu_divisor = 6;
> > > module_param(rcu_divisor, int, 0644);
> > >
> > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > >
> >
> > To be clear, you can adjust the value without building a new kernel.
> >
> > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
>
> Worth a try! If that helps significantly, I have some ideas for updating
> that heuristic, such as checking for sudden increases in the number of
> pending callbacks.
>
> But I would really also like to know whether there are long readers and
> whether v5.6 fares better.

I will share the results as soon as possible :)


Thanks,
SeongJae Park

>
> Thanx, Paul

2020-05-05 18:19:19

by Paul E. McKenney

[permalink] [raw]
Subject: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 07:56:05PM +0200, SeongJae Park wrote:
> On Tue, 5 May 2020 10:30:36 -0700 "Paul E. McKenney" <[email protected]> wrote:
>
> > On Tue, May 05, 2020 at 07:05:53PM +0200, SeongJae Park wrote:
> > > On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:
> > >
> > > >
> > > >
> > > > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > > > >
> > > > >
> > > > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > > > >>
> > > > >>
> > > > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >>>
> > > > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > > >>>>>
> > > > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >>>>>
> > > > >>>>>>
> > > > >>>>>>
> > > > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >>>>>>>
> > > > >>>>>>
> > > [...]
> > > > >>
> > > > >> I would ask Paul opinion on this issue, because we have many objects
> > > > >> being freed after RCU grace periods.
> > > > >>
> > > > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > > > >>
> > > > >> Sure, we can revert patches there and there trying to work around the issue,
> > > > >> but for objects allocated from process context, we should not have these problems.
> > > > >>
> > > > >
> > > > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > > > >
> > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > > > --- a/kernel/rcu/tree.c
> > > > > +++ b/kernel/rcu/tree.c
> > > > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > > > static bool rcu_kick_kthreads;
> > > > > -static int rcu_divisor = 7;
> > > > > +static int rcu_divisor = 6;
> > > > > module_param(rcu_divisor, int, 0644);
> > > > >
> > > > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > > > >
> > > >
> > > > To be clear, you can adjust the value without building a new kernel.
> > > >
> > > > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
> > >
> > > I tried value 6, 5, and 4, but none of those removed the problem.
> >
> > Thank you for checking this!
> >
> > Was your earlier discussion on long RCU readers speculation, or do you
> > have measurements?
>
> It was just a guess without any measurement or dedicated investigation.

OK, another thing to check is the duration of the low-memory episode.
Does this duration exceed the RCU CPU stall warning time? (21 seconds
in mainline, 60 in many distros, but check rcupdate.rcu_cpu_stall_timeout
to be sure.)

Also, any chance of a .config? Or at least the RCU portions? I am
guessing CONFIG_PREEMPT=n, for example.

Thanx, Paul

2020-05-05 18:31:13

by Paul E. McKenney

[permalink] [raw]
Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 07:49:43PM +0200, SeongJae Park wrote:
> On Tue, 5 May 2020 10:23:58 -0700 "Paul E. McKenney" <[email protected]> wrote:
>
> > On Tue, May 05, 2020 at 09:25:06AM -0700, Eric Dumazet wrote:
> > >
> > >
> > > On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > >
> > > >> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > >>>
> > > >>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > >>>
> > > >>>>
> > > >>>>
> > > >>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > >>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > >>>>>
> > > >>>>
> > > >>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> > > >>>>>> some RCU problem ?
> > > >>>>>
> > > >>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> > > >>>>> the grace period became so long in this case.
> > > >>>>>
> > > >>>>> As my test machine was a virtual machine instance, I guess RCU readers
> > > >>>>> preemption[1] like problem might affected this.
> > > >>>>>
> > > >>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> >
> > If this is the root cause of the problem, then it will be necessary to
> > provide a hint to the hypervisor. Or, in the near term, avoid loading
> > the hypervisor the point that vCPU preemption is so lengthy.
> >
> > RCU could also provide some sort of pre-stall-warning notification that
> > some of the CPUs aren't passing through quiescent states, which might
> > allow the guest OS's userspace to take corrective action.
> >
> > But first, what are you doing to either confirm or invalidate the
> > hypothesis that this might be due to vCPU preemption?
>
> Nothing, I was just guessing. Sorry if this made you confused.
>
> >
> > > >>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > > >>>>>
> > > >>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> > > >>>>> reverting the Al's patches didn't reproduce the problem.
> > > >>>>>
> > > >>>>
> > > >>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > > >>>>
> > > >>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > > >>>> object that was allocated in sock_alloc_inode() before Al patches.
> > > >>>>
> > > >>>> These objects should be visible in kmalloc-64 kmem cache.
> > > >>>
> > > >>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> > > >>> was able to observe clear exponential increase of the number of the objects
> > > >>> using slabtop. Before the start of the problematic workload, the number of
> > > >>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > > >>> to 1,136,576.
> > > >>>
> > > >>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > > >>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > > >>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> > > >>>
> > > >>
> > > >> Great, thanks.
> > > >>
> > > >> How recent is the kernel you are running for your experiment ?
> > > >
> > > > It's based on 5.4.35.
> >
> > Is it possible to retest on v5.6? I have been adding various mechanisms
> > to make RCU keep up better with heavy callback overload.
>
> I will try soon!
>
> >
> > Also, could you please provide the .config? If either NO_HZ_FULL or
> > RCU_NOCB_CPU, please also provide the kernel boot parameters.
>
> NO_HZ_FULL is not set, but RCU_NOCB_CPU is y.

OK, this is important information.

> I think I should check whether it's ok to share the full config and boot
> parameters. Please wait for this.

I probably don't need the whole thing. So, if it makes it easier to
gain approval...

The main things I need are CONFIG_PREEMPT and the various Kconfig options
having "RCU" in their names. For example, I have no need for any of the
options pertaining to device drivers. (As far as I know at the moment,
anyway!)

For the boot parameters, I am very interested in rcu_nocbs=. Along with
any other boot parameters whose names contain "rcu".

If rcu_nocbs does designate any CPUs, another thing to check
is where the rcuo kthreads are permitted to run. The reason that this
is important is that any CPU listed in the rcu_nocbs= boot parameter
has its RCU callbacks invoked by one of the rcuo kthreads. If you have
booted with (say) "rcu_nocbs=1,63" and then bound all of the resulting
rcuo kthreads to CPU 0, you just tied RCU's hands, making it unable to
keep up with any reasonable RCU callback load.

This sort of configuration is permitted, but it is intended for tightly
controlled real-time or HPC systems whose configurations and workloads
avoid tossing out large numbers of callbacks. Which might not be the
case for your workload.
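
For reference, a minimal sketch (plain userspace C, not from the kernel tree;
it only reads the standard /proc/<pid>/comm and /proc/<pid>/status files) for
checking where the rcuo kthreads are currently permitted to run:

/*
 * rcuo_cpus.c - minimal sketch: list the CPUs each rcuo kthread is
 * permitted to run on, by scanning /proc/<pid>/comm for names starting
 * with "rcuo" and printing the Cpus_allowed_list line from status.
 *
 *   cc -o rcuo_cpus rcuo_cpus.c && ./rcuo_cpus
 */
#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        DIR *proc = opendir("/proc");
        struct dirent *de;
        char path[64], buf[256];
        FILE *f;

        if (!proc)
                return 1;

        while ((de = readdir(proc)) != NULL) {
                if (!isdigit((unsigned char)de->d_name[0]))
                        continue;

                /* Kthread names ("rcuop/0", "rcuos/1", ...) appear in comm. */
                snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
                f = fopen(path, "r");
                if (!f)
                        continue;
                if (!fgets(buf, sizeof(buf), f) || strncmp(buf, "rcuo", 4)) {
                        fclose(f);
                        continue;
                }
                fclose(f);
                buf[strcspn(buf, "\n")] = '\0';
                printf("%s (pid %s):", buf, de->d_name);

                /* Cpus_allowed_list shows where the kthread may run. */
                snprintf(path, sizeof(path), "/proc/%s/status", de->d_name);
                f = fopen(path, "r");
                if (f) {
                        while (fgets(buf, sizeof(buf), f))
                                if (!strncmp(buf, "Cpus_allowed_list:", 18))
                                        printf("%s", buf + 18);
                        fclose(f);
                }
        }
        closedir(proc);
        return 0;
}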

> > > >> Let's make sure the bug is not in RCU.
> > > >
> > > > One thing I can currently say is that the grace period passes at last. I
> > > > modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > > > the test without OOM but easily observable memory pressure. As soon as the
> > > > benchmark finishes, the memory were freed.
> > > >
> > > > If you need more tests, please let me know.
> > >
> > > I would ask Paul opinion on this issue, because we have many objects
> > > being freed after RCU grace periods.
> >
> > As always, "It depends."
> >
> > o If the problem is a too-long RCU reader, RCU is prohibited from
> > ending the grace period. The reader duration must be shortened,
> > and until it is shortened, there is nothing RCU can do.
> >
> > o In some special cases of the above, RCU can and does help, for
> > example, by enlisting the aid of cond_resched(). So perhaps
> > there is a long in-kernel loop that needs a cond_resched().
> >
> > And perhaps RCU can help for some types of vCPU preemption.
> >
> > o As Al suggested offline and as has been discussed in the past,
> > it would not be hard to cause RCU to burn CPU to attain faster
> > grace periods during OOM events. This could be helpful, but only
> > given that RCU readers are completing in reasonable timeframes.
>
> Totally agreed.
>
> > > If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> >
> > If readers are not excessively long, RCU should be able to keep up.
> > (In the absence of misconfigurations, for example, both NO_HZ_FULL and
> > then binding all the rcuo kthreads to a single CPU on a 100-CPU system
> > or some such.)
> >
> > > Sure, we can revert patches there and there trying to work around the issue,
> > > but for objects allocated from process context, we should not have these problems.
> >
> > Agreed, let's get more info on what is happening to RCU.
> >
> > One approach is to shorten the RCU CPU stall warning timeout
> > (rcupdate.rcu_cpu_stall_timeout=10 for 10 seconds).
>
> I will also try this and let you know the results.

Sounds good, thank you!

Thanx, Paul

2020-05-05 18:36:52

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 11:17:07 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Tue, May 05, 2020 at 07:56:05PM +0200, SeongJae Park wrote:
> > On Tue, 5 May 2020 10:30:36 -0700 "Paul E. McKenney" <[email protected]> wrote:
> >
> > > On Tue, May 05, 2020 at 07:05:53PM +0200, SeongJae Park wrote:
> > > > On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:
> > > >
> > > > >
> > > > >
> > > > > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > > > > >
> > > > > >
> > > > > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > > > > >>
> > > > > >>
> > > > > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >>>
> > > > > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > > > >>>>>
> > > > > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >>>>>
> > > > > >>>>>>
> > > > > >>>>>>
> > > > > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >>>>>>>
> > > > > >>>>>>
> > > > [...]
> > > > > >>
> > > > > >> I would ask Paul opinion on this issue, because we have many objects
> > > > > >> being freed after RCU grace periods.
> > > > > >>
> > > > > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > > > > >>
> > > > > >> Sure, we can revert patches there and there trying to work around the issue,
> > > > > >> but for objects allocated from process context, we should not have these problems.
> > > > > >>
> > > > > >
> > > > > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > > > > >
> > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > > > > --- a/kernel/rcu/tree.c
> > > > > > +++ b/kernel/rcu/tree.c
> > > > > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > > > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > > > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > > > > static bool rcu_kick_kthreads;
> > > > > > -static int rcu_divisor = 7;
> > > > > > +static int rcu_divisor = 6;
> > > > > > module_param(rcu_divisor, int, 0644);
> > > > > >
> > > > > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > > > > >
> > > > >
> > > > > To be clear, you can adjust the value without building a new kernel.
> > > > >
> > > > > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
> > > >
> > > > I tried value 6, 5, and 4, but none of those removed the problem.
> > >
> > > Thank you for checking this!
> > >
> > > Was your earlier discussion on long RCU readers speculation, or do you
> > > have measurements?
> >
> > It was just a guess without any measurement or dedicated investigation.
>
> OK, another thing to check is the duration of the low-memory episode.
> Does this duration exceed the RCU CPU stall warning time? (21 seconds
> in mainline, 60 in many distros, but check rcupdate.rcu_cpu_stall_timeout
> to be sure.)

The benchmark takes about 36 seconds for 10,000 repeats of the test.

The value on the test machine is 60.

So the duration would not have exceeded the warning time and therefore I
haven't seen the warning message.

As told in the other mail, I will also adjust this value to a shorter one.

>
> Also, any chance of a .config? Or at least the RCU portions? I am
> guessing CONFIG_PREEMPT=n, for example.

I guess this would be ok.

# CONFIG_PREEMPT is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
CONFIG_RCU_EXPERT=y
CONFIG_SRCU=y
CONFIG_TREE_SRCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
CONFIG_RCU_FANOUT=64
CONFIG_RCU_FANOUT_LEAF=16
# CONFIG_RCU_FAST_NO_HZ is not set
CONFIG_RCU_NOCB_CPU=y
# end of RCU Subsystem


Thanks,
SeongJae Park

2020-05-05 18:45:34

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 5 May 2020 11:27:20 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Tue, May 05, 2020 at 07:49:43PM +0200, SeongJae Park wrote:
> > On Tue, 5 May 2020 10:23:58 -0700 "Paul E. McKenney" <[email protected]> wrote:
> >
> > > On Tue, May 05, 2020 at 09:25:06AM -0700, Eric Dumazet wrote:
> > > >
> > > >
> > > > On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > > On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >
> > > > >> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > > >>>
> > > > >>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >>>
> > > > >>>>
> > > > >>>>
> > > > >>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > >>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >>>>>
> > > > >>>>
> > > > >>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> > > > >>>>>> some RCU problem ?
> > > > >>>>>
> > > > >>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> > > > >>>>> the grace period became so long in this case.
> > > > >>>>>
> > > > >>>>> As my test machine was a virtual machine instance, I guess RCU readers
> > > > >>>>> preemption[1] like problem might affected this.
> > > > >>>>>
> > > > >>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> > >
> > > If this is the root cause of the problem, then it will be necessary to
> > > provide a hint to the hypervisor. Or, in the near term, avoid loading
> > > the hypervisor the point that vCPU preemption is so lengthy.
> > >
> > > RCU could also provide some sort of pre-stall-warning notification that
> > > some of the CPUs aren't passing through quiescent states, which might
> > > allow the guest OS's userspace to take corrective action.
> > >
> > > But first, what are you doing to either confirm or invalidate the
> > > hypothesis that this might be due to vCPU preemption?
> >
> > Nothing, I was just guessing. Sorry if this made you confused.
> >
> > >
> > > > >>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > > > >>>>>
> > > > >>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> > > > >>>>> reverting the Al's patches didn't reproduce the problem.
> > > > >>>>>
> > > > >>>>
> > > > >>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > > > >>>>
> > > > >>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > > > >>>> object that was allocated in sock_alloc_inode() before Al patches.
> > > > >>>>
> > > > >>>> These objects should be visible in kmalloc-64 kmem cache.
> > > > >>>
> > > > >>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> > > > >>> was able to observe clear exponential increase of the number of the objects
> > > > >>> using slabtop. Before the start of the problematic workload, the number of
> > > > >>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > > > >>> to 1,136,576.
> > > > >>>
> > > > >>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > > > >>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > > > >>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> > > > >>>
> > > > >>
> > > > >> Great, thanks.
> > > > >>
> > > > >> How recent is the kernel you are running for your experiment ?
> > > > >
> > > > > It's based on 5.4.35.
> > >
> > > Is it possible to retest on v5.6? I have been adding various mechanisms
> > > to make RCU keep up better with heavy callback overload.
> >
> > I will try soon!
> >
> > >
> > > Also, could you please provide the .config? If either NO_HZ_FULL or
> > > RCU_NOCB_CPU, please also provide the kernel boot parameters.
> >
> > NO_HZ_FULL is not set, but RCU_NOCB_CPU is y.
>
> OK, this is important information.
>
> > I think I should check whether it's ok to share the full config and boot
> > parameters. Please wait for this.
>
> I probably don't need the whole thing. So, if it makes it easier to
> gain approval...
>
> The main things I need are CONFIG_PREEMPT and the various Kconfig options
> having "RCU" in their names. For example, I have no need for any of the
> options pertaining to device drivers. (As far as I know at the moment,
> anyway!)
>
> For the boot parameters, I am very interested in rcu_nocbs=. Along with
> any other boot parameters whose names contain "rcu".

I guess this would be ok.

It uses no 'rcu_nocbs=' boot parameter.

The configs you asked for are as below:

# CONFIG_PREEMPT is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
CONFIG_RCU_EXPERT=y
CONFIG_SRCU=y
CONFIG_TREE_SRCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_RCU_NEED_SEGCBLIST=y
CONFIG_RCU_FANOUT=64
CONFIG_RCU_FANOUT_LEAF=16
# CONFIG_RCU_FAST_NO_HZ is not set
CONFIG_RCU_NOCB_CPU=y
# end of RCU Subsystem

>
> If rcu_nocbs does designate any CPUs, another thing to check
> is where the rcuo kthreads are permitted to run. The reason that this
> is important is that any CPU listed in the rcu_nocbs= boot parameter
> has its RCU callbacks invoked by one of the rcuo kthreads. If you have
> booted with (say) "rcu_nocbs=1,63" and then bound all of the resulting
> rcuo kthreads to CPU 0, you just tied RCU's hands, making it unable to
> keep up with any reasonable RCU callback load.
>
> This sort of configuration is permitted, but it is intended for tightly
> controlled real-time or HPC systems whose configurations and workloads
> avoid tossing out large numbers of callbacks. Which might not be the
> case for your workload.
>
> > > > >> Let's make sure the bug is not in RCU.
> > > > >
> > > > > One thing I can currently say is that the grace period passes at last. I
> > > > > modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > > > > the test without OOM but easily observable memory pressure. As soon as the
> > > > > benchmark finishes, the memory were freed.
> > > > >
> > > > > If you need more tests, please let me know.
> > > >
> > > > I would ask Paul opinion on this issue, because we have many objects
> > > > being freed after RCU grace periods.
> > >
> > > As always, "It depends."
> > >
> > > o If the problem is a too-long RCU reader, RCU is prohibited from
> > > ending the grace period. The reader duration must be shortened,
> > > and until it is shortened, there is nothing RCU can do.
> > >
> > > o In some special cases of the above, RCU can and does help, for
> > > example, by enlisting the aid of cond_resched(). So perhaps
> > > there is a long in-kernel loop that needs a cond_resched().
> > >
> > > And perhaps RCU can help for some types of vCPU preemption.
> > >
> > > o As Al suggested offline and as has been discussed in the past,
> > > it would not be hard to cause RCU to burn CPU to attain faster
> > > grace periods during OOM events. This could be helpful, but only
> > > given that RCU readers are completing in reasonable timeframes.
> >
> > Totally agreed.
> >
> > > > If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > >
> > > If readers are not excessively long, RCU should be able to keep up.
> > > (In the absence of misconfigurations, for example, both NO_HZ_FULL and
> > > then binding all the rcuo kthreads to a single CPU on a 100-CPU system
> > > or some such.)
> > >
> > > > Sure, we can revert patches there and there trying to work around the issue,
> > > > but for objects allocated from process context, we should not have these problems.
> > >
> > > Agreed, let's get more info on what is happening to RCU.
> > >
> > > One approach is to shorten the RCU CPU stall warning timeout
> > > (rcupdate.rcu_cpu_stall_timeout=10 for 10 seconds).
> >
> > I will also try this and let you know the results.
>
> Sounds good, thank you!

:)


Thanks,
SeongJae Park

2020-05-05 18:50:16

by Paul E. McKenney

[permalink] [raw]
Subject: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 08:40:07PM +0200, SeongJae Park wrote:
> On Tue, 5 May 2020 11:27:20 -0700 "Paul E. McKenney" <[email protected]> wrote:
>
> > On Tue, May 05, 2020 at 07:49:43PM +0200, SeongJae Park wrote:
> > > On Tue, 5 May 2020 10:23:58 -0700 "Paul E. McKenney" <[email protected]> wrote:
> > >
> > > > On Tue, May 05, 2020 at 09:25:06AM -0700, Eric Dumazet wrote:
> > > > >
> > > > >
> > > > > On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > > > On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >
> > > > > >> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > > > >>>
> > > > > >>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >>>
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > > >>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > >>>>>
> > > > > >>>>
> > > > > >>>>>> Why do we have 10,000,000 objects around ? Could this be because of
> > > > > >>>>>> some RCU problem ?
> > > > > >>>>>
> > > > > >>>>> Mainly because of a long RCU grace period, as you guess. I have no idea how
> > > > > >>>>> the grace period became so long in this case.
> > > > > >>>>>
> > > > > >>>>> As my test machine was a virtual machine instance, I guess RCU readers
> > > > > >>>>> preemption[1] like problem might affected this.
> > > > > >>>>>
> > > > > >>>>> [1] https://www.usenix.org/system/files/conference/atc17/atc17-prasad.pdf
> > > >
> > > > If this is the root cause of the problem, then it will be necessary to
> > > > provide a hint to the hypervisor. Or, in the near term, avoid loading
> > > > the hypervisor the point that vCPU preemption is so lengthy.
> > > >
> > > > RCU could also provide some sort of pre-stall-warning notification that
> > > > some of the CPUs aren't passing through quiescent states, which might
> > > > allow the guest OS's userspace to take corrective action.
> > > >
> > > > But first, what are you doing to either confirm or invalidate the
> > > > hypothesis that this might be due to vCPU preemption?
> > >
> > > Nothing, I was just guessing. Sorry if this made you confused.
> > >
> > > >
> > > > > >>>>>> Once Al patches reverted, do you have 10,000,000 sock_alloc around ?
> > > > > >>>>>
> > > > > >>>>> Yes, both the old kernel that prior to Al's patches and the recent kernel
> > > > > >>>>> reverting the Al's patches didn't reproduce the problem.
> > > > > >>>>>
> > > > > >>>>
> > > > > >>>> I repeat my question : Do you have 10,000,000 (smaller) objects kept in slab caches ?
> > > > > >>>>
> > > > > >>>> TCP sockets use the (very complex, error prone) SLAB_TYPESAFE_BY_RCU, but not the struct socket_wq
> > > > > >>>> object that was allocated in sock_alloc_inode() before Al patches.
> > > > > >>>>
> > > > > >>>> These objects should be visible in kmalloc-64 kmem cache.
> > > > > >>>
> > > > > >>> Not exactly the 10,000,000, as it is only the possible highest number, but I
> > > > > >>> was able to observe clear exponential increase of the number of the objects
> > > > > >>> using slabtop. Before the start of the problematic workload, the number of
> > > > > >>> objects of 'kmalloc-64' was 5760, but I was able to observe the number increase
> > > > > >>> to 1,136,576.
> > > > > >>>
> > > > > >>> OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
> > > > > >>> before: 5760 5088 88% 0.06K 90 64 360K kmalloc-64
> > > > > >>> after: 1136576 1136576 100% 0.06K 17759 64 71036K kmalloc-64
> > > > > >>>
> > > > > >>
> > > > > >> Great, thanks.
> > > > > >>
> > > > > >> How recent is the kernel you are running for your experiment ?
> > > > > >
> > > > > > It's based on 5.4.35.
> > > >
> > > > Is it possible to retest on v5.6? I have been adding various mechanisms
> > > > to make RCU keep up better with heavy callback overload.
> > >
> > > I will try soon!
> > >
> > > >
> > > > Also, could you please provide the .config? If either NO_HZ_FULL or
> > > > RCU_NOCB_CPU, please also provide the kernel boot parameters.
> > >
> > > NO_HZ_FULL is not set, but RCU_NOCB_CPU is y.
> >
> > OK, this is important information.
> >
> > > I think I should check whether it's ok to share the full config and boot
> > > parameters. Please wait for this.
> >
> > I probably don't need the whole thing. So, if it makes it easier to
> > gain approval...
> >
> > The main things I need are CONFIG_PREEMPT and the various Kconfig options
> > having "RCU" in their names. For example, I have no need for any of the
> > options pertaining to device drivers. (As far as I know at the moment,
> > anyway!)
> >
> > For the boot parameters, I am very interested in rcu_nocbs=. Along with
> > any other boot parameters whose names contain "rcu".
>
> I guess this would be ok.
>
> It uses no 'rcu_nocbs=' boot parameter.

OK, thank you!

For whatever it is worth, if the 'rcu_nocbs=' boot parameter is never
specified, there is no need to build with CONFIG_RCU_NOCB_CPU=y.

> The configs you asked for are as below:
>
> # CONFIG_PREEMPT is not set
>
> #
> # RCU Subsystem
> #
> CONFIG_TREE_RCU=y
> CONFIG_RCU_EXPERT=y
> CONFIG_SRCU=y
> CONFIG_TREE_SRCU=y
> CONFIG_RCU_STALL_COMMON=y
> CONFIG_RCU_NEED_SEGCBLIST=y
> CONFIG_RCU_FANOUT=64
> CONFIG_RCU_FANOUT_LEAF=16
> # CONFIG_RCU_FAST_NO_HZ is not set
> CONFIG_RCU_NOCB_CPU=y
> # end of RCU Subsystem

Looks pretty standard otherwise. ;-)

Thanx, Paul

> > If rcu_nocbs does designate any CPUs, another thing to check
> > is where the rcuo kthreads are permitted to run. The reason that this
> > is important is that any CPU listed in the rcu_nocbs= boot parameter
> > has its RCU callbacks invoked by one of the rcuo kthreads. If you have
> > booted with (say) "rcu_nocbs=1,63" and then bound all of the resulting
> > rcuo kthreads to CPU 0, you just tied RCU's hands, making it unable to
> > keep up with any reasonable RCU callback load.
> >
> > This sort of configuration is permitted, but it is intended for tightly
> > controlled real-time or HPC systems whose configurations and workloads
> > avoid tossing out large numbers of callbacks. Which might not be the
> > case for your workload.
> >
> > > > > >> Let's make sure the bug is not in RCU.
> > > > > >
> > > > > > One thing I can currently say is that the grace period passes at last. I
> > > > > > modified the benchmark to repeat not 10,000 times but only 5,000 times to run
> > > > > > the test without OOM but easily observable memory pressure. As soon as the
> > > > > > benchmark finishes, the memory were freed.
> > > > > >
> > > > > > If you need more tests, please let me know.
> > > > >
> > > > > I would ask Paul opinion on this issue, because we have many objects
> > > > > being freed after RCU grace periods.
> > > >
> > > > As always, "It depends."
> > > >
> > > > o If the problem is a too-long RCU reader, RCU is prohibited from
> > > > ending the grace period. The reader duration must be shortened,
> > > > and until it is shortened, there is nothing RCU can do.
> > > >
> > > > o In some special cases of the above, RCU can and does help, for
> > > > example, by enlisting the aid of cond_resched(). So perhaps
> > > > there is a long in-kernel loop that needs a cond_resched().
> > > >
> > > > And perhaps RCU can help for some types of vCPU preemption.
> > > >
> > > > o As Al suggested offline and as has been discussed in the past,
> > > > it would not be hard to cause RCU to burn CPU to attain faster
> > > > grace periods during OOM events. This could be helpful, but only
> > > > given that RCU readers are completing in reasonable timeframes.
> > >
> > > Totally agreed.
> > >
> > > > > If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > > >
> > > > If readers are not excessively long, RCU should be able to keep up.
> > > > (In the absence of misconfigurations, for example, both NO_HZ_FULL and
> > > > then binding all the rcuo kthreads to a single CPU on a 100-CPU system
> > > > or some such.)
> > > >
> > > > > Sure, we can revert patches there and there trying to work around the issue,
> > > > > but for objects allocated from process context, we should not have these problems.
> > > >
> > > > Agreed, let's get more info on what is happening to RCU.
> > > >
> > > > One approach is to shorten the RCU CPU stall warning timeout
> > > > (rcupdate.rcu_cpu_stall_timeout=10 for 10 seconds).
> > >
> > > I will also try this and let you know the results.
> >
> > Sounds good, thank you!
>
> :)
>
>
> Thanks,
> SeongJae Park

2020-05-05 18:52:11

by Paul E. McKenney

[permalink] [raw]
Subject: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, May 05, 2020 at 08:34:02PM +0200, SeongJae Park wrote:
> On Tue, 5 May 2020 11:17:07 -0700 "Paul E. McKenney" <[email protected]> wrote:
>
> > On Tue, May 05, 2020 at 07:56:05PM +0200, SeongJae Park wrote:
> > > On Tue, 5 May 2020 10:30:36 -0700 "Paul E. McKenney" <[email protected]> wrote:
> > >
> > > > On Tue, May 05, 2020 at 07:05:53PM +0200, SeongJae Park wrote:
> > > > > On Tue, 5 May 2020 09:37:42 -0700 Eric Dumazet <[email protected]> wrote:
> > > > >
> > > > > >
> > > > > >
> > > > > > On 5/5/20 9:31 AM, Eric Dumazet wrote:
> > > > > > >
> > > > > > >
> > > > > > > On 5/5/20 9:25 AM, Eric Dumazet wrote:
> > > > > > >>
> > > > > > >>
> > > > > > >> On 5/5/20 9:13 AM, SeongJae Park wrote:
> > > > > > >>> On Tue, 5 May 2020 09:00:44 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > > >>>
> > > > > > >>>> On Tue, May 5, 2020 at 8:47 AM SeongJae Park <[email protected]> wrote:
> > > > > > >>>>>
> > > > > > >>>>> On Tue, 5 May 2020 08:20:50 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > > >>>>>
> > > > > > >>>>>>
> > > > > > >>>>>>
> > > > > > >>>>>> On 5/5/20 8:07 AM, SeongJae Park wrote:
> > > > > > >>>>>>> On Tue, 5 May 2020 07:53:39 -0700 Eric Dumazet <[email protected]> wrote:
> > > > > > >>>>>>>
> > > > > > >>>>>>
> > > > > [...]
> > > > > > >>
> > > > > > >> I would ask Paul opinion on this issue, because we have many objects
> > > > > > >> being freed after RCU grace periods.
> > > > > > >>
> > > > > > >> If RCU subsystem can not keep-up, I guess other workloads will also suffer.
> > > > > > >>
> > > > > > >> Sure, we can revert patches there and there trying to work around the issue,
> > > > > > >> but for objects allocated from process context, we should not have these problems.
> > > > > > >>
> > > > > > >
> > > > > > > I wonder if simply adjusting rcu_divisor to 6 or 5 would help
> > > > > > >
> > > > > > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > > > > > > index d9a49cd6065a20936edbda1b334136ab597cde52..fde833bac0f9f81e8536211b4dad6e7575c1219a 100644
> > > > > > > --- a/kernel/rcu/tree.c
> > > > > > > +++ b/kernel/rcu/tree.c
> > > > > > > @@ -427,7 +427,7 @@ module_param(qovld, long, 0444);
> > > > > > > static ulong jiffies_till_first_fqs = ULONG_MAX;
> > > > > > > static ulong jiffies_till_next_fqs = ULONG_MAX;
> > > > > > > static bool rcu_kick_kthreads;
> > > > > > > -static int rcu_divisor = 7;
> > > > > > > +static int rcu_divisor = 6;
> > > > > > > module_param(rcu_divisor, int, 0644);
> > > > > > >
> > > > > > > /* Force an exit from rcu_do_batch() after 3 milliseconds. */
> > > > > > >
> > > > > >
> > > > > > To be clear, you can adjust the value without building a new kernel.
> > > > > >
> > > > > > echo 6 >/sys/module/rcutree/parameters/rcu_divisor
> > > > >
> > > > > I tried value 6, 5, and 4, but none of those removed the problem.
> > > >
> > > > Thank you for checking this!
> > > >
> > > > Was your earlier discussion on long RCU readers speculation, or do you
> > > > have measurements?
> > >
> > > It was just a guess without any measurement or dedicated investigation.
> >
> > OK, another thing to check is the duration of the low-memory episode.
> > Does this duration exceed the RCU CPU stall warning time? (21 seconds
> > in mainline, 60 in many distros, but check rcupdate.rcu_cpu_stall_timeout
> > to be sure.)
>
> The benchmark takes about 36 seconds for 10,000 repeats of the test.
>
> The value on the test machine is 60.
>
> So the duration would not have exceeded the warning time and therefore I
> haven't seen the warning message.
>
> As told in the other mail, I will also adjust this value to a shorter one.

Sounds good, thank you!

> > Also, any chance of a .config? Or at least the RCU portions? I am
> > guessing CONFIG_PREEMPT=n, for example.
>
> I guess this would be ok.
>
> # CONFIG_PREEMPT is not set
>
> #
> # RCU Subsystem
> #
> CONFIG_TREE_RCU=y
> CONFIG_RCU_EXPERT=y
> CONFIG_SRCU=y
> CONFIG_TREE_SRCU=y
> CONFIG_RCU_STALL_COMMON=y
> CONFIG_RCU_NEED_SEGCBLIST=y
> CONFIG_RCU_FANOUT=64
> CONFIG_RCU_FANOUT_LEAF=16
> # CONFIG_RCU_FAST_NO_HZ is not set
> CONFIG_RCU_NOCB_CPU=y
> # end of RCU Subsystem

And thank you again!

Thanx, Paul

2020-05-05 18:52:33

by David Miller

[permalink] [raw]
Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

From: SeongJae Park <[email protected]>
Date: Tue, 5 May 2020 10:10:33 +0200

> From: SeongJae Park <[email protected]>
>
> The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> socket_sq with socket itself") made those to have same life cycle.
>
> The changes made the code much more simple, but also made 'socket_alloc'
> live longer than before. For the reason, user programs intensively
> repeating allocations and deallocations of sockets could cause memory
> pressure on recent kernels.
>
> To avoid the problem, this commit reverts the changes.

Series applied and queued up for -stable, thanks.

2020-05-05 19:05:46

by David Miller

[permalink] [raw]
Subject: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

From: David Miller <[email protected]>
Date: Tue, 05 May 2020 11:48:25 -0700 (PDT)

> Series applied and queued up for -stable, thanks.

Nevermind, this doesn't even compile.

net/smc/af_smc.c: In function ‘smc_switch_to_fallback’:
net/smc/af_smc.c:473:19: error: ‘smc->clcsock->wq’ is a pointer; did you mean to use ‘->’?
473 | smc->clcsock->wq.fasync_list =
| ^
| ->
net/smc/af_smc.c:474:25: error: ‘smc->sk.sk_socket->wq’ is a pointer; did you mean to use ‘->’?
474 | smc->sk.sk_socket->wq.fasync_list;
| ^
| ->

So I had to revert these changes.

When you make a change of this magnitude and scope you must do an
allmodconfig build.

Thank you.

2020-05-06 05:20:48

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH net v2 1/2] Revert "coallocate socket_wq with socket itself"

Hi SeongJae,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net/master]

url: https://github.com/0day-ci/linux/commits/SeongJae-Park/Revert-the-socket_alloc-life-cycle-change/20200506-032051
base: https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 755f5738ff981769211a0bfac709d514ef5b9f86
config: x86_64-randconfig-g001-20200505 (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 24b4965ce65b14ead595dcc68add22ba37533207)
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot <[email protected]>

All errors (new ones prefixed by >>):

>> net/smc/af_smc.c:473:19: error: member reference type 'struct socket_wq *' is a pointer; did you mean to use '->'?
smc->clcsock->wq.fasync_list =
~~~~~~~~~~~~~~~~^
->
net/smc/af_smc.c:474:25: error: member reference type 'struct socket_wq *' is a pointer; did you mean to use '->'?
smc->sk.sk_socket->wq.fasync_list;
~~~~~~~~~~~~~~~~~~~~~^
->
2 errors generated.

vim +473 net/smc/af_smc.c

0cfdd8f92cac01 Ursula Braun 2017-01-09 466
07603b230895a7 Ursula Braun 2019-04-11 467 static void smc_switch_to_fallback(struct smc_sock *smc)
07603b230895a7 Ursula Braun 2019-04-11 468 {
07603b230895a7 Ursula Braun 2019-04-11 469 smc->use_fallback = true;
07603b230895a7 Ursula Braun 2019-04-11 470 if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
07603b230895a7 Ursula Braun 2019-04-11 471 smc->clcsock->file = smc->sk.sk_socket->file;
07603b230895a7 Ursula Braun 2019-04-11 472 smc->clcsock->file->private_data = smc->clcsock;
67f562e3e14775 Ursula Braun 2020-02-14 @473 smc->clcsock->wq.fasync_list =
67f562e3e14775 Ursula Braun 2020-02-14 474 smc->sk.sk_socket->wq.fasync_list;
07603b230895a7 Ursula Braun 2019-04-11 475 }
07603b230895a7 Ursula Braun 2019-04-11 476 }
07603b230895a7 Ursula Braun 2019-04-11 477

:::::: The code at line 473 was first introduced by commit
:::::: 67f562e3e147750a02b2a91d21a163fc44a1d13e net/smc: transfer fasync_list in case of fallback

:::::: TO: Ursula Braun <[email protected]>
:::::: CC: David S. Miller <[email protected]>

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/[email protected]



2020-05-06 06:29:02

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Tue, 05 May 2020 12:00:49 -0700 (PDT) David Miller <[email protected]> wrote:

> From: David Miller <[email protected]>
> Date: Tue, 05 May 2020 11:48:25 -0700 (PDT)
>
> > Series applied and queued up for -stable, thanks.
>
> Nevermind, this doesn't even compile.
>
> net/smc/af_smc.c: In function ‘smc_switch_to_fallback’:
> net/smc/af_smc.c:473:19: error: ‘smc->clcsock->wq’ is a pointer; did you mean to use ‘->’?
> 473 | smc->clcsock->wq.fasync_list =
> | ^
> | ->
> net/smc/af_smc.c:474:25: error: ‘smc->sk.sk_socket->wq’ is a pointer; did you mean to use ‘->’?
> 474 | smc->sk.sk_socket->wq.fasync_list;
> | ^
> | ->
>
> So I had to revert these changes.
>
> When you make a change of this magnitude and scope you must do an
> allmodconfig build.

Definitely my fault. I will fix this in the next spin.


Thanks,
SeongJae Park

>
> Thank you.

2020-05-06 13:02:32

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

TL;DR: It was not the kernel's fault, but the benchmark program's.

So, the problem is reproducible only with lebench[1]. I carefully read its
code again.

Before running the problematic "poll big" sub test, lebench executes the
"context switch" sub test. For that test, it sets its CPU affinity[2] and
process priority[3] to '0' and '-20', respectively. However, it doesn't
restore the original values even after the "context switch" test finishes.
For that reason, the "select big" sub test also runs bound to CPU 0 with
the lowest nice value. It can therefore disturb the RCU callback thread
for CPU 0, which processes the deferred deallocations of the sockets, and
as a result it triggers the OOM.

We confirmed that the problem disappears by offloading the RCU callbacks
from CPU 0 using the rcu_nocbs=0 boot parameter, or by simply restoring
the affinity and/or priority.
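
For illustration, here is a minimal sketch (plain userspace C, not the
actual LEBench code; the sub test function is only a placeholder) of saving
and restoring the affinity and nice value around such a pinned sub test:

#define _GNU_SOURCE
#include <sched.h>
#include <sys/time.h>
#include <sys/resource.h>

/* Placeholder for the pinned, high-priority sub test. */
static void context_switch_test(void)
{
}

int main(void)
{
        cpu_set_t saved_mask, pinned_mask;
        int saved_nice;

        /* Save the current affinity and nice value. */
        sched_getaffinity(0, sizeof(saved_mask), &saved_mask);
        saved_nice = getpriority(PRIO_PROCESS, 0);

        /* Pin to CPU 0 and raise the priority, as the benchmark does. */
        CPU_ZERO(&pinned_mask);
        CPU_SET(0, &pinned_mask);
        sched_setaffinity(0, sizeof(pinned_mask), &pinned_mask);
        setpriority(PRIO_PROCESS, 0, -20);

        context_switch_test();

        /*
         * Restore the saved settings so that later sub tests do not stay
         * bound to CPU 0 at nice -20 and starve that CPU's RCU callback
         * processing.
         */
        sched_setaffinity(0, sizeof(saved_mask), &saved_mask);
        setpriority(PRIO_PROCESS, 0, saved_nice);

        return 0;
}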

Someone _might_ still argue that this is a kernel problem because the
problem didn't occur on the old kernels prior to Al's patches. However,
setting the affinity and priority was possible only because the program
had been granted the permission to do so. Therefore, it would be more
reasonable to blame the system administrators than the kernel.

So, please ignore this patchset; apologies for the confusion. If you
still have some doubts or need more tests, please let me know.

[1] https://github.com/LinuxPerfStudy/LEBench
[2] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L820
[3] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L822


Thanks,
SeongJae Park

2020-05-06 14:44:03

by Paul E. McKenney

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Wed, May 06, 2020 at 02:59:26PM +0200, SeongJae Park wrote:
> TL;DR: It was not the kernel's fault, but the benchmark program's.
>
> So, the problem is reproducible only with lebench[1]. I carefully read its
> code again.
>
> Before running the problematic "poll big" sub test, lebench executes the
> "context switch" sub test. For that test, it sets its CPU affinity[2] and
> process priority[3] to '0' and '-20', respectively. However, it doesn't
> restore the original values even after the "context switch" test finishes.
> For that reason, the "select big" sub test also runs bound to CPU 0 with
> the lowest nice value. It can therefore disturb the RCU callback thread
> for CPU 0, which processes the deferred deallocations of the sockets, and
> as a result it triggers the OOM.
>
> We confirmed that the problem disappears by offloading the RCU callbacks
> from CPU 0 using the rcu_nocbs=0 boot parameter, or by simply restoring
> the affinity and/or priority.
>
> Someone _might_ still argue that this is a kernel problem because the
> problem didn't occur on the old kernels prior to Al's patches. However,
> setting the affinity and priority was possible only because the program
> had been granted the permission to do so. Therefore, it would be more
> reasonable to blame the system administrators than the kernel.
>
> So, please ignore this patchset; apologies for the confusion. If you
> still have some doubts or need more tests, please let me know.
>
> [1] https://github.com/LinuxPerfStudy/LEBench
> [2] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L820
> [3] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L822

Thank you for chasing this down!

I have had this sort of thing on my list as a potential issue, but given
that it is now really showing up, it sounds like it is time to bump
up its priority a bit. Of course there are limits, so if userspace is
running at any of the real-time priorities, making sufficient CPU time
available to RCU's kthreads becomes userspace's responsibility. But if
everything is running at SCHED_OTHER (which is the case here, correct?),
then it is reasonable for RCU to do some work to avoid this situation.
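
For reference, and assuming nothing has switched the benchmark onto an RT
class, a quick way to double-check which case applies is a
sched_getscheduler() probe like the sketch below (illustrative only):

#include <sched.h>
#include <stdio.h>

/*
 * Illustrative check only: confirm whether the current process is still
 * under the default SCHED_OTHER policy rather than an RT class.
 */
int main(void)
{
        int policy = sched_getscheduler(0);     /* 0 means the calling process */

        printf("policy: %s\n",
               policy == SCHED_OTHER ? "SCHED_OTHER" :
               policy == SCHED_FIFO  ? "SCHED_FIFO" :
               policy == SCHED_RR    ? "SCHED_RR" : "other");
        return 0;
}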

But still, yes, the immediate job is fixing the benchmark. ;-)

Thanx, Paul

PS. Why not just attack all potential issues on my list? Because I
usually learn quite a bit from seeing the problem actually happen.
And sometimes other changes in RCU eliminate the potential issue
before it has a chance to happen.

2020-05-06 15:25:09

by SeongJae Park

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Wed, 6 May 2020 07:41:51 -0700 "Paul E. McKenney" <[email protected]> wrote:

> On Wed, May 06, 2020 at 02:59:26PM +0200, SeongJae Park wrote:
> > TL; DR: It was not kernel's fault, but the benchmark program.
> >
> > So, the problem is reproducible using the lebench[1] only. I carefully read
> > it's code again.
> >
> > Before running the problem occurred "poll big" sub test, lebench executes
> > "context switch" sub test. For the test, it sets the cpu affinity[2] and
> > process priority[3] of itself to '0' and '-20', respectively. However, it
> > does not restore those values to the originals even after the "context
> > switch" test is finished. As a result, the "select big" sub test also runs
> > bound to CPU 0 with the lowest nice value. It can therefore starve the RCU
> > callback thread for CPU 0, which processes the deferred deallocations of
> > the sockets, and that eventually triggers the OOM.
> >
> > We confirmed that the problem disappears by offloading the RCU callbacks
> > from CPU 0 using the rcu_nocbs=0 boot parameter, or by simply restoring the
> > affinity and/or priority after the "context switch" sub test.
> >
> > Someone _might_ still argue that this is a kernel problem because the
> > problem did not occur on old kernels prior to Al's patches. However,
> > setting the affinity and the priority was possible only because the program
> > had been granted the permission to do so. Therefore, it would be more
> > reasonable to blame the system administrators rather than the kernel.
> >
> > So, please ignore this patchset; my apologies for the confusion. If you
> > still have some doubts or need more tests, please let me know.
> >
> > [1] https://github.com/LinuxPerfStudy/LEBench
> > [2] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L820
> > [3] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L822
>
> Thank you for chasing this down!
>
> I have had this sort of thing on my list as a potential issue, but given
> that it is now really showing up, it sounds like it is time to bump
> up its priority a bit. Of course there are limits, so if userspace is
> running at any of the real-time priorities, making sufficient CPU time
> available to RCU's kthreads becomes userspace's responsibility. But if
> everything is running at SCHED_OTHER (which is the case here, correct?),

Correct.

> then it is reasonable for RCU to do some work to avoid this situation.

That would also be great!

>
> But still, yes, the immediate job is fixing the benchmark. ;-)

Totally agreed.

>
> Thanx, Paul
>
> PS. Why not just attack all potential issues on my list? Because I
> usually learn quite a bit from seeing the problem actually happen.
> And sometimes other changes in RCU eliminate the potential issue
> before it has a chance to happen.

Sounds interesting, I will try some of those in my spare time ;)


Thanks,
SeongJae Park

2020-05-06 23:26:17

by Eric Dumazet

[permalink] [raw]
Subject: Re: Re: Re: Re: Re: [PATCH net v2 0/2] Revert the 'socket_alloc' life cycle change

On Wed, May 6, 2020 at 5:59 AM SeongJae Park <[email protected]> wrote:
>
> TL; DR: It was not kernel's fault, but the benchmark program.
>
> So, the problem is reproducible using the lebench[1] only. I carefully read
> it's code again.
>
> Before running the problem occurred "poll big" sub test, lebench executes
> "context switch" sub test. For the test, it sets the cpu affinity[2] and
> process priority[3] of itself to '0' and '-20', respectively. However, it
> does not restore those values to the originals even after the "context
> switch" test is finished. As a result, the "select big" sub test also runs
> bound to CPU 0 with the lowest nice value. It can therefore starve the RCU
> callback thread for CPU 0, which processes the deferred deallocations of
> the sockets, and that eventually triggers the OOM.
>
> We confirmed that the problem disappears by offloading the RCU callbacks
> from CPU 0 using the rcu_nocbs=0 boot parameter, or by simply restoring the
> affinity and/or priority after the "context switch" sub test.
>
> Someone _might_ still argue that this is a kernel problem because the
> problem did not occur on old kernels prior to Al's patches. However,
> setting the affinity and the priority was possible only because the program
> had been granted the permission to do so. Therefore, it would be more
> reasonable to blame the system administrators rather than the kernel.
>
> So, please ignore this patchset; my apologies for the confusion. If you
> still have some doubts or need more tests, please let me know.
>
> [1] https://github.com/LinuxPerfStudy/LEBench
> [2] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L820
> [3] https://github.com/LinuxPerfStudy/LEBench/blob/master/TEST_DIR/OS_Eval.c#L822
>
>
> Thanks,
> SeongJae Park

No harm done, thanks for running more tests and root-causing the issue!