From: SeongJae Park <[email protected]>
Commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, the same as
'sock.wq'. The following commit 333f7909a857 ("coallocate
socket_wq with socket itself") then gave the two the same life cycle.
These changes made the code much simpler, but also made 'socket_alloc'
live longer than before. For this reason, user programs that intensively
repeat allocations and deallocations of sockets could cause memory
pressure on recent kernels.
To avoid the problem, this commit reverts the changes.
SeongJae Park (2):
Revert "coallocate socket_wq with socket itself"
Revert "sockfs: switch to ->free_inode()"
drivers/net/tap.c | 5 +++--
drivers/net/tun.c | 8 +++++---
include/linux/if_tap.h | 1 +
include/linux/net.h | 4 ++--
include/net/sock.h | 4 ++--
net/core/sock.c | 2 +-
net/socket.c | 23 ++++++++++++++++-------
7 files changed, 30 insertions(+), 17 deletions(-)
--
2.17.1
From: SeongJae Park <[email protected]>
This reverts commit 333f7909a8573145811c4ab7d8c9092301707721.
Commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, the same as
'sock.wq'. The following commit 333f7909a857 ("coallocate
socket_wq with socket itself") then gave the two the same life cycle.
These changes made the code much simpler, but also made 'socket_alloc'
live longer than before. For this reason, user programs that intensively
repeat allocations and deallocations of sockets could cause memory
pressure on recent kernels.
To avoid the problem, this commit separates the life cycles of
'socket_alloc' and 'sock.wq' again. The following commit will make the
deallocation of 'socket_alloc' synchronous again.
---
drivers/net/tap.c | 5 +++--
drivers/net/tun.c | 8 +++++---
include/linux/if_tap.h | 1 +
include/linux/net.h | 4 ++--
include/net/sock.h | 4 ++--
net/core/sock.c | 2 +-
net/socket.c | 19 ++++++++++++++-----
7 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 1f4bdd94407a..7912039a4846 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -518,7 +518,8 @@ static int tap_open(struct inode *inode, struct file *file)
goto err;
}
- init_waitqueue_head(&q->sock.wq.wait);
+ RCU_INIT_POINTER(q->sock.wq, &q->wq);
+ init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
@@ -576,7 +577,7 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
goto out;
mask = 0;
- poll_wait(file, &q->sock.wq.wait, wait);
+ poll_wait(file, &q->wq.wait, wait);
if (!ptr_ring_empty(&q->ring))
mask |= EPOLLIN | EPOLLRDNORM;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 650c937ed56b..16a5f3b80edf 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -160,6 +160,7 @@ struct tun_pcpu_stats {
struct tun_file {
struct sock sk;
struct socket socket;
+ struct socket_wq wq;
struct tun_struct __rcu *tun;
struct fasync_struct *fasync;
/* only used for fasnyc */
@@ -2173,7 +2174,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
goto out;
}
- add_wait_queue(&tfile->socket.wq.wait, &wait);
+ add_wait_queue(&tfile->wq.wait, &wait);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -2193,7 +2194,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
}
__set_current_state(TASK_RUNNING);
- remove_wait_queue(&tfile->socket.wq.wait, &wait);
+ remove_wait_queue(&tfile->wq.wait, &wait);
out:
*err = error;
@@ -3434,7 +3435,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->flags = 0;
tfile->ifindex = 0;
- init_waitqueue_head(&tfile->socket.wq.wait);
+ init_waitqueue_head(&tfile->wq.wait);
+ RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);
tfile->socket.file = file;
tfile->socket.ops = &tun_socket_ops;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 915a187cfabd..8e66866c11be 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -62,6 +62,7 @@ struct tap_dev {
struct tap_queue {
struct sock sk;
struct socket sock;
+ struct socket_wq wq;
int vnet_hdr_sz;
struct tap_dev __rcu *tap;
struct file *file;
diff --git a/include/linux/net.h b/include/linux/net.h
index 6451425e828f..28c929bebb4a 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -116,11 +116,11 @@ struct socket {
unsigned long flags;
+ struct socket_wq *wq;
+
struct file *file;
struct sock *sk;
const struct proto_ops *ops;
-
- struct socket_wq wq;
};
struct vm_area_struct;
diff --git a/include/net/sock.h b/include/net/sock.h
index 328564525526..20799a333570 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1841,7 +1841,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
{
WARN_ON(parent->sk);
write_lock_bh(&sk->sk_callback_lock);
- rcu_assign_pointer(sk->sk_wq, &parent->wq);
+ rcu_assign_pointer(sk->sk_wq, parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
sk->sk_uid = SOCK_INODE(parent)->i_uid;
@@ -2119,7 +2119,7 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
- poll_wait(filp, &sock->wq.wait, p);
+ poll_wait(filp, &sock->wq->wait, p);
/* We need to be sure we are in sync with the
* socket flags modification.
*
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f71684305c3..7fa3241b5507 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2869,7 +2869,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
diff --git a/net/socket.c b/net/socket.c
index 2eecf1517f76..e274ae4b45e4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -249,13 +249,20 @@ static struct kmem_cache *sock_inode_cachep __ro_after_init;
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
+ struct socket_wq *wq;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
- init_waitqueue_head(&ei->socket.wq.wait);
- ei->socket.wq.fasync_list = NULL;
- ei->socket.wq.flags = 0;
+ wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq) {
+ kmem_cache_free(sock_inode_cachep, ei);
+ return NULL;
+ }
+ init_waitqueue_head(&wq->wait);
+ wq->fasync_list = NULL;
+ wq->flags = 0;
+ ei->socket.wq = wq;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -271,6 +278,7 @@ static void sock_free_inode(struct inode *inode)
struct socket_alloc *ei;
ei = container_of(inode, struct socket_alloc, vfs_inode);
+ kfree(ei->socket.wq);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -610,7 +618,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}
- if (sock->wq.fasync_list)
+ if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
if (!sock->file) {
@@ -1299,12 +1307,13 @@ static int sock_fasync(int fd, struct file *filp, int on)
{
struct socket *sock = filp->private_data;
struct sock *sk = sock->sk;
- struct socket_wq *wq = &sock->wq;
+ struct socket_wq *wq;
if (sk == NULL)
return -EINVAL;
lock_sock(sk);
+ wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
--
2.17.1
From: SeongJae Park <[email protected]>
This reverts commit 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18.
Commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
deallocation of 'socket_alloc' asynchronous using RCU, the same as
'sock.wq'.
The change made 'socket_alloc' live longer than before. As a result,
user programs intensively repeating allocations and deallocations of
sockets could cause memory pressure on recent kernels.
To avoid the problem, this commit reverts the change.
---
net/socket.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/socket.c b/net/socket.c
index e274ae4b45e4..27174021f47f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -273,12 +273,12 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
return &ei->vfs_inode;
}
-static void sock_free_inode(struct inode *inode)
+static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- kfree(ei->socket.wq);
+ kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -303,7 +303,7 @@ static void init_inodecache(void)
static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
- .free_inode = sock_free_inode,
+ .destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};
--
2.17.1
On Tue, May 05, 2020 at 09:28:40AM +0200, SeongJae Park wrote:
> From: SeongJae Park <[email protected]>
>
> This reverts commit 333f7909a8573145811c4ab7d8c9092301707721.
>
> The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> socket_sq with socket itself") made those to have same life cycle.
>
> The changes made the code much more simple, but also made 'socket_alloc'
> live longer than before. For the reason, user programs intensively
> repeating allocations and deallocations of sockets could cause memory
> pressure on recent kernels.
>
> To avoid the problem, this commit separates the life cycle of
> 'socket_alloc' and 'sock.wq' again. The following commit will make the
> deallocation of 'socket_alloc' to be done synchronously again.
> ---
No signed-off-by?
No "Fixes:"?
:(
On Tue, May 05, 2020 at 09:28:41AM +0200, SeongJae Park wrote:
> From: SeongJae Park <[email protected]>
>
> This reverts commit 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18.
>
> The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> same to 'sock.wq'.
>
> The change made 'socket_alloc' live longer than before. As a result,
> user programs intensively repeating allocations and deallocations of
> sockets could cause memory pressure on recent kernels.
>
> To avoid the problem, this commit reverts the change.
> ---
> net/socket.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
Same problems here as in patch 1/2 :(
On Tue, 5 May 2020 09:45:11 +0200 Greg KH <[email protected]> wrote:
> On Tue, May 05, 2020 at 09:28:40AM +0200, SeongJae Park wrote:
> > From: SeongJae Park <[email protected]>
> >
> > This reverts commit 333f7909a8573145811c4ab7d8c9092301707721.
> >
> > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> > deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> > same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> > socket_sq with socket itself") made those to have same life cycle.
> >
> > The changes made the code much more simple, but also made 'socket_alloc'
> > live longer than before. For the reason, user programs intensively
> > repeating allocations and deallocations of sockets could cause memory
> > pressure on recent kernels.
> >
> > To avoid the problem, this commit separates the life cycle of
> > 'socket_alloc' and 'sock.wq' again. The following commit will make the
> > deallocation of 'socket_alloc' to be done synchronously again.
> > ---
>
> No signed-off-by?
> No "Fixes:"?
Oops, my mistake. I will post the next version right away.
Thanks,
SeongJae Park
>
> :(
On Tue, 5 May 2020 09:45:35 +0200 Greg KH <[email protected]> wrote:
> On Tue, May 05, 2020 at 09:28:41AM +0200, SeongJae Park wrote:
> > From: SeongJae Park <[email protected]>
> >
> > This reverts commit 6d7855c54e1e269275d7c504f8f62a0b7a5b3f18.
> >
> > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> > deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> > same to 'sock.wq'.
> >
> > The change made 'socket_alloc' live longer than before. As a result,
> > user programs intensively repeating allocations and deallocations of
> > sockets could cause memory pressure on recent kernels.
> >
> > To avoid the problem, this commit reverts the change.
> > ---
> > net/socket.c | 6 +++---
> > 1 file changed, 3 insertions(+), 3 deletions(-)
>
> Same problems here as in patch 1/2 :(
Yes, indeed. I will send the next version right away.
Thanks,
SeongJae Park
>
On Tue, May 05, 2020 at 09:28:39AM +0200, SeongJae Park wrote:
> From: SeongJae Park <[email protected]>
>
> The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> socket_sq with socket itself") made those to have same life cycle.
>
> The changes made the code much more simple, but also made 'socket_alloc'
> live longer than before. For the reason, user programs intensively
> repeating allocations and deallocations of sockets could cause memory
> pressure on recent kernels.
>
> To avoid the problem, this commit reverts the changes.
Is it "could cause" or is it "have been actually observed to"?
On Tue, 5 May 2020 13:44:42 +0100 Al Viro <[email protected]> wrote:
> CAUTION: This email originated from outside of the organization. Do not cli=
> ck links or open attachments unless you can confirm the sender and know the=
> content is safe.
>
>
>
> On Tue, May 05, 2020 at 09:28:39AM +0200, SeongJae Park wrote:
> > From: SeongJae Park <[email protected]>
> >
> > The commit 6d7855c54e1e ("sockfs: switch to ->free_inode()") made the
> > deallocation of 'socket_alloc' to be done asynchronously using RCU, as
> > same to 'sock.wq'. And the following commit 333f7909a857 ("coallocate
> > socket_sq with socket itself") made those to have same life cycle.
> >
> > The changes made the code much more simple, but also made 'socket_alloc'
> > live longer than before. For the reason, user programs intensively
> > repeating allocations and deallocations of sockets could cause memory
> > pressure on recent kernels.
> >
> > To avoid the problem, this commit reverts the changes.
>
> Is it "could cause" or is it "have been actually observed to"?
Actually observed. Sorry for leaving that out of the explanation. Could you
please refer to this link?
https://lore.kernel.org/netdev/[email protected]/
Thanks,
SeongJae Park