Hi,
This is v5 of the series where the tw_timer is un-pinned to get rid of
interferences in isolated CPUs setups.
The first patch is a new one stemming from Jakub's bug report. It's there
mainly to make the reviewing a bit easier, but as it changes behaviour it should
be squashed with the second one.
Revisions
=========
v4 -> v5
++++++++
o Rebased against latest Linus' tree
o Converted tw_timer into a delayed work following Jakub's bug report on v4
http://lore.kernel.org/r/[email protected]
v3 -> v4
++++++++
o Rebased against latest Linus' tree
o Added ehash lock usage to serialize scheduling vs descheduling of the tw_timer
(Paolo)
v2 -> v3
++++++++
o Dropped bh_disable patch
o Rebased against latest Linus' tree
RFCv1 -> v2
++++++++
o Added comment in inet_twsk_deschedule_put() to highlight the race
o Added bh_disable patch
Valentin Schneider (2):
SQUASH: tcp/dccp: Convert timewait timer into a delayed_work
tcp/dccp: Un-pin tw_timer
include/net/inet_timewait_sock.h | 8 +-
net/dccp/minisocks.c | 9 +--
net/ipv4/inet_diag.c | 2 +-
net/ipv4/inet_timewait_sock.c | 73 +++++++++++++------
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv4/tcp_minisocks.c | 9 +--
net/ipv6/tcp_ipv6.c | 2 +-
.../selftests/bpf/progs/bpf_iter_tcp4.c | 2 +-
.../selftests/bpf/progs/bpf_iter_tcp6.c | 2 +-
9 files changed, 69 insertions(+), 40 deletions(-)
--
2.43.0
!NOTE! this changes behaviour, as the tw_timer is TIMER_PINNED and delayed
works currently cannot behave as such. TIMER_PINNED timers are enqueued
onto the local base, whereas delayed_works have their timer enqueued on the
global base AND also check for HK_TYPE_TIMER isolation.
The split between this commit and the next is mainly there for ease of
reviewing. This commit should be squashed with the next one.
Signed-off-by: Valentin Schneider <[email protected]>
---
include/net/inet_timewait_sock.h | 2 +-
net/ipv4/inet_diag.c | 2 +-
net/ipv4/inet_timewait_sock.c | 16 +++++++++-------
net/ipv4/tcp_ipv4.c | 2 +-
net/ipv6/tcp_ipv6.c | 2 +-
.../testing/selftests/bpf/progs/bpf_iter_tcp4.c | 2 +-
.../testing/selftests/bpf/progs/bpf_iter_tcp6.c | 2 +-
7 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f28da08a37b4e..c4d64f1f8d415 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -72,7 +72,7 @@ struct inet_timewait_sock {
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
- struct timer_list tw_timer;
+ struct delayed_work tw_expiry_work;
struct inet_bind_bucket *tw_tb;
struct inet_bind2_bucket *tw_tb2;
};
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7adace541fe29..ab11b688f1eeb 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -444,7 +444,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
r->idiag_state = tw->tw_substate;
r->idiag_timer = 3;
- tmo = tw->tw_timer.expires - jiffies;
+ tmo = tw->tw_expiry_work.timer.expires - jiffies;
r->idiag_expires = jiffies_delta_to_msecs(tmo);
r->idiag_rqueue = 0;
r->idiag_wqueue = 0;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index e8de45d34d56a..7a2dcbaa1a61e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -150,11 +150,13 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
}
EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
-static void tw_timer_handler(struct timer_list *t)
+static void tw_expiry_workfn(struct work_struct *work)
{
- struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
-
+ struct inet_timewait_sock *tw = container_of(
+ work, struct inet_timewait_sock, tw_expiry_work.work);
+ local_bh_disable();
inet_twsk_kill(tw);
+ local_bh_enable();
}
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
@@ -192,7 +194,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
+ INIT_DELAYED_WORK(&tw->tw_expiry_work, tw_expiry_workfn);
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -217,7 +219,7 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
*/
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
- if (del_timer_sync(&tw->tw_timer))
+ if (cancel_delayed_work_sync(&tw->tw_expiry_work))
inet_twsk_kill(tw);
inet_twsk_put(tw);
}
@@ -255,10 +257,10 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
__NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
LINUX_MIB_TIMEWAITED);
- BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
+ BUG_ON(!queue_delayed_work(system_unbound_wq, &tw->tw_expiry_work, timeo));
refcount_inc(&tw->tw_dr->tw_refcount);
} else {
- mod_timer_pending(&tw->tw_timer, jiffies + timeo);
+ mod_delayed_work(system_unbound_wq, &tw->tw_expiry_work, timeo);
}
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee58387518..4b106f017a81f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2920,7 +2920,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
struct seq_file *f, int i)
{
- long delta = tw->tw_timer.expires - jiffies;
+ long delta = tw->tw_expiry_work.timer.expires - jiffies;
__be32 dest, src;
__u16 destp, srcp;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3f4cba49e9ee6..58cd12fdc91a5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2242,7 +2242,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
static void get_timewait6_sock(struct seq_file *seq,
struct inet_timewait_sock *tw, int i)
{
- long delta = tw->tw_timer.expires - jiffies;
+ long delta = tw->tw_expiry_work.timer.expires - jiffies;
const struct in6_addr *dest, *src;
__u16 destp, srcp;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
index 92267abb462fc..a429f4eb9939c 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -152,7 +152,7 @@ static int dump_tw_sock(struct seq_file *seq, struct tcp_timewait_sock *ttw,
__be32 dest, src;
long delta;
- delta = tw->tw_timer.expires - bpf_jiffies64();
+ delta = tw->tw_expiry_work.timer.expires - bpf_jiffies64();
dest = tw->tw_daddr;
src = tw->tw_rcv_saddr;
destp = bpf_ntohs(tw->tw_dport);
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
index 943f7bba180e7..795bb34c95f72 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -157,7 +157,7 @@ static int dump_tw_sock(struct seq_file *seq, struct tcp_timewait_sock *ttw,
__u16 destp, srcp;
long delta;
- delta = tw->tw_timer.expires - bpf_jiffies64();
+ delta = tw->tw_expiry_work.timer.expires - bpf_jiffies64();
dest = &tw->tw_v6_daddr;
src = &tw->tw_v6_rcv_saddr;
destp = bpf_ntohs(tw->tw_dport);
--
2.43.0
Context
=======
The TCP timewait timer is proving to be problematic for setups where scheduler
CPU isolation is achieved at runtime via cpusets (as opposed to statically via
isolcpus=domains).
What happens there is a CPU goes through tcp_time_wait(), arming the time_wait
timer, then gets isolated. TCP_TIMEWAIT_LEN later, the timer fires, causing
interference for the now-isolated CPU. This is conceptually similar to the issue
described in
e02b93124855 ("workqueue: Unbind kworkers before sending them to exit()")
Changes
=======
Move inet_twsk_schedule() to within inet_twsk_hashdance(), with the ehash
lock held. Expand the lock's critical section from inet_twsk_kill() to
inet_twsk_deschedule_put(), serializing the scheduling vs descheduling of
the timer. IOW, this prevents the following race:
tcp_time_wait()
inet_twsk_hashdance()
inet_twsk_deschedule_put()
del_timer_sync()
inet_twsk_schedule()
Lastly, to prevent the ehash lock from being taken both in timer context
and in timer deletion context (which leads to deadlocks), move the
inet_twsk_kill() from timer expiry context to workqueue context by changing
the tw_timer to a delayed_work.
Thanks to Paolo for suggesting to leverage the ehash lock.
Note on partial reverts
=======================
This partially reverts
ec94c2696f0b ("tcp/dccp: avoid one atomic operation for timewait hashdance")
This also restores a comment from
ec94c2696f0b ("tcp/dccp: avoid one atomic operation for timewait hashdance")
as inet_twsk_hashdance() had a "Step 1" and "Step 3" comment, but the "Step
2" had gone missing.
Link: https://lore.kernel.org/all/[email protected]/
Signed-off-by: Valentin Schneider <[email protected]>
---
include/net/inet_timewait_sock.h | 6 ++--
net/dccp/minisocks.c | 9 +++--
net/ipv4/inet_timewait_sock.c | 59 +++++++++++++++++++++++---------
net/ipv4/tcp_minisocks.c | 9 +++--
4 files changed, 55 insertions(+), 28 deletions(-)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c4d64f1f8d415..93a50eed41872 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -93,8 +93,10 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
struct inet_timewait_death_row *dr,
const int state);
-void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
- struct inet_hashinfo *hashinfo);
+void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
+ struct sock *sk,
+ struct inet_hashinfo *hashinfo,
+ int timeo);
void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo,
bool rearm);
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 64d805b27adde..dd0b09553f142 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -58,11 +58,10 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
* we complete the initialization.
*/
local_bh_disable();
- inet_twsk_schedule(tw, timeo);
- /* Linkage updates.
- * Note that access to tw after this point is illegal.
- */
- inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
+ /* Linkage updates. */
+ inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo);
+ /* Access to tw after this point is illegal. */
+ inet_twsk_put(tw);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 7a2dcbaa1a61e..165b9cdf27e1a 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -44,14 +44,14 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
__sock_put((struct sock *)tw);
}
-/* Must be called with locally disabled BHs. */
-static void inet_twsk_kill(struct inet_timewait_sock *tw)
+static void __inet_twsk_kill(struct inet_timewait_sock *tw, spinlock_t *lock)
+__releases(lock)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
- spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
struct inet_bind_hashbucket *bhead, *bhead2;
- spin_lock(lock);
+ lockdep_assert_held(lock);
+
sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
@@ -71,6 +71,16 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
inet_twsk_put(tw);
}
+/* Must be called with locally disabled BHs. */
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
+{
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ spin_lock(lock);
+ __inet_twsk_kill(tw, lock);
+}
+
void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
@@ -97,8 +107,10 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
*/
-void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
- struct inet_hashinfo *hashinfo)
+void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
+ struct sock *sk,
+ struct inet_hashinfo *hashinfo,
+ int timeo)
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -129,26 +141,33 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
spin_lock(lock);
+ /* Step 2: Hash TW into tcp ehash chain */
inet_twsk_add_node_rcu(tw, &ehead->chain);
/* Step 3: Remove SK from hash chain */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- spin_unlock(lock);
- /* tw_refcnt is set to 3 because we have :
+ /* Ensure above writes are committed into memory before updating the
+ * refcount.
+ * Provides ordering vs later refcount_inc().
+ */
+ smp_wmb();
+ /* tw_refcnt is set to 4 because we have :
* - one reference for bhash chain.
* - one reference for ehash chain.
* - one reference for timer.
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
- * Also note that after this point, we lost our implicit reference
- * so we are not allowed to use tw anymore.
+ * - one reference for ourself (our caller will release it).
*/
- refcount_set(&tw->tw_refcnt, 3);
+ refcount_set(&tw->tw_refcnt, 4);
+
+ inet_twsk_schedule(tw, timeo);
+
+ spin_unlock(lock);
+
}
-EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
+EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule);
static void tw_expiry_workfn(struct work_struct *work)
{
@@ -219,8 +238,16 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
*/
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
- if (cancel_delayed_work_sync(&tw->tw_expiry_work))
- inet_twsk_kill(tw);
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ spin_lock(lock);
+ if (cancel_delayed_work_sync(&tw->tw_expiry_work)) {
+ /* releases @lock */
+ __inet_twsk_kill(tw, lock);
+ } else {
+ spin_unlock(lock);
+ }
inet_twsk_put(tw);
}
EXPORT_SYMBOL(inet_twsk_deschedule_put);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f0761f060a837..77e2515409e40 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -343,11 +343,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* we complete the initialization.
*/
local_bh_disable();
- inet_twsk_schedule(tw, timeo);
- /* Linkage updates.
- * Note that access to tw after this point is illegal.
- */
- inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo);
+ /* Linkage updates. */
+ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
+ /* Access to tw after this point is illegal. */
+ inet_twsk_put(tw);
local_bh_enable();
} else {
/* Sorry, if we're out of memory, just CLOSE this
--
2.43.0
On 15/04/24 14:35, Eric Dumazet wrote:
> On Mon, Apr 15, 2024 at 1:34 PM Valentin Schneider <[email protected]> wrote:
>>
>> Hi,
>>
>> This is v5 of the series where the tw_timer is un-pinned to get rid of
>> interferences in isolated CPUs setups.
>>
>> The first patch is a new one stemming from Jakub's bug reported. It's there
>> mainly to make the reviewing a bit easier, but as it changes behaviour it should
>> be squashed with the second one.
>>
>> Revisions
>> =========
>>
>> v4 -> v5
>> ++++++++
>>
>> o Rebased against latest Linus' tree
>> o Converted tw_timer into a delayed work following Jakub's bug report on v4
>> http://lore.kernel.org/r/[email protected]
>
> What was the issue again ?
>
> Please explain precisely why it was fundamentally tied to the use of
> timers (and this was not possible to fix the issue without
> adding work queues and more dependencies to TCP stack)
In v4 I added the use of the ehash lock to serialize arming the timewait
timer vs destroying it (inet_twsk_schedule() vs inet_twsk_deschedule_put()).
Unfortunately, it is invalid to hold a lock in a timer callback while also
holding that same lock in the context that synchronously destroys the timer.
AIUI the issue is as follows:
CPUx CPUy
spin_lock(foo);
<timer fires>
call_timer_fn()
spin_lock(foo) // blocks
timer_shutdown_sync()
__timer_delete_sync()
__try_to_del_timer_sync() // looped as long as timer is running
<deadlock>
In our case, we had in v4:
inet_twsk_deschedule_put()
spin_lock(ehash_lock);
tw_timer_handler()
inet_twsk_kill()
spin_lock(ehash_lock);
__inet_twsk_kill();
timer_shutdown_sync(&tw->tw_timer);
The fix here is to move the timer deletion to a non-timer
context. Workqueues fit the bill, and as the tw_timer_handler() would just queue
a work item, I converted it to a delayed_work.
Apologies for the delayed reply, I was away for most of last week.
On 16/04/24 17:01, Eric Dumazet wrote:
> On Mon, Apr 15, 2024 at 4:33 PM Valentin Schneider <[email protected]> wrote:
>>
>> On 15/04/24 14:35, Eric Dumazet wrote:
>> > On Mon, Apr 15, 2024 at 1:34 PM Valentin Schneider <[email protected]> wrote:
>> >> v4 -> v5
>> >> ++++++++
>> >>
>> >> o Rebased against latest Linus' tree
>> >> o Converted tw_timer into a delayed work following Jakub's bug report on v4
>> >> http://lore.kernel.org/r/[email protected]
>> >
>> > What was the issue again ?
>> >
>> > Please explain precisely why it was fundamentally tied to the use of
>> > timers (and this was not possible to fix the issue without
>> > adding work queues and more dependencies to TCP stack)
>>
>> In v4 I added the use of the ehash lock to serialize arming the timewait
>> timer vs destroying it (inet_twsk_schedule() vs inet_twsk_deschedule_put()).
>>
>> Unfortunately, holding a lock both in a timer callback and in the context
>> in which it is destroyed is invalid. AIUI the issue is as follows:
>>
>> CPUx CPUy
>> spin_lock(foo);
>> <timer fires>
>> call_timer_fn()
>> spin_lock(foo) // blocks
>> timer_shutdown_sync()
>> __timer_delete_sync()
>> __try_to_del_timer_sync() // looped as long as timer is running
>> <deadlock>
>>
>> In our case, we had in v4:
>>
>> inet_twsk_deschedule_put()
>> spin_lock(ehash_lock);
>> tw_timer_handler()
>> inet_twsk_kill()
>> spin_lock(ehash_lock);
>> __inet_twsk_kill();
>> timer_shutdown_sync(&tw->tw_timer);
>>
>> The fix here is to move the timer deletion to a non-timer
>> context. Workqueues fit the bill, and as the tw_timer_handler() would just queue
>> a work item, I converted it to a delayed_work.
>
> I do not like this delayed work approach.
>
> Adding more dependencies to the TCP stack is not very nice from a
> maintainer point of view.
>
> Why couldn't you call timer_shutdown_sync() before grabbing the lock ?
We need the timer_shutdown_sync() and mod_timer() of tw->tw_timer to be
serialized in some way. If they aren't, we have the following race:
tcp_time_wait()
inet_twsk_hashdance()
inet_twsk_deschedule_put()
// Returns 0 because not pending, but prevents future arming
timer_shutdown_sync()
inet_twsk_schedule()
// Returns 0 as if timer had been successfully armed
mod_timer()
This means inet_twsk_deschedule_put() doesn't end up calling
inet_twsk_kill() (because the timer wasn't pending when it got shutdown),
but inet_twsk_schedule() doesn't arm it either despite the hashdance()
having updated the refcounts.
If we leave the deschedule as a del_timer_sync(), the timer ends up armed
in inet_twsk_schedule(), but that means waiting for the timer to fire to
clean up the resources despite having called inet_twsk_deschedule_put().
Hello,
kernel test robot noticed "BUG:sleeping_function_called_from_invalid_context_at_kernel/workqueue.c" on:
commit: 1463958a05a90694cf63a6decf02983ef9a0b102 ("[PATCH v5 1/2] SQUASH: tcp/dcpp: Convert timewait timer into a delayed_work")
url: https://github.com/intel-lab-lkp/linux/commits/Valentin-Schneider/SQUASH-tcp-dcpp-Convert-timewait-timer-into-a-delayed_work/20240415-193744
base: https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH v5 1/2] SQUASH: tcp/dcpp: Convert timewait timer into a delayed_work
in testcase: ltp
version: ltp-x86_64-14c1f76-1_20240420
with following parameters:
test: net_stress.appl-ssh
compiler: gcc-13
test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-4770 CPU @ 3.40GHz (Haswell) with 16G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]
[ 157.135844][ C7] BUG: sleeping function called from invalid context at kernel/workqueue.c:2195
[ 157.144790][ C7] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 3686, name: sshd
[ 157.153204][ C7] preempt_count: 101, expected: 0
[ 157.158116][ C7] RCU nest depth: 4, expected: 0
[ 157.162939][ C7] CPU: 7 PID: 3686 Comm: sshd Tainted: G S 6.9.0-rc1-00332-g1463958a05a9 #1
[ 157.172905][ C7] Hardware name: Dell Inc. OptiPlex 9020/0DNKMN, BIOS A05 12/05/2013
[ 157.180861][ C7] Call Trace:
[ 157.184021][ C7] <IRQ>
[ 157.186743][ C7] dump_stack_lvl (lib/dump_stack.c:117)
[ 157.191131][ C7] __might_resched (kernel/sched/core.c:10198)
[ 157.195780][ C7] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 157.200951][ C7] __cancel_work_sync (include/linux/kernel.h:73 kernel/workqueue.c:2195 kernel/workqueue.c:4295)
[ 157.205773][ C7] ? __pfx___cancel_work_sync (kernel/workqueue.c:4290)
[ 157.211209][ C7] inet_twsk_deschedule_put (net/ipv4/inet_timewait_sock.c:222 (discriminator 1))
[ 157.216470][ C7] tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2413)
[ 157.220770][ C7] ? __pfx_tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2163)
[ 157.225501][ C7] ? __kernel_text_address (kernel/extable.c:79 (discriminator 1))
[ 157.230668][ C7] ? __pfx_raw_v4_input (net/ipv4/raw.c:165)
[ 157.235574][ C7] ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205 (discriminator 1))
[ 157.240830][ C7] ip_local_deliver_finish (include/linux/rcupdate.h:813 net/ipv4/ip_input.c:234)
[ 157.246171][ C7] ? tcp_wfree (net/ipv4/tcp_output.c:1225)
[ 157.250464][ C7] ip_local_deliver (include/linux/netfilter.h:314 include/linux/netfilter.h:308 net/ipv4/ip_input.c:254)
[ 157.255190][ C7] ? __pfx_ip_local_deliver (net/ipv4/ip_input.c:243)
[ 157.260442][ C7] ? ip_rcv_finish_core+0x1c0/0x10a0
[ 157.266219][ C7] ip_rcv (include/net/dst.h:460 (discriminator 4) net/ipv4/ip_input.c:449 (discriminator 4) include/linux/netfilter.h:314 (discriminator 4) include/linux/netfilter.h:308 (discriminator 4) net/ipv4/ip_input.c:569 (discriminator 4))
[ 157.270078][ C7] ? __pfx_ip_rcv (net/ipv4/ip_input.c:562)
[ 157.274459][ C7] ? do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1))
[ 157.279012][ C7] ? update_curr (kernel/sched/fair.c:1164 (discriminator 1))
[ 157.283395][ C7] ? update_load_avg (kernel/sched/fair.c:4411 kernel/sched/fair.c:4748)
[ 157.288300][ C7] ? __pfx_ip_rcv (net/ipv4/ip_input.c:562)
[ 157.292684][ C7] __netif_receive_skb_one_core (net/core/dev.c:5585 (discriminator 4))
[ 157.298462][ C7] ? __pfx___netif_receive_skb_one_core (net/core/dev.c:5578)
[ 157.304760][ C7] ? _raw_spin_lock_irq (arch/x86/include/asm/atomic.h:115 (discriminator 4) include/linux/atomic/atomic-arch-fallback.h:2170 (discriminator 4) include/linux/atomic/atomic-instrumented.h:1302 (discriminator 4) include/asm-generic/qspinlock.h:111 (discriminator 4) include/linux/spinlock.h:187 (discriminator 4) include/linux/spinlock_api_smp.h:120 (discriminator 4) kernel/locking/spinlock.c:170 (discriminator 4))
[ 157.309663][ C7] process_backlog (include/linux/rcupdate.h:813 net/core/dev.c:6029)
[ 157.314305][ C7] ? __pfx_trigger_load_balance (kernel/sched/fair.c:12435)
[ 157.319904][ C7] __napi_poll (net/core/dev.c:6679)
[ 157.324110][ C7] ? update_process_times (kernel/time/timer.c:2480)
[ 157.329276][ C7] net_rx_action (net/core/dev.c:6750 net/core/dev.c:6864)
[ 157.333748][ C7] ? __pfx_net_rx_action (net/core/dev.c:6828)
[ 157.338734][ C7] ? __pfx_sched_clock_cpu (kernel/sched/clock.c:389)
[ 157.343902][ C7] ? __pfx_sched_clock_cpu (kernel/sched/clock.c:389)
[ 157.349070][ C7] __do_softirq (arch/x86/include/asm/jump_label.h:27 include/linux/jump_label.h:207 include/trace/events/irq.h:142 kernel/softirq.c:555)
[ 157.353453][ C7] ? __pfx___do_softirq (kernel/softirq.c:512)
[ 157.358361][ C7] ? irqtime_account_irq (kernel/sched/cputime.c:64 (discriminator 1))
[ 157.363449][ C7] do_softirq (kernel/softirq.c:455 (discriminator 32) kernel/softirq.c:442 (discriminator 32))
[ 157.367488][ C7] </IRQ>
[ 157.370296][ C7] <TASK>
[ 157.373107][ C7] __local_bh_enable_ip (kernel/softirq.c:382)
[ 157.378015][ C7] __dev_queue_xmit (net/core/dev.c:4389)
[ 157.382836][ C7] ? unwind_next_frame (arch/x86/kernel/unwind_orc.c:406 (discriminator 1) arch/x86/kernel/unwind_orc.c:648 (discriminator 1))
[ 157.388007][ C7] ? __pfx___dev_queue_xmit (net/core/dev.c:4270)
[ 157.393263][ C7] ? kernel_text_address (kernel/extable.c:119 (discriminator 1) kernel/extable.c:94 (discriminator 1))
[ 157.398348][ C7] ? __kernel_text_address (kernel/extable.c:79 (discriminator 1))
[ 157.403516][ C7] ? unwind_get_return_address (arch/x86/kernel/unwind_orc.c:369 (discriminator 1))
[ 157.409035][ C7] ? arch_stack_walk (arch/x86/kernel/stacktrace.c:26)
[ 157.413769][ C7] ? eth_header (net/ethernet/eth.c:100)
[ 157.418065][ C7] ? neigh_resolve_output (include/linux/netdevice.h:3145 net/core/neighbour.c:1558 net/core/neighbour.c:1543)
[ 157.423324][ C7] ip_finish_output2 (include/net/neighbour.h:542 (discriminator 2) net/ipv4/ip_output.c:235 (discriminator 2))
[ 157.428226][ C7] ? arch_stack_walk (arch/x86/kernel/stacktrace.c:24 (discriminator 1))
[ 157.432953][ C7] ? __pfx_ip_skb_dst_mtu (include/net/ip.h:496)
[ 157.438041][ C7] ? __pfx_ip_finish_output2 (net/ipv4/ip_output.c:199)
[ 157.443394][ C7] ? __ip_finish_output (include/linux/skbuff.h:1636 include/linux/skbuff.h:4958 net/ipv4/ip_output.c:307 net/ipv4/ip_output.c:295)
[ 157.448381][ C7] ip_output (net/ipv4/ip_output.c:427)
[ 157.452495][ C7] ? __pfx_ip_output (net/ipv4/ip_output.c:427)
[ 157.457137][ C7] __ip_queue_xmit (net/ipv4/ip_output.c:535)
[ 157.461862][ C7] ? __copy_skb_header (include/net/dst.h:290 net/core/skbuff.c:1528)
[ 157.466765][ C7] ? __skb_clone (arch/x86/include/asm/atomic.h:53 include/linux/atomic/atomic-arch-fallback.h:992 include/linux/atomic/atomic-instrumented.h:436 net/core/skbuff.c:1599)
[ 157.471229][ C7] __tcp_transmit_skb (net/ipv4/tcp_output.c:1462 (discriminator 4))
[ 157.476317][ C7] ? __pfx___tcp_transmit_skb (net/ipv4/tcp_output.c:1283)
[ 157.481740][ C7] tcp_write_xmit (net/ipv4/tcp_output.c:2792 (discriminator 2))
[ 157.486379][ C7] ? skb_do_copy_data_nocache (include/linux/uio.h:204 include/linux/uio.h:211 include/net/sock.h:2238)
[ 157.491977][ C7] ? __pfx_skb_do_copy_data_nocache (include/net/sock.h:2229)
[ 157.497923][ C7] ? skb_page_frag_refill (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/atomic/atomic-instrumented.h:33 include/linux/page_ref.h:67 net/core/sock.c:2908)
[ 157.503083][ C7] __tcp_push_pending_frames (net/ipv4/tcp_output.c:2977 (discriminator 2))
[ 157.508505][ C7] tcp_sendmsg_locked (net/ipv4/tcp.c:1310)
[ 157.513578][ C7] ? __pfx_chacha_block_generic (lib/crypto/chacha.c:77)
[ 157.519173][ C7] ? __pfx_tcp_sendmsg_locked (net/ipv4/tcp.c:1040)
[ 157.524594][ C7] ? _raw_spin_lock_bh (arch/x86/include/asm/atomic.h:115 (discriminator 4) include/linux/atomic/atomic-arch-fallback.h:2170 (discriminator 4) include/linux/atomic/atomic-instrumented.h:1302 (discriminator 4) include/asm-generic/qspinlock.h:111 (discriminator 4) include/linux/spinlock.h:187 (discriminator 4) include/linux/spinlock_api_smp.h:127 (discriminator 4) kernel/locking/spinlock.c:178 (discriminator 4))
[ 157.529405][ C7] ? __pfx__raw_spin_lock_bh (kernel/locking/spinlock.c:177)
[ 157.534736][ C7] tcp_sendmsg (net/ipv4/tcp.c:1343)
[ 157.538850][ C7] sock_write_iter (net/socket.c:730 (discriminator 1) net/socket.c:745 (discriminator 1) net/socket.c:1160 (discriminator 1))
[ 157.543486][ C7] ? __pfx_sock_write_iter (net/socket.c:1144)
[ 157.548644][ C7] ? rw_verify_area (fs/read_write.c:377)
[ 157.553279][ C7] vfs_write (include/linux/fs.h:2108 fs/read_write.c:497 fs/read_write.c:590)
[ 157.557391][ C7] ? __pfx_vfs_write (fs/read_write.c:571)
[ 157.562026][ C7] ? __pfx___might_resched (kernel/sched/core.c:10152)
[ 157.567189][ C7] ksys_write (fs/read_write.c:643)
[ 157.571389][ C7] ? __pfx_ksys_write (fs/read_write.c:633)
[ 157.576112][ C7] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1))
[ 157.580486][ C7] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
[ 157.586254][ C7] RIP: 0033:0x7f568f9de240
[ 157.590542][ C7] Code: 40 00 48 8b 15 c1 9b 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d a1 23 0e 00 00 74 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89
All code
========
0: 40 00 48 8b add %cl,-0x75(%rax)
4: 15 c1 9b 0d 00 adc $0xd9bc1,%eax
9: f7 d8 neg %eax
b: 64 89 02 mov %eax,%fs:(%rdx)
e: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax
15: eb b7 jmp 0xffffffffffffffce
17: 0f 1f 00 nopl (%rax)
1a: 80 3d a1 23 0e 00 00 cmpb $0x0,0xe23a1(%rip) # 0xe23c2
21: 74 17 je 0x3a
23: b8 01 00 00 00 mov $0x1,%eax
28: 0f 05 syscall
2a:* 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax <-- trapping instruction
30: 77 58 ja 0x8a
32: c3 retq
33: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
3a: 48 83 ec 28 sub $0x28,%rsp
3e: 48 rex.W
3f: 89 .byte 0x89
Code starting with the faulting instruction
===========================================
0: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax
6: 77 58 ja 0x60
8: c3 retq
9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10: 48 83 ec 28 sub $0x28,%rsp
14: 48 rex.W
15: 89 .byte 0x89
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240426/[email protected]
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi,
On 22/04/24 16:31, Valentin Schneider wrote:
> Apologies for the delayed reply, I was away for most of last week;
>
> On 16/04/24 17:01, Eric Dumazet wrote:
>> On Mon, Apr 15, 2024 at 4:33 PM Valentin Schneider <[email protected]> wrote:
>>>
>>> On 15/04/24 14:35, Eric Dumazet wrote:
>>> > On Mon, Apr 15, 2024 at 1:34 PM Valentin Schneider <[email protected]> wrote:
>>> >> v4 -> v5
>>> >> ++++++++
>>> >>
>>> >> o Rebased against latest Linus' tree
>>> >> o Converted tw_timer into a delayed work following Jakub's bug report on v4
>>> >> http://lore.kernel.org/r/[email protected]
>>> >
>>> > What was the issue again ?
>>> >
>>> > Please explain precisely why it was fundamentally tied to the use of
>>> > timers (and this was not possible to fix the issue without
>>> > adding work queues and more dependencies to TCP stack)
>>>
>>> In v4 I added the use of the ehash lock to serialize arming the timewait
>>> timer vs destroying it (inet_twsk_schedule() vs inet_twsk_deschedule_put()).
>>>
>>> Unfortunately, holding a lock both in a timer callback and in the context
>>> in which it is destroyed is invalid. AIUI the issue is as follows:
>>>
>>> CPUx CPUy
>>> spin_lock(foo);
>>> <timer fires>
>>> call_timer_fn()
>>> spin_lock(foo) // blocks
>>> timer_shutdown_sync()
>>> __timer_delete_sync()
>>> __try_to_del_timer_sync() // looped as long as timer is running
>>> <deadlock>
>>>
>>> In our case, we had in v4:
>>>
>>> inet_twsk_deschedule_put()
>>> spin_lock(ehash_lock);
>>> tw_timer_handler()
>>> inet_twsk_kill()
>>> spin_lock(ehash_lock);
>>> __inet_twsk_kill();
>>> timer_shutdown_sync(&tw->tw_timer);
>>>
>>> The fix here is to move the timer deletion to a non-timer
>>> context. Workqueues fit the bill, and as the tw_timer_handler() would just queue
>>> a work item, I converted it to a delayed_work.
Does this explanation make sense? This is the reasoning that drove me to
involve workqueues. I'm open to suggestions on alternative approaches.