2022-03-11 00:39:54

by Luck, Tony

[permalink] [raw]
Subject: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
said:

Its's simply wishful thinking that stuff gets fixed because of a
WARN_ONCE(). This has never worked. The only thing which works is to
make stuff fail hard or slow it down in a way which makes it annoying
enough to users to complain.

He was talking about WBINVD. But it made me think about how we
use the split lock detection feature in Linux.

Existing code has three options for applications:
1) Don't enable split lock detection (allow arbitrary split locks)
2) Warn once when a process uses split lock, but let the process
keep running with split lock detection disabled
3) Kill process that use split locks

Option 2 falls into the "wishful thinking" territory that Thomas
warns does nothing. But option 3 might not be viable in a situation
with legacy applications that need to run.

Hence make option 2 much stricter to "slow it down in a way which makes
it annoying".

Primary reason for this change is to provide better quality of service
to the rest of the applications running on the system. Internal
testing shows that even with many processes splitting locks, performance
for the rest of the system is much more responsive.

The new "warn" mode operates like this. When an application tries to
execute a bus lock the #AC handler.

1) Delays (interruptibly) 10 ms before moving to next step.
2) Blocks (interruptibly) until it can get the semaphore
If interrupted, just return. Assume the signal will either
kill the task, or direct execution away from the instruction
that is trying to get the bus lock.
3) Disables split lock detection for the current core
4) Schedules a work queue to re-enable split lock detect in 2 jiffies
5) Returns

The work queue that re-enables split lock detection also releases the
semaphore.

There is a corner case where a CPU may be taken offline while
split lock detection is disabled. A CPU hotplug handler handles
this case.

Old behaviour was to only print the split lock warning on
the first occurrence of a split lock from a task. Preserve
that by adding a flag to the task structure that suppresses
subsequent split lock messages from that task.

Signed-off-by: Tony Luck <[email protected]>
---
arch/x86/kernel/cpu/intel.c | 65 +++++++++++++++++++++++++++++++------
include/linux/sched.h | 3 ++
kernel/fork.c | 5 +++
3 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8321c43554a1..2536784511e3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,10 +7,13 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/workqueue.h>
#include <linux/delay.h>
+#include <linux/cpuhotplug.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -1006,6 +1009,8 @@ static const struct {

static struct ratelimit_state bld_ratelimit;

+static DEFINE_SEMAPHORE(buslock_sem);
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt), ratelimit;
@@ -1116,18 +1121,54 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}

+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to
+ * re-enable split lock detection then the delayed work
+ * will be executed on some other CPU. That handles releasing
+ * the buslock_sem, but because it executes on a different
+ * CPU probably won't re-enable split lock detection. This
+ * is a problem on HT systems since the sibling CPU on the
+ * same core may then be left running with split lock
+ * detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
static void split_lock_warn(unsigned long ip)
{
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
+ int cpu;

- /*
- * Disable the split lock detection for this task so it can make
- * progress and set TIF_SLD so the detection is re-enabled via
- * switch_to_sld() when the task is scheduled out.
- */
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ /* misery factor #1, sleep 10ms before trying to execute split lock */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /* Misery factor #2, only allow one buslocked disabled core at a time */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
- set_tsk_thread_flag(current, TIF_SLD);
+ put_cpu();
}

bool handle_guest_split_lock(unsigned long ip)
@@ -1281,10 +1322,14 @@ static void sld_state_show(void)
pr_info("disabled\n");
break;
case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
pr_info("#DB: warning on user-space bus_locks\n");
+ }
break;
case sld_fatal:
if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..ffa7166e23d6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -938,6 +938,9 @@ struct task_struct {
/* Recursion prevention for eventfd_signal() */
unsigned in_eventfd_signal:1;
#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ unsigned reported_split_lock:1;
+#endif

unsigned long atomic_flags; /* Flags requiring atomic access. */

diff --git a/kernel/fork.c b/kernel/fork.c
index f1e89007f228..085d68d143b2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -970,6 +970,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ tsk->reported_split_lock = 0;
+#endif
+
return tsk;

free_stack:
--
2.35.1


2022-03-17 13:46:06

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

Hi!

> In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
> said:
>
> Its's simply wishful thinking that stuff gets fixed because of a
> WARN_ONCE(). This has never worked. The only thing which works is to
> make stuff fail hard or slow it down in a way which makes it annoying
> enough to users to complain.
>
> He was talking about WBINVD. But it made me think about how we
> use the split lock detection feature in Linux.
>
> Existing code has three options for applications:
> 1) Don't enable split lock detection (allow arbitrary split locks)
> 2) Warn once when a process uses split lock, but let the process
> keep running with split lock detection disabled
> 3) Kill process that use split locks

I'm not sure what split locks are, and if you want applications to
stop doing that maybe documentation would help.

Anyway, you can't really introduce regressions to userspace to "get
stuff fixed" in applications.
Pavel
--
DENX Software Engineering GmbH, Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany


Attachments:
(No filename) (1.10 kB)
signature.asc (188.00 B)
Digital signature
Download all attachments

2022-03-17 19:48:06

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

Pavel,

> I'm not sure what split locks are, and if you want applications to
> stop doing that maybe documentation would help.

See existing Documentation/x86/buslock.rst

> Anyway, you can't really introduce regressions to userspace to "get
> stuff fixed" in applications.

Applications can inflict pain on the system with atomic operations
on unaligned operands that cross cache line boundaries. This
is just pushing some pain back to the applications.

An alternate title for this patch series could have been:

"Split-locks: The OS strikes back"

Note that there are few applications that actually split locks.
Normal compiler alignment rules generally avoid them.
Applications that run on non-x86 architectures have to
avoid them because few others allow atomic operations
on unaligned operands.

-Tony

2022-03-17 20:42:26

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

On Thu, Mar 17 2022 at 12:13, Pavel Machek wrote:
>> In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
>> said:
>>
>> Its's simply wishful thinking that stuff gets fixed because of a
>> WARN_ONCE(). This has never worked. The only thing which works is to
>> make stuff fail hard or slow it down in a way which makes it annoying
>> enough to users to complain.
>>
>> He was talking about WBINVD. But it made me think about how we
>> use the split lock detection feature in Linux.
>>
>> Existing code has three options for applications:
>> 1) Don't enable split lock detection (allow arbitrary split locks)
>> 2) Warn once when a process uses split lock, but let the process
>> keep running with split lock detection disabled
>> 3) Kill process that use split locks
>
> I'm not sure what split locks are, and if you want applications to
> stop doing that maybe documentation would help.

Split locks are lock prefixed operations which cross a cache line
boundary. The way how they are implemented is taking the bus lock, which
is the largest serialization hammer.

Bus locks are also triggered by lock prefixed operations on uncached
memory and on MMIO.

> Anyway, you can't really introduce regressions to userspace to "get
> stuff fixed" in applications.

Split locks can be triggered by unpriviledged user space and can be used
to run a local DoS attack. A very effective DoS to be clear.

We have no indication whether a process is malicious or just doing
stupid things. The only reason to not kill the offending process
instantly is that there can be legacy binary only executables which
trigger that due to stupidity.

But that opens up DoS at the same time. So the only way to "protect"
against that is to slow down the freqeuncy of buslocks unconditionally.
If you don't like that after analysing the situation you can turn split
lock detection off.

The only binary I've seen so far triggering this, is some stoneage "value
add" blob from HP which is also doing totally nuts other things.

Thanks,

tglx

2022-03-17 22:31:39

by David Laight

[permalink] [raw]
Subject: RE: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

From: Thomas Gleixner
> Sent: 17 March 2022 18:07
>
> On Thu, Mar 17 2022 at 12:13, Pavel Machek wrote:
> >> In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
> >> said:
> >>
> >> Its's simply wishful thinking that stuff gets fixed because of a
> >> WARN_ONCE(). This has never worked. The only thing which works is to
> >> make stuff fail hard or slow it down in a way which makes it annoying
> >> enough to users to complain.
> >>
> >> He was talking about WBINVD. But it made me think about how we
> >> use the split lock detection feature in Linux.
> >>
> >> Existing code has three options for applications:
> >> 1) Don't enable split lock detection (allow arbitrary split locks)
> >> 2) Warn once when a process uses split lock, but let the process
> >> keep running with split lock detection disabled
> >> 3) Kill process that use split locks
> >
> > I'm not sure what split locks are, and if you want applications to
> > stop doing that maybe documentation would help.
>
> Split locks are lock prefixed operations which cross a cache line
> boundary. The way how they are implemented is taking the bus lock, which
> is the largest serialization hammer.
>
> Bus locks are also triggered by lock prefixed operations on uncached
> memory and on MMIO.
>
> > Anyway, you can't really introduce regressions to userspace to "get
> > stuff fixed" in applications.
>
> Split locks can be triggered by unpriviledged user space and can be used
> to run a local DoS attack. A very effective DoS to be clear.
>
> We have no indication whether a process is malicious or just doing
> stupid things. The only reason to not kill the offending process
> instantly is that there can be legacy binary only executables which
> trigger that due to stupidity.
>
> But that opens up DoS at the same time. So the only way to "protect"
> against that is to slow down the freqeuncy of buslocks unconditionally.
> If you don't like that after analysing the situation you can turn split
> lock detection off.
>
> The only binary I've seen so far triggering this, is some stoneage "value
> add" blob from HP which is also doing totally nuts other things.

They are actually more likely to happen in the kernel
when code casts int[] to long[] and then uses the 'BIT' functions to
set/clear bits - which do locked operations.
Quite often then don't need the locks anyway.
And that cast is surprisingly common and completely broken on BE.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2022-03-17 23:05:42

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

> They are actually more likely to happen in the kernel
> when code casts int[] to long[] and then uses the 'BIT' functions to
> set/clear bits - which do locked operations.
> Quite often then don't need the locks anyway.
> And that cast is surprisingly common and completely broken on BE.

Default split lock mode is "kernel dies". We had to fix up a dozen or
so places (mostly involving set_bit/clr_bit as you describe). But
all Ice Lake. Tiger Lake and Alder Lake systems are running in
that mode ... and we haven't heard of widespread death (or disabling
the boot option).

-Tony

2022-03-18 14:30:38

by Fenghua Yu

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] x86/split_lock: Make life miserable for split lockers

On Thu, Mar 17, 2022 at 03:40:16PM -0700, Luck, Tony wrote:
> > They are actually more likely to happen in the kernel
> > when code casts int[] to long[] and then uses the 'BIT' functions to
> > set/clear bits - which do locked operations.
> > Quite often then don't need the locks anyway.
> > And that cast is surprisingly common and completely broken on BE.
>
> Default split lock mode is "kernel dies". We had to fix up a dozen or
> so places (mostly involving set_bit/clr_bit as you describe). But
> all Ice Lake. Tiger Lake and Alder Lake systems are running in
> that mode ... and we haven't heard of widespread death (or disabling
> the boot option).

Split lock is designed to report this kind of "hidden" performance issues.
After initial fix up wave, it has been sparse to find any legacy split lock
issue in the kernel.

If a new split lock issue is introduced, the developer hopefully will
capture and fix it before upstream. So others won't see it any more.

Thanks.

-Fenghua

Subject: [tip: x86/splitlock] x86/split_lock: Make life miserable for split lockers

The following commit has been merged into the x86/splitlock branch of tip:

Commit-ID: b041b525dab95352fbd666b14dc73ab898df465f
Gitweb: https://git.kernel.org/tip/b041b525dab95352fbd666b14dc73ab898df465f
Author: Tony Luck <[email protected]>
AuthorDate: Thu, 10 Mar 2022 12:48:53 -08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Wed, 27 Apr 2022 15:43:38 +02:00

x86/split_lock: Make life miserable for split lockers

In https://lore.kernel.org/all/87y22uujkm.ffs@tglx/ Thomas
said:

Its's simply wishful thinking that stuff gets fixed because of a
WARN_ONCE(). This has never worked. The only thing which works is to
make stuff fail hard or slow it down in a way which makes it annoying
enough to users to complain.

He was talking about WBINVD. But it made me think about how we use the
split lock detection feature in Linux.

Existing code has three options for applications:

1) Don't enable split lock detection (allow arbitrary split locks)
2) Warn once when a process uses split lock, but let the process
keep running with split lock detection disabled
3) Kill process that use split locks

Option 2 falls into the "wishful thinking" territory that Thomas warns does
nothing. But option 3 might not be viable in a situation with legacy
applications that need to run.

Hence make option 2 much stricter to "slow it down in a way which makes
it annoying".

Primary reason for this change is to provide better quality of service to
the rest of the applications running on the system. Internal testing shows
that even with many processes splitting locks, performance for the rest of
the system is much more responsive.

The new "warn" mode operates like this. When an application tries to
execute a bus lock the #AC handler.

1) Delays (interruptibly) 10 ms before moving to next step.

2) Blocks (interruptibly) until it can get the semaphore
If interrupted, just return. Assume the signal will either
kill the task, or direct execution away from the instruction
that is trying to get the bus lock.
3) Disables split lock detection for the current core
4) Schedules a work queue to re-enable split lock detect in 2 jiffies
5) Returns

The work queue that re-enables split lock detection also releases the
semaphore.

There is a corner case where a CPU may be taken offline while split lock
detection is disabled. A CPU hotplug handler handles this case.

Old behaviour was to only print the split lock warning on the first
occurrence of a split lock from a task. Preserve that by adding a flag to
the task structure that suppresses subsequent split lock messages from that
task.

Signed-off-by: Tony Luck <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
arch/x86/kernel/cpu/intel.c | 63 ++++++++++++++++++++++++++++++------
include/linux/sched.h | 3 ++-
kernel/fork.c | 5 +++-
3 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f7a5370..be2a0bd 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,10 +7,13 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/workqueue.h>
#include <linux/delay.h>
+#include <linux/cpuhotplug.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -999,6 +1002,8 @@ static const struct {

static struct ratelimit_state bld_ratelimit;

+static DEFINE_SEMAPHORE(buslock_sem);
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt), ratelimit;
@@ -1109,18 +1114,52 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}

+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
static void split_lock_warn(unsigned long ip)
{
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
+ int cpu;

- /*
- * Disable the split lock detection for this task so it can make
- * progress and set TIF_SLD so the detection is re-enabled via
- * switch_to_sld() when the task is scheduled out.
- */
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ /* misery factor #1, sleep 10ms before trying to execute split lock */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /* Misery factor #2, only allow one buslocked disabled core at a time */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
- set_tsk_thread_flag(current, TIF_SLD);
+ put_cpu();
}

bool handle_guest_split_lock(unsigned long ip)
@@ -1274,10 +1313,14 @@ static void sld_state_show(void)
pr_info("disabled\n");
break;
case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
pr_info("#DB: warning on user-space bus_locks\n");
+ }
break;
case sld_fatal:
if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a8911b1..23e03c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -941,6 +941,9 @@ struct task_struct {
#ifdef CONFIG_IOMMU_SVA
unsigned pasid_activated:1;
#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ unsigned reported_split_lock:1;
+#endif

unsigned long atomic_flags; /* Flags requiring atomic access. */

diff --git a/kernel/fork.c b/kernel/fork.c
index 9796897..f39795f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1045,6 +1045,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ tsk->reported_split_lock = 0;
+#endif
+
return tsk;

free_stack: