2022-05-02 09:40:17

by Thomas Gleixner

Subject: [patch 3/3] x86/fpu: Make FPU protection more robust

FPU state maintenance is protected by fpregs_lock(), which is a wrapper
around local_bh_disable() on non-RT kernels and preempt_disable() on RT
kernels. In-kernel FPU usage has its own protection via a per CPU
variable.

This separation is pointless and error-prone as a recently discovered wrong
condition for granting in-kernel FPU usage has shown.

Make the whole FPU state protection simpler and more robust by using the
per CPU usage variable for all FPU operations so state is tracked
consistently.

Change the related WARN_ON_FPU() instances to WARN_ON_ONCE() as WARN_ON_FPU()
depends on the optional CONFIG_X86_DEBUG_FPU and can therefore hide
inconsistencies for a potentially long time.

Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/include/asm/fpu/api.h | 17 +-------
arch/x86/kernel/fpu/core.c | 78 ++++++++++++++++++++++++++---------------
2 files changed, 52 insertions(+), 43 deletions(-)

--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -66,21 +66,8 @@ static inline void kernel_fpu_begin(void
*
* Disabling preemption also serializes against kernel_fpu_begin().
*/
-static inline void fpregs_lock(void)
-{
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- local_bh_disable();
- else
- preempt_disable();
-}
-
-static inline void fpregs_unlock(void)
-{
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- local_bh_enable();
- else
- preempt_enable();
-}
+extern void fpregs_lock(void);
+extern void fpregs_unlock(void);

#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -42,7 +42,7 @@ struct fpu_state_config fpu_user_cfg __r
struct fpstate init_fpstate __ro_after_init;

/* Track in-kernel FPU usage */
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
+static DEFINE_PER_CPU(bool, fpu_in_use);

/*
* Track which context is using the FPU on the CPU:
@@ -50,6 +50,50 @@ static DEFINE_PER_CPU(bool, in_kernel_fp
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/**
+ * fpregs_lock - Lock FPU state for maintenance operations
+ *
+ * This protects against preemption, soft interrupts and in-kernel FPU
+ * usage on both !RT and RT enabled kernels.
+ *
+ * !RT kernels use local_bh_disable() to prevent soft interrupt processing
+ * and preemption.
+ *
+ * On RT kernels local_bh_disable() is not sufficient because it only
+ * serializes soft interrupt related sections via a local lock, but stays
+ * preemptible. Disabling preemption is the right choice here as bottom
+ * half processing is always in thread context on RT kernels so it
+ * implicitly prevents bottom half processing as well.
+ */
+void fpregs_lock(void)
+{
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_disable();
+ else
+ preempt_disable();
+
+ WARN_ON_ONCE(this_cpu_read(fpu_in_use));
+ this_cpu_write(fpu_in_use, true);
+}
+EXPORT_SYMBOL_GPL(fpregs_lock);
+
+/**
+ * fpregs_unlock - Unlock FPU state after maintenance operations
+ *
+ * Counterpart to fpregs_lock().
+ */
+void fpregs_unlock(void)
+{
+ WARN_ON_ONCE(!this_cpu_read(fpu_in_use));
+ this_cpu_write(fpu_in_use, false);
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_bh_enable();
+ else
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpregs_unlock);
+
+/**
* kernel_fpu_usable - Check whether kernel FPU usage is possible
*
* Has to be invoked before calling kernel_fpu_begin().
@@ -59,28 +103,7 @@ bool kernel_fpu_usable(void)
if (WARN_ON_ONCE(in_nmi()))
return false;

- /* In kernel FPU usage already active? */
- if (this_cpu_read(in_kernel_fpu))
- return false;
-
- /*
- * When not in NMI or hard interrupt context, FPU can be used:
- *
- * - Task context is safe except from within fpregs_lock()'ed
- * critical regions.
- *
- * - Soft interrupt processing context which cannot happen
- * while in a fpregs_lock()'ed critical region.
- */
- if (!in_hardirq())
- return true;
-
- /*
- * In hard interrupt context it's safe when soft interrupts
- * are enabled, which means the interrupt did not hit in
- * a fpregs_lock()'ed critical region.
- */
- return !softirq_count();
+ return !this_cpu_read(fpu_in_use);
}
EXPORT_SYMBOL(kernel_fpu_usable);

@@ -410,10 +433,9 @@ void kernel_fpu_begin_mask(unsigned int
{
preempt_disable();

- WARN_ON_FPU(!kernel_fpu_usable());
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+ WARN_ON_ONCE(!kernel_fpu_usable());

- this_cpu_write(in_kernel_fpu, true);
+ this_cpu_write(fpu_in_use, true);

if (!(current->flags & PF_KTHREAD) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
@@ -433,9 +455,9 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask)

void kernel_fpu_end(void)
{
- WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+ WARN_ON_ONCE(!this_cpu_read(fpu_in_use));

- this_cpu_write(in_kernel_fpu, false);
+ this_cpu_write(fpu_in_use, false);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);


2022-05-02 23:14:06

by Thomas Gleixner

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

On Mon, May 02 2022 at 16:35, Borislav Petkov wrote:
> On Sun, May 01, 2022 at 09:31:47PM +0200, Thomas Gleixner wrote:
>> /**
>> + * fpregs_lock - Lock FPU state for maintenance operations
>
> "maintenance"?

I meant maintenance of user thread FPU state. Let me rephrase.

>> + *
>> + * This protects against preemption, soft interrupts and in-kernel FPU
>> + * usage on both !RT and RT enabled kernels.
>> + *
>> + * !RT kernels use local_bh_disable() to prevent soft interrupt processing
>> + * and preemption.
>> + *
>> + * On RT kernels local_bh_disable() is not sufficient because it only
>> + * serializes soft interrupt related sections via a local lock, but stays
>> + * preemptible. Disabling preemption is the right choice here as bottom
>> + * half processing is always in thread context on RT kernels so it
>> + * implicitly prevents bottom half processing as well.
>> + */
>> +void fpregs_lock(void)
>> +{
>> + if (!IS_ENABLED(CONFIG_PREEMPT_RT))
>> + local_bh_disable();
>> + else
>> + preempt_disable();
>
> So I'm wondering: can we get rid of this distinction and simply do
> preempt_disable()?
>
> Or can FPU be used in softirq processing too so we want to block that
> there?

Yes, FPU can be used legitimately in softirq processing context.

> But even if, fpu_in_use will already state that fact...

Right, though currently it's guaranteed that softirq processing context
can use the FPU. Quite some of the network crypto work runs in softirq
context, so this might cause a regression. If so, then this needs to be
an explicit commit on top which is easy to revert. Let me stare at it
some more.
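
For context, the softirq users follow the usual SIMD fallback pattern. A
minimal sketch (not from any particular driver; crypto_simd_usable(),
kernel_fpu_begin() and kernel_fpu_end() are the real interfaces, while
crypt_ctx, simd_crypt() and generic_crypt() are placeholder names):

#include <crypto/internal/simd.h>	/* crypto_simd_usable() */
#include <asm/fpu/api.h>		/* kernel_fpu_begin()/kernel_fpu_end() */

static void crypt_walk(struct crypt_ctx *ctx, u8 *dst, const u8 *src,
		       unsigned int len)
{
	/* Runs in softirq or task context */
	if (!crypto_simd_usable()) {
		/* FPU not available here, e.g. fpu_in_use already set */
		generic_crypt(ctx, dst, src, len);
		return;
	}

	kernel_fpu_begin();		/* claim the FPU, preemption off */
	simd_crypt(ctx, dst, src, len);
	kernel_fpu_end();
}

If the usable check (kernel_fpu_usable() in this series) starts returning
false more often in softirq context, more of these calls take the generic
path; that is the potential regression mentioned above.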

>> @@ -410,10 +433,9 @@ void kernel_fpu_begin_mask(unsigned int
>> {
>> preempt_disable();
>>
>> - WARN_ON_FPU(!kernel_fpu_usable());
>> - WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
>> + WARN_ON_ONCE(!kernel_fpu_usable());
>>
>> - this_cpu_write(in_kernel_fpu, true);
>> + this_cpu_write(fpu_in_use, true);
>
> This starts to look awfully similar to fpregs_lock()...

Similar, but not identical, and we cannot use fpregs_lock() here because we
don't want local_bh_disable() when in hard interrupt context.

>> if (!(current->flags & PF_KTHREAD) &&
>> !test_thread_flag(TIF_NEED_FPU_LOAD)) {
>> @@ -433,9 +455,9 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask)
>>
>> void kernel_fpu_end(void)
>> {
>> - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
>> + WARN_ON_ONCE(!this_cpu_read(fpu_in_use));
>>
>> - this_cpu_write(in_kernel_fpu, false);
>> + this_cpu_write(fpu_in_use, false);
>> preempt_enable();
>
> ... and this to fpregs_unlock().
>
> Can we use those here too instead of open-coding them mostly?

Not really, unless we drop the guarantee that the FPU can be used in softirq
processing context. See above.

Let me think about it.

Thanks,

tglx

2022-05-03 00:19:31

by Borislav Petkov

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

On Sun, May 01, 2022 at 09:31:47PM +0200, Thomas Gleixner wrote:
> --- a/arch/x86/kernel/fpu/core.c
> +++ b/arch/x86/kernel/fpu/core.c
> @@ -42,7 +42,7 @@ struct fpu_state_config fpu_user_cfg __r
> struct fpstate init_fpstate __ro_after_init;
>
> /* Track in-kernel FPU usage */
> -static DEFINE_PER_CPU(bool, in_kernel_fpu);
> +static DEFINE_PER_CPU(bool, fpu_in_use);
>
> /*
> * Track which context is using the FPU on the CPU:
> @@ -50,6 +50,50 @@ static DEFINE_PER_CPU(bool, in_kernel_fp
> DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
>
> /**
> + * fpregs_lock - Lock FPU state for maintenance operations

"maintenance"?

> + *
> + * This protects against preemption, soft interrupts and in-kernel FPU
> + * usage on both !RT and RT enabled kernels.
> + *
> + * !RT kernels use local_bh_disable() to prevent soft interrupt processing
> + * and preemption.
> + *
> + * On RT kernels local_bh_disable() is not sufficient because it only
> + * serializes soft interrupt related sections via a local lock, but stays
> + * preemptible. Disabling preemption is the right choice here as bottom
> + * half processing is always in thread context on RT kernels so it
> + * implicitly prevents bottom half processing as well.
> + */
> +void fpregs_lock(void)
> +{
> + if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> + local_bh_disable();
> + else
> + preempt_disable();

So I'm wondering: can we get rid of this distinction and simply do
preempt_disable()?

Or can FPU be used in softirq processing too so we want to block that
there?

But even if, fpu_in_use will already state that fact...

...

> @@ -410,10 +433,9 @@ void kernel_fpu_begin_mask(unsigned int
> {
> preempt_disable();
>
> - WARN_ON_FPU(!kernel_fpu_usable());
> - WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
> + WARN_ON_ONCE(!kernel_fpu_usable());
>
> - this_cpu_write(in_kernel_fpu, true);
> + this_cpu_write(fpu_in_use, true);

This starts to look awfully similar to fpregs_lock()...

>
> if (!(current->flags & PF_KTHREAD) &&
> !test_thread_flag(TIF_NEED_FPU_LOAD)) {
> @@ -433,9 +455,9 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask)
>
> void kernel_fpu_end(void)
> {
> - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
> + WARN_ON_ONCE(!this_cpu_read(fpu_in_use));
>
> - this_cpu_write(in_kernel_fpu, false);
> + this_cpu_write(fpu_in_use, false);
> preempt_enable();

... and this to fpregs_unlock().

Can we use those here too instead of open-coding them mostly?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-05-03 09:27:15

by Peter Zijlstra

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

On Mon, May 02, 2022 at 05:58:40PM +0200, Thomas Gleixner wrote:

> >> +void fpregs_lock(void)
> >> +{
> >> + if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> >> + local_bh_disable();
> >> + else
> >> + preempt_disable();
> >
> > So I'm wondering: can we get rid of this distinction and simply do
> > preempt_disable()?
> >
> > Or can FPU be used in softirq processing too so we want to block that
> > there?
>
> Yes, FPU can be used legitimately in softirq processing context.
>
> > But even if, fpu_in_use will already state that fact...
>
> Right, though currently it's guaranteed that softirq processing context
> can use the FPU. Quite some of the network crypto work runs in softirq
> context, so this might cause a regression. If so, then this needs to be
> an explicit commit on top which is easy to revert. Let me stare at it
> some more.

Right, so with the:

preempt_disable();
this_cpu_write(fpu_in_use, true);
barrier();

sequence it is safe against both softirq and hardirq fpu usage. The only
concern is performance not correctness when dropping that
local_bh_disable() thing.

So what Thomas proposes makes sense to me.

2022-05-03 11:28:46

by Peter Zijlstra

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

On Sun, May 01, 2022 at 09:31:47PM +0200, Thomas Gleixner wrote:
> +void fpregs_lock(void)
> +{
> + if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> + local_bh_disable();
> + else
> + preempt_disable();
> +
> + WARN_ON_ONCE(this_cpu_read(fpu_in_use));
> + this_cpu_write(fpu_in_use, true);

barrier();
> +}
> +EXPORT_SYMBOL_GPL(fpregs_lock);

> +void fpregs_unlock(void)
> +{
barrier();

> + WARN_ON_ONCE(!this_cpu_read(fpu_in_use));
> + this_cpu_write(fpu_in_use, false);
> +
> + if (!IS_ENABLED(CONFIG_PREEMPT_RT))
> + local_bh_enable();
> + else
> + preempt_enable();
> +}
> +EXPORT_SYMBOL_GPL(fpregs_unlock);

I think this isn't currently a problem because a function call is a C
sequence point, but 'funnily' C doesn't preserve sequence points when
inlining, so LTO can actually break this without a barrier() in place.
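
Put together, the lock/unlock pair with the suggested barrier()s would look
roughly like this (a sketch combining the posted patch with the placement
above, not a new posting):

void fpregs_lock(void)
{
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_bh_disable();
	else
		preempt_disable();

	WARN_ON_ONCE(this_cpu_read(fpu_in_use));
	this_cpu_write(fpu_in_use, true);
	/* Keep the compiler from hoisting protected accesses above the write */
	barrier();
}

void fpregs_unlock(void)
{
	/* Keep the compiler from sinking protected accesses below the clear */
	barrier();
	WARN_ON_ONCE(!this_cpu_read(fpu_in_use));
	this_cpu_write(fpu_in_use, false);

	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_bh_enable();
	else
		preempt_enable();
}

The barrier()s only constrain the compiler; the per-CPU write itself is what
an interrupting kernel_fpu_usable() call observes.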

2022-05-05 12:19:56

by Thomas Gleixner

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

Jason,

On Wed, May 04 2022 at 21:05, Jason A. Donenfeld wrote:
> The other stance is that these input/disk events are relatively rare --
> compared to, say, a storm of interrupts from a NIC -- so mixing into the
> input pool from there isn't actually a problem, and we benefit from the
> quasi domain-specific accounting and the superior mixing function,
> there, so keep it around. And the non-raw spinlock on the input pool
> won't negatively affect RT from this context, because all its callers on
> RT should be threaded.

I'm not worried about RT here.

> The second stance seems easier and more conservative from a certain
> perspective -- we don't need to change anything -- so I'm more inclined
> toward it.

That's not conservative, that's lazy and lame. Staying with the status
quo and piling more stuff on top because we can is just increasing
technical debt. Works for a while by some definition of works.

> And given that you've fixed the bug now, it sounds like that's fine
> with you too. But if you're thinking about it differently in fact, let
> me know.

That still does not address my observation that using the FPU for this
mixing, which is handling a couple of bytes per invocation, is not
really beneficial.

Which in turn raises the question of why we have to maintain an asymmetric
FPU protection mechanism in order to support hard interrupt FPU usage
for no or questionable benefit.

The current implementation, courtesy of hard interrupt support, has the
following downside:

Any FPU usage in task context where soft interrupts are enabled will
prevent FPU usage in soft interrupt processing when the interrupt hits
into the FPU usage region. That means the softirq processing has to
fall back to the generic implementations.
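
Spelled out, the problematic interleaving looks roughly like this (a sketch
of the sequence, not actual code):

	/* task context, softirqs enabled */
	kernel_fpu_begin();	/* fpu_in_use = true, preemption disabled */
	...			/* hard interrupt arrives */
		/* irq_exit() runs pending softirqs */
		/* softirq: kernel_fpu_usable() -> false (fpu_in_use set) */
		/* softirq crypto falls back to the generic implementation */
	...			/* FPU register usage continues */
	kernel_fpu_end();	/* fpu_in_use = false */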

Sure, the protection could be context dependent, but that's generally
frowned upon. If we go there, then there has to be a really convincing
technical argument.

Thanks,

tglx

2022-05-07 05:27:07

by Jason A. Donenfeld

Subject: Re: [patch 3/3] x86/fpu: Make FPU protection more robust

Hey again Thomas,

On Thu, May 05, 2022 at 01:02:02PM +0200, Jason A. Donenfeld wrote:
> Interestingly, disabling the simd paths makes things around 283 cycles
> slower on my Tiger Lake laptop, just doing ordinary things. I'm actually
> slightly surprised, so I'll probably keep playing with this. My patch
> for this is attached. Let me know if you have a different methodology in
> mind...

Using RDPMC/perf, the performance is shown to be even closer for real
world cases, with the simd code only ~80 cycles faster. Bench code
follows below. If the observation on this hardware holds for other
hardware, we can probably improve the performance of the generic code a
bit, and then the difference really won't matter. Any thoughts about
this and the test code?

Jason

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bd292927654c..6577e9f2f3b7 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -53,6 +53,7 @@
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
+#include <linux/sort.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <asm/processor.h>
@@ -755,9 +756,54 @@ static struct {
.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

+static DEFINE_PER_CPU(int, pmc_index) = -1;
+static struct {
+ u32 durations[1 << 20];
+ u32 pos, len;
+} irqbench;
+
static void _mix_pool_bytes(const void *in, size_t nbytes)
{
+ int idx = *this_cpu_ptr(&pmc_index);
+ u32 ctr = input_pool.hash.t[0], reg = 0;
+ cycles_t end, start;
+
+
+ native_cpuid(&reg, &reg, &reg, &reg);
+ start = idx == -1 ? 0 : native_read_pmc(idx);
blake2s_update(&input_pool.hash, in, nbytes);
+ end = idx == -1 ? 0 : native_read_pmc(idx);
+
+ if (ctr == input_pool.hash.t[0] || !in_hardirq() || idx == -1)
+ return;
+
+ irqbench.durations[irqbench.pos++ % ARRAY_SIZE(irqbench.durations)] = end - start;
+ irqbench.len = min_t(u32, irqbench.len + 1, ARRAY_SIZE(irqbench.durations));
+}
+
+static int cmp_u32(const void *a, const void *b)
+{
+ return *(const u32 *)a - *(const u32 *)b;
+}
+
+static int proc_do_irqbench_median(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ u32 len = READ_ONCE(irqbench.len), median, *sorted;
+ struct ctl_table fake_table = {
+ .data = &median,
+ .maxlen = sizeof(median)
+ };
+ if (!len)
+ return -ENODATA;
+ sorted = kmalloc_array(len, sizeof(*sorted), GFP_KERNEL);
+ if (!sorted)
+ return -ENOMEM;
+ memcpy(sorted, irqbench.durations, len * sizeof(*sorted));
+ sort(sorted, len, sizeof(*sorted), cmp_u32, NULL);
+ median = sorted[len / 2];
+ kfree(sorted);
+ return write ? 0 : proc_douintvec(&fake_table, 0, buffer, lenp, ppos);
}

/*
@@ -1709,6 +1755,18 @@ static struct ctl_table random_table[] = {
.mode = 0444,
.proc_handler = proc_do_uuid,
},
+ {
+ .procname = "irqbench_median",
+ .mode = 0444,
+ .proc_handler = proc_do_irqbench_median,
+ },
+ {
+ .procname = "irqbench_count",
+ .data = &irqbench.len,
+ .maxlen = sizeof(irqbench.len),
+ .mode = 0444,
+ .proc_handler = proc_douintvec,
+ },
{ }
};

@@ -1718,6 +1776,21 @@ static struct ctl_table random_table[] = {
*/
static int __init random_sysctls_init(void)
{
+ int i;
+ struct perf_event *cycles_event;
+ struct perf_event_attr perf_cycles_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = true
+ };
+ for_each_possible_cpu(i) {
+ cycles_event = perf_event_create_kernel_counter(&perf_cycles_attr, i, NULL, NULL, NULL);
+ if (IS_ERR(cycles_event))
+ pr_err("unable to create perf counter on cpu %d: %ld\n", i, PTR_ERR(cycles_event));
+ else
+ *per_cpu_ptr(&pmc_index, i) = cycles_event->hw.event_base_rdpmc;
+ }
register_sysctl_init("kernel/random", random_table);
return 0;
}